58 files changed, 2972 insertions, 3990 deletions
diff --git a/build/make/ios-Info.plist b/build/make/ios-Info.plist
index 8d1da32fd..d157b11a0 100644
--- a/build/make/ios-Info.plist
+++ b/build/make/ios-Info.plist
@@ -31,5 +31,7 @@
 		<integer>1</integer>
 		<integer>2</integer>
 	</array>
+	<key>VPXFullVersion</key>
+	<string>${FULLVERSION}</string>
 </dict>
 </plist>
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 21610745c..96dc6cc8c 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -226,6 +226,7 @@ build_framework() {
 
     # Copy in Info.plist.
     cat "${SCRIPT_DIR}/ios-Info.plist" \
+      | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \
       | sed "s/\${VERSION}/${VERSION}/g" \
       | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \
       > "${FRAMEWORK_DIR}/Info.plist"
@@ -341,8 +342,9 @@ if [ "${ENABLE_SHARED}" = "yes" ]; then
   CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}"
 fi
 
-VERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}" \
-  | sed -E 's/^v(.*)$/\1/')
+FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}")
+VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/')
+
 if [ "$ENABLE_SHARED" = "yes" ]; then
   IOS_VERSION_OPTIONS="--enable-shared"
 else
@@ -369,6 +371,7 @@ cat << EOF
   OSX_TARGETS="${OSX_TARGETS}"
   SIM_TARGETS="${SIM_TARGETS}"
   SCRIPT_DIR="${SCRIPT_DIR}"
+  FULLVERSION="${FULLVERSION}"
   VERSION="${VERSION}"
   IOS_VERSION_MIN="${IOS_VERSION_MIN}"
 EOF
diff --git a/ivfdec.c b/ivfdec.c
index 6dcd66f73..6d1d679b0 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -23,7 +23,7 @@ static void fix_framerate(int *num, int *den) {
   // we can guess the framerate using only the timebase in this
   // case. Other files would require reading ahead to guess the
   // timebase, like we do for webm.
-  if (*num < 1000) {
+  if (*den > 0 && *num > 0 && *num < 1000) {
     // Correct for the factor of 2 applied to the timebase in the encoder.
     if (*num & 1)
       *den *= 2;
diff --git a/test/acm_random.h b/test/acm_random.h
index a29ced2f7..b94b6e195 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -35,7 +35,7 @@ class ACMRandom {
   int16_t Rand9Signed(void) {
     // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
     const uint32_t value = random_.Generate(512);
-    return static_cast<int16_t>(value - 256);
+    return static_cast<int16_t>(value) - 256;
   }
 
   uint8_t Rand8(void) {
diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc
index d25e4f5f5..e9945c409 100644
--- a/test/add_noise_test.cc
+++ b/test/add_noise_test.cc
@@ -185,11 +185,6 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) {
 INSTANTIATE_TEST_CASE_P(C, AddNoiseTest,
                         ::testing::Values(vpx_plane_add_noise_c));
 
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, AddNoiseTest,
-                        ::testing::Values(vpx_plane_add_noise_mmx));
-#endif
-
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest,
                         ::testing::Values(vpx_plane_add_noise_sse2));
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 5467c46cf..3941e16fc 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -450,7 +450,28 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
   int denoiser_offon_period_;
 };
 
-// Check basic rate targeting,
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestVP9Large, BasicRateTargetingVBR) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  for (int i = 400; i <= 800; i += 400) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+        << " The datarate for the file is greater than target by too much!";
+  }
+}
+
+// Check basic rate targeting for CBR,
 TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
@@ -474,7 +495,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
   }
 }
 
-// Check basic rate targeting,
+// Check basic rate targeting for CBR.
 TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
   ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
 
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index c4dfaf345..ddaf9395b 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -373,10 +373,10 @@ class Trans16x16TestBase {
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int32_t diff =
             bit_depth_ == VPX_BITS_8 ?  dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int32_t diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
         if (max_error < error)
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 1cbac5c63..16d88255e 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -154,10 +154,10 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
 
     for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-      const uint32_t diff =
+      const int32_t diff =
           bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-      const uint32_t diff = dst[j] - src[j];
+      const int32_t diff = dst[j] - src[j];
 #endif
       const uint32_t error = diff * diff;
       if (max_error < error)
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 0c91aee21..5a58830d5 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -487,19 +487,11 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
-    MMX, Trans4x4WHT,
-    ::testing::Values(
-        make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
-#endif
-
-#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4WHT,
     ::testing::Values(
+        make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
 #endif
 
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index edf468216..0c081ee1f 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -458,7 +458,7 @@ class FwdTrans8x8TestBase {
         coeff_r[j] = static_cast<tran_low_t>(round(out_r[j]));
 
       for (int j = 0; j < kNumCoeffs; ++j) {
-        const uint32_t diff = coeff[j] - coeff_r[j];
+        const int32_t diff = coeff[j] - coeff_r[j];
         const uint32_t error = diff * diff;
         EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
             << "Error: 8x8 DCT has error " << error
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 778a36ca3..94646e4ff 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -430,16 +430,6 @@ TEST_P(Loop8Test9Param, ValueCheck) {
 
 using std::tr1::make_tuple;
 
-#if HAVE_MMX && CONFIG_USE_X86INC && !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    MMX, Loop8Test6Param,
-    ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_4_mmx,
-                   &vpx_lpf_horizontal_4_c, 8),
-        make_tuple(&vpx_lpf_vertical_4_mmx,
-                   &vpx_lpf_vertical_4_c, 8)));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@@ -497,12 +487,16 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_sse2,
+                   &vpx_lpf_horizontal_4_c, 8),
         make_tuple(&vpx_lpf_horizontal_8_sse2,
                    &vpx_lpf_horizontal_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_edge_8_sse2,
                    &vpx_lpf_horizontal_edge_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_edge_16_sse2,
                    &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_sse2,
+                   &vpx_lpf_vertical_4_c, 8),
         make_tuple(&vpx_lpf_vertical_8_sse2,
                    &vpx_lpf_vertical_8_c, 8),
         make_tuple(&vpx_lpf_vertical_16_sse2,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 3e65fecfb..2acf744d5 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -191,14 +191,15 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
 INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
                 vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
                 vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
-                vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
+                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
                 vpx_tm_predictor_4x4_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
-                NULL, vpx_d45_predictor_4x4_ssse3, NULL, NULL,
-                vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3,
+                NULL, NULL, NULL, NULL,
+                vpx_d153_predictor_4x4_ssse3, NULL,
                 vpx_d63_predictor_4x4_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
 
@@ -240,13 +241,13 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
 INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
                 vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
                 vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
-                vpx_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL,
-                NULL, vpx_tm_predictor_8x8_sse2)
+                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
-                NULL, vpx_d45_predictor_8x8_ssse3, NULL, NULL,
+                NULL, NULL, NULL, NULL,
                 vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
                 vpx_d63_predictor_8x8_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
diff --git a/test/variance_test.cc b/test/variance_test.cc
index a6efc92d7..cb6339041 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -976,16 +976,6 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(
-    MMX, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_mmx, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_mmx, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_mmx, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_mmx, 0)));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
                         ::testing::Values(vpx_get_mb_ss_sse2));
@@ -1026,8 +1016,8 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
                       make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
                       make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0)));
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelAvgVarianceTest,
@@ -1043,8 +1033,8 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
         make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
         make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
-        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0),
-        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
+        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
+        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
 #endif  // CONFIG_USE_X86INC
 
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/vp10_inv_txfm.c b/vp10/common/vp10_inv_txfm.c
index 403b209a2..00110f30b 100644
--- a/vp10/common/vp10_inv_txfm.c
+++ b/vp10/common/vp10_inv_txfm.c
@@ -35,10 +35,10 @@ void vp10_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, 8);
-    op[1] = WRAPLOW(b1, 8);
-    op[2] = WRAPLOW(c1, 8);
-    op[3] = WRAPLOW(d1, 8);
+    op[0] = WRAPLOW(a1);
+    op[1] = WRAPLOW(b1);
+    op[2] = WRAPLOW(c1);
+    op[3] = WRAPLOW(d1);
     ip += 4;
     op += 4;
   }
@@ -78,8 +78,8 @@ void vp10_iwht4x4_1_add_c(const tran_low_t *in,
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, 8);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+  op[0] = WRAPLOW(a1);
+  op[1] = op[2] = op[3] = WRAPLOW(e1);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -100,18 +100,18 @@ void vp10_idct4_c(const tran_low_t *input, tran_low_t *output) {
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step[3] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], 8);
-  output[1] = WRAPLOW(step[1] + step[2], 8);
-  output[2] = WRAPLOW(step[1] - step[2], 8);
-  output[3] = WRAPLOW(step[0] - step[3], 8);
+  output[0] = WRAPLOW(step[0] + step[3]);
+  output[1] = WRAPLOW(step[1] + step[2]);
+  output[2] = WRAPLOW(step[1] - step[2]);
+  output[3] = WRAPLOW(step[0] - step[3]);
 }
 
 void vp10_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -143,8 +143,8 @@ void vp10_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                          int dest_stride) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -166,48 +166,48 @@ void vp10_idct8_c(const tran_low_t *input, tran_low_t *output) {
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   temp1 = (step1[0] + step1[2]) * cospi_16_64;
   temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], 8);
-  output[1] = WRAPLOW(step1[1] + step1[6], 8);
-  output[2] = WRAPLOW(step1[2] + step1[5], 8);
-  output[3] = WRAPLOW(step1[3] + step1[4], 8);
-  output[4] = WRAPLOW(step1[3] - step1[4], 8);
-  output[5] = WRAPLOW(step1[2] - step1[5], 8);
-  output[6] = WRAPLOW(step1[1] - step1[6], 8);
-  output[7] = WRAPLOW(step1[0] - step1[7], 8);
+  output[0] = WRAPLOW(step1[0] + step1[7]);
+  output[1] = WRAPLOW(step1[1] + step1[6]);
+  output[2] = WRAPLOW(step1[2] + step1[5]);
+  output[3] = WRAPLOW(step1[3] + step1[4]);
+  output[4] = WRAPLOW(step1[3] - step1[4]);
+  output[5] = WRAPLOW(step1[2] - step1[5]);
+  output[6] = WRAPLOW(step1[1] - step1[6]);
+  output[7] = WRAPLOW(step1[0] - step1[7]);
 }
 
 void vp10_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -238,8 +238,8 @@ void vp10_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 void vp10_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -279,10 +279,10 @@ void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output) {
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
-  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+  output[2] = WRAPLOW(dct_const_round_shift(s2));
+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }
 
 void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) {
@@ -313,14 +313,14 @@ void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) {
   s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
   s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
 
   // stage 2
   s0 = (int)x0;
@@ -332,14 +332,14 @@ void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) {
   s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
   s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
 
-  x0 = WRAPLOW(s0 + s2, 8);
-  x1 = WRAPLOW(s1 + s3, 8);
-  x2 = WRAPLOW(s0 - s2, 8);
-  x3 = WRAPLOW(s1 - s3, 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
 
   // stage 3
   s2 = (int)(cospi_16_64 * (x2 + x3));
@@ -347,19 +347,19 @@ void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) {
   s6 = (int)(cospi_16_64 * (x6 + x7));
   s7 = (int)(cospi_16_64 * (x6 - x7));
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x4, 8);
-  output[2] = WRAPLOW(x6, 8);
-  output[3] = WRAPLOW(-x2, 8);
-  output[4] = WRAPLOW(x3, 8);
-  output[5] = WRAPLOW(-x7, 8);
-  output[6] = WRAPLOW(x5, 8);
-  output[7] = WRAPLOW(-x1, 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x4);
+  output[2] = WRAPLOW(x6);
+  output[3] = WRAPLOW(-x2);
+  output[4] = WRAPLOW(x3);
+  output[5] = WRAPLOW(-x7);
+  output[6] = WRAPLOW(x5);
+  output[7] = WRAPLOW(-x1);
 }
 
 void vp10_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -422,23 +422,23 @@ void vp10_idct16_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 3
   step1[0] = step2[0];
@@ -448,109 +448,109 @@ void vp10_idct16_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], 8);
-  output[1] = WRAPLOW(step2[1] + step2[14], 8);
-  output[2] = WRAPLOW(step2[2] + step2[13], 8);
-  output[3] = WRAPLOW(step2[3] + step2[12], 8);
-  output[4] = WRAPLOW(step2[4] + step2[11], 8);
-  output[5] = WRAPLOW(step2[5] + step2[10], 8);
-  output[6] = WRAPLOW(step2[6] + step2[9], 8);
-  output[7] = WRAPLOW(step2[7] + step2[8], 8);
-  output[8] = WRAPLOW(step2[7] - step2[8], 8);
-  output[9] = WRAPLOW(step2[6] - step2[9], 8);
-  output[10] = WRAPLOW(step2[5] - step2[10], 8);
-  output[11] = WRAPLOW(step2[4] - step2[11], 8);
-  output[12] = WRAPLOW(step2[3] - step2[12], 8);
-  output[13] = WRAPLOW(step2[2] - step2[13], 8);
-  output[14] = WRAPLOW(step2[1] - step2[14], 8);
-  output[15] = WRAPLOW(step2[0] - step2[15], 8);
+  output[0] = WRAPLOW(step2[0] + step2[15]);
+  output[1] = WRAPLOW(step2[1] + step2[14]);
+  output[2] = WRAPLOW(step2[2] + step2[13]);
+  output[3] = WRAPLOW(step2[3] + step2[12]);
+  output[4] = WRAPLOW(step2[4] + step2[11]);
+  output[5] = WRAPLOW(step2[5] + step2[10]);
+  output[6] = WRAPLOW(step2[6] + step2[9]);
+  output[7] = WRAPLOW(step2[7] + step2[8]);
+  output[8] = WRAPLOW(step2[7] - step2[8]);
+  output[9] = WRAPLOW(step2[6] - step2[9]);
+  output[10] = WRAPLOW(step2[5] - step2[10]);
+  output[11] = WRAPLOW(step2[4] - step2[11]);
+  output[12] = WRAPLOW(step2[3] - step2[12]);
+  output[13] = WRAPLOW(step2[2] - step2[13]);
+  output[14] = WRAPLOW(step2[1] - step2[14]);
+  output[15] = WRAPLOW(step2[0] - step2[15]);
 }
 
 void vp10_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
@@ -627,22 +627,22 @@ void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
 
   // stage 2
   s0 = x0;
@@ -662,22 +662,22 @@ void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, 8);
-  x1 = WRAPLOW(s1 + s5, 8);
-  x2 = WRAPLOW(s2 + s6, 8);
-  x3 = WRAPLOW(s3 + s7, 8);
-  x4 = WRAPLOW(s0 - s4, 8);
-  x5 = WRAPLOW(s1 - s5, 8);
-  x6 = WRAPLOW(s2 - s6, 8);
-  x7 = WRAPLOW(s3 - s7, 8);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+  x0 = WRAPLOW(s0 + s4);
+  x1 = WRAPLOW(s1 + s5);
+  x2 = WRAPLOW(s2 + s6);
+  x3 = WRAPLOW(s3 + s7);
+  x4 = WRAPLOW(s0 - s4);
+  x5 = WRAPLOW(s1 - s5);
+  x6 = WRAPLOW(s2 - s6);
+  x7 = WRAPLOW(s3 - s7);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
 
   // stage 3
   s0 = x0;
@@ -697,22 +697,22 @@ void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(check_range(s0 + s2), 8);
-  x1 = WRAPLOW(check_range(s1 + s3), 8);
-  x2 = WRAPLOW(check_range(s0 - s2), 8);
-  x3 = WRAPLOW(check_range(s1 - s3), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
-  x8 = WRAPLOW(check_range(s8 + s10), 8);
-  x9 = WRAPLOW(check_range(s9 + s11), 8);
-  x10 = WRAPLOW(check_range(s8 - s10), 8);
-  x11 = WRAPLOW(check_range(s9 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+  x8 = WRAPLOW(s8 + s10);
+  x9 = WRAPLOW(s9 + s11);
+  x10 = WRAPLOW(s8 - s10);
+  x11 = WRAPLOW(s9 - s11);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -724,31 +724,31 @@ void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
-
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x8, 8);
-  output[2] = WRAPLOW(x12, 8);
-  output[3] = WRAPLOW(-x4, 8);
-  output[4] = WRAPLOW(x6, 8);
-  output[5] = WRAPLOW(x14, 8);
-  output[6] = WRAPLOW(x10, 8);
-  output[7] = WRAPLOW(x2, 8);
-  output[8] = WRAPLOW(x3, 8);
-  output[9] = WRAPLOW(x11, 8);
-  output[10] = WRAPLOW(x15, 8);
-  output[11] = WRAPLOW(x7, 8);
-  output[12] = WRAPLOW(x5, 8);
-  output[13] = WRAPLOW(-x13, 8);
-  output[14] = WRAPLOW(x9, 8);
-  output[15] = WRAPLOW(-x1, 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+  x10 = WRAPLOW(dct_const_round_shift(s10));
+  x11 = WRAPLOW(dct_const_round_shift(s11));
+  x14 = WRAPLOW(dct_const_round_shift(s14));
+  x15 = WRAPLOW(dct_const_round_shift(s15));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x8);
+  output[2] = WRAPLOW(x12);
+  output[3] = WRAPLOW(-x4);
+  output[4] = WRAPLOW(x6);
+  output[5] = WRAPLOW(x14);
+  output[6] = WRAPLOW(x10);
+  output[7] = WRAPLOW(x2);
+  output[8] = WRAPLOW(x3);
+  output[9] = WRAPLOW(x11);
+  output[10] = WRAPLOW(x15);
+  output[11] = WRAPLOW(x7);
+  output[12] = WRAPLOW(x5);
+  output[13] = WRAPLOW(-x13);
+  output[14] = WRAPLOW(x9);
+  output[15] = WRAPLOW(-x1);
 }
 
 void vp10_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
@@ -784,8 +784,8 @@ void vp10_idct16x16_1_add_c(const tran_low_t *input,
                             int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -818,43 +818,43 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   step2[0] = step1[0];
@@ -868,40 +868,40 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
-  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
-  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
-  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
-  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
-  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
-  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
-  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
-  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
-  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
-  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
-  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step2[16] = WRAPLOW(step1[16] + step1[17]);
+  step2[17] = WRAPLOW(step1[16] - step1[17]);
+  step2[18] = WRAPLOW(-step1[18] + step1[19]);
+  step2[19] = WRAPLOW(step1[18] + step1[19]);
+  step2[20] = WRAPLOW(step1[20] + step1[21]);
+  step2[21] = WRAPLOW(step1[20] - step1[21]);
+  step2[22] = WRAPLOW(-step1[22] + step1[23]);
+  step2[23] = WRAPLOW(step1[22] + step1[23]);
+  step2[24] = WRAPLOW(step1[24] + step1[25]);
+  step2[25] = WRAPLOW(step1[24] - step1[25]);
+  step2[26] = WRAPLOW(-step1[26] + step1[27]);
+  step2[27] = WRAPLOW(step1[26] + step1[27]);
+  step2[28] = WRAPLOW(step1[28] + step1[29]);
+  step2[29] = WRAPLOW(step1[28] - step1[29]);
+  step2[30] = WRAPLOW(-step1[30] + step1[31]);
+  step2[31] = WRAPLOW(step1[30] + step1[31]);
 
   // stage 3
   step1[0] = step2[0];
@@ -911,42 +911,42 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -955,87 +955,87 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) {
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
-  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
-  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
-  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
-  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
-  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
-  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
-  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
-  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[19]);
+  step2[17] = WRAPLOW(step1[17] + step1[18]);
+  step2[18] = WRAPLOW(step1[17] - step1[18]);
+  step2[19] = WRAPLOW(step1[16] - step1[19]);
+  step2[20] = WRAPLOW(-step1[20] + step1[23]);
+  step2[21] = WRAPLOW(-step1[21] + step1[22]);
+  step2[22] = WRAPLOW(step1[21] + step1[22]);
+  step2[23] = WRAPLOW(step1[20] + step1[23]);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27]);
+  step2[25] = WRAPLOW(step1[25] + step1[26]);
+  step2[26] = WRAPLOW(step1[25] - step1[26]);
+  step2[27] = WRAPLOW(step1[24] - step1[27]);
+  step2[28] = WRAPLOW(-step1[28] + step1[31]);
+  step2[29] = WRAPLOW(-step1[29] + step1[30]);
+  step2[30] = WRAPLOW(step1[29] + step1[30]);
+  step2[31] = WRAPLOW(step1[28] + step1[31]);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -1044,62 +1044,62 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) {
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
-  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
-  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
-  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
-  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
-  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
-  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
-  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
-  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
-  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[23]);
+  step2[17] = WRAPLOW(step1[17] + step1[22]);
+  step2[18] = WRAPLOW(step1[18] + step1[21]);
+  step2[19] = WRAPLOW(step1[19] + step1[20]);
+  step2[20] = WRAPLOW(step1[19] - step1[20]);
+  step2[21] = WRAPLOW(step1[18] - step1[21]);
+  step2[22] = WRAPLOW(step1[17] - step1[22]);
+  step2[23] = WRAPLOW(step1[16] - step1[23]);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31]);
+  step2[25] = WRAPLOW(-step1[25] + step1[30]);
+  step2[26] = WRAPLOW(-step1[26] + step1[29]);
+  step2[27] = WRAPLOW(-step1[27] + step1[28]);
+  step2[28] = WRAPLOW(step1[27] + step1[28]);
+  step2[29] = WRAPLOW(step1[26] + step1[29]);
+  step2[30] = WRAPLOW(step1[25] + step1[30]);
+  step2[31] = WRAPLOW(step1[24] + step1[31]);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
-  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
-  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
-  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
-  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
-  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
-  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
-  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
-  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
-  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
-  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
-  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
-  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
-  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[15]);
+  step1[1] = WRAPLOW(step2[1] + step2[14]);
+  step1[2] = WRAPLOW(step2[2] + step2[13]);
+  step1[3] = WRAPLOW(step2[3] + step2[12]);
+  step1[4] = WRAPLOW(step2[4] + step2[11]);
+  step1[5] = WRAPLOW(step2[5] + step2[10]);
+  step1[6] = WRAPLOW(step2[6] + step2[9]);
+  step1[7] = WRAPLOW(step2[7] + step2[8]);
+  step1[8] = WRAPLOW(step2[7] - step2[8]);
+  step1[9] = WRAPLOW(step2[6] - step2[9]);
+  step1[10] = WRAPLOW(step2[5] - step2[10]);
+  step1[11] = WRAPLOW(step2[4] - step2[11]);
+  step1[12] = WRAPLOW(step2[3] - step2[12]);
+  step1[13] = WRAPLOW(step2[2] - step2[13]);
+  step1[14] = WRAPLOW(step2[1] - step2[14]);
+  step1[15] = WRAPLOW(step2[0] - step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -1107,58 +1107,58 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) {
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], 8);
-  output[1] = WRAPLOW(step1[1] + step1[30], 8);
-  output[2] = WRAPLOW(step1[2] + step1[29], 8);
-  output[3] = WRAPLOW(step1[3] + step1[28], 8);
-  output[4] = WRAPLOW(step1[4] + step1[27], 8);
-  output[5] = WRAPLOW(step1[5] + step1[26], 8);
-  output[6] = WRAPLOW(step1[6] + step1[25], 8);
-  output[7] = WRAPLOW(step1[7] + step1[24], 8);
-  output[8] = WRAPLOW(step1[8] + step1[23], 8);
-  output[9] = WRAPLOW(step1[9] + step1[22], 8);
-  output[10] = WRAPLOW(step1[10] + step1[21], 8);
-  output[11] = WRAPLOW(step1[11] + step1[20], 8);
-  output[12] = WRAPLOW(step1[12] + step1[19], 8);
-  output[13] = WRAPLOW(step1[13] + step1[18], 8);
-  output[14] = WRAPLOW(step1[14] + step1[17], 8);
-  output[15] = WRAPLOW(step1[15] + step1[16], 8);
-  output[16] = WRAPLOW(step1[15] - step1[16], 8);
-  output[17] = WRAPLOW(step1[14] - step1[17], 8);
-  output[18] = WRAPLOW(step1[13] - step1[18], 8);
-  output[19] = WRAPLOW(step1[12] - step1[19], 8);
-  output[20] = WRAPLOW(step1[11] - step1[20], 8);
-  output[21] = WRAPLOW(step1[10] - step1[21], 8);
-  output[22] = WRAPLOW(step1[9] - step1[22], 8);
-  output[23] = WRAPLOW(step1[8] - step1[23], 8);
-  output[24] = WRAPLOW(step1[7] - step1[24], 8);
-  output[25] = WRAPLOW(step1[6] - step1[25], 8);
-  output[26] = WRAPLOW(step1[5] - step1[26], 8);
-  output[27] = WRAPLOW(step1[4] - step1[27], 8);
-  output[28] = WRAPLOW(step1[3] - step1[28], 8);
-  output[29] = WRAPLOW(step1[2] - step1[29], 8);
-  output[30] = WRAPLOW(step1[1] - step1[30], 8);
-  output[31] = WRAPLOW(step1[0] - step1[31], 8);
+  output[0] = WRAPLOW(step1[0] + step1[31]);
+  output[1] = WRAPLOW(step1[1] + step1[30]);
+  output[2] = WRAPLOW(step1[2] + step1[29]);
+  output[3] = WRAPLOW(step1[3] + step1[28]);
+  output[4] = WRAPLOW(step1[4] + step1[27]);
+  output[5] = WRAPLOW(step1[5] + step1[26]);
+  output[6] = WRAPLOW(step1[6] + step1[25]);
+  output[7] = WRAPLOW(step1[7] + step1[24]);
+  output[8] = WRAPLOW(step1[8] + step1[23]);
+  output[9] = WRAPLOW(step1[9] + step1[22]);
+  output[10] = WRAPLOW(step1[10] + step1[21]);
+  output[11] = WRAPLOW(step1[11] + step1[20]);
+  output[12] = WRAPLOW(step1[12] + step1[19]);
+  output[13] = WRAPLOW(step1[13] + step1[18]);
+  output[14] = WRAPLOW(step1[14] + step1[17]);
+  output[15] = WRAPLOW(step1[15] + step1[16]);
+  output[16] = WRAPLOW(step1[15] - step1[16]);
+  output[17] = WRAPLOW(step1[14] - step1[17]);
+  output[18] = WRAPLOW(step1[13] - step1[18]);
+  output[19] = WRAPLOW(step1[12] - step1[19]);
+  output[20] = WRAPLOW(step1[11] - step1[20]);
+  output[21] = WRAPLOW(step1[10] - step1[21]);
+  output[22] = WRAPLOW(step1[9] - step1[22]);
+  output[23] = WRAPLOW(step1[8] - step1[23]);
+  output[24] = WRAPLOW(step1[7] - step1[24]);
+  output[25] = WRAPLOW(step1[6] - step1[25]);
+  output[26] = WRAPLOW(step1[5] - step1[26]);
+  output[27] = WRAPLOW(step1[4] - step1[27]);
+  output[28] = WRAPLOW(step1[3] - step1[28]);
+  output[29] = WRAPLOW(step1[2] - step1[29]);
+  output[30] = WRAPLOW(step1[1] - step1[30]);
+  output[31] = WRAPLOW(step1[0] - step1[31]);
 }
 
 void vp10_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
@@ -1234,8 +1234,8 @@ void vp10_idct32x32_1_add_c(const tran_low_t *input,
   int i, j;
   tran_high_t a1;
 
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
@@ -1269,10 +1269,10 @@ void vp10_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, bd);
-    op[1] = WRAPLOW(b1, bd);
-    op[2] = WRAPLOW(c1, bd);
-    op[3] = WRAPLOW(d1, bd);
+    op[0] = HIGHBD_WRAPLOW(a1, bd);
+    op[1] = HIGHBD_WRAPLOW(b1, bd);
+    op[2] = HIGHBD_WRAPLOW(c1, bd);
+    op[3] = HIGHBD_WRAPLOW(d1, bd);
     ip += 4;
     op += 4;
   }
@@ -1313,8 +1313,8 @@ void vp10_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, bd);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+  op[0] = HIGHBD_WRAPLOW(a1, bd);
+  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -1340,18 +1340,18 @@ void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], bd);
-  output[1] = WRAPLOW(step[1] + step[2], bd);
-  output[2] = WRAPLOW(step[1] - step[2], bd);
-  output[3] = WRAPLOW(step[0] - step[3], bd);
+  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
+  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
 }
 
 void vp10_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1385,11 +1385,11 @@ void vp10_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int dest_stride, int bd) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -1411,39 +1411,39 @@ void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2 & stage 3 - even half
   vp10_highbd_idct4_c(step1, step1, bd);
 
   // stage 2 - odd half
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   // stage 3 - odd half
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], bd);
-  output[1] = WRAPLOW(step1[1] + step1[6], bd);
-  output[2] = WRAPLOW(step1[2] + step1[5], bd);
-  output[3] = WRAPLOW(step1[3] + step1[4], bd);
-  output[4] = WRAPLOW(step1[3] - step1[4], bd);
-  output[5] = WRAPLOW(step1[2] - step1[5], bd);
-  output[6] = WRAPLOW(step1[1] - step1[6], bd);
-  output[7] = WRAPLOW(step1[0] - step1[7], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
 }
 
 void vp10_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1477,10 +1477,10 @@ void vp10_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -1521,10 +1521,10 @@ void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
-  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
-  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+  output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
+  output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
+  output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
 }
 
 void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1555,14 +1555,14 @@ void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
 
   // stage 2
   s0 = x0;
@@ -1574,14 +1574,14 @@ void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -1589,19 +1589,19 @@ void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x4, bd);
-  output[2] = WRAPLOW(x6, bd);
-  output[3] = WRAPLOW(-x2, bd);
-  output[4] = WRAPLOW(x3, bd);
-  output[5] = WRAPLOW(-x7, bd);
-  output[6] = WRAPLOW(x5, bd);
-  output[7] = WRAPLOW(-x1, bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x4, bd);
+  output[2] = HIGHBD_WRAPLOW(x6, bd);
+  output[3] = HIGHBD_WRAPLOW(-x2, bd);
+  output[4] = HIGHBD_WRAPLOW(x3, bd);
+  output[5] = HIGHBD_WRAPLOW(-x7, bd);
+  output[6] = HIGHBD_WRAPLOW(x5, bd);
+  output[7] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vp10_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1666,23 +1666,23 @@ void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -1692,109 +1692,109 @@ void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], bd);
-  output[1] = WRAPLOW(step2[1] + step2[14], bd);
-  output[2] = WRAPLOW(step2[2] + step2[13], bd);
-  output[3] = WRAPLOW(step2[3] + step2[12], bd);
-  output[4] = WRAPLOW(step2[4] + step2[11], bd);
-  output[5] = WRAPLOW(step2[5] + step2[10], bd);
-  output[6] = WRAPLOW(step2[6] + step2[9], bd);
-  output[7] = WRAPLOW(step2[7] + step2[8], bd);
-  output[8] = WRAPLOW(step2[7] - step2[8], bd);
-  output[9] = WRAPLOW(step2[6] - step2[9], bd);
-  output[10] = WRAPLOW(step2[5] - step2[10], bd);
-  output[11] = WRAPLOW(step2[4] - step2[11], bd);
-  output[12] = WRAPLOW(step2[3] - step2[12], bd);
-  output[13] = WRAPLOW(step2[2] - step2[13], bd);
-  output[14] = WRAPLOW(step2[1] - step2[14], bd);
-  output[15] = WRAPLOW(step2[0] - step2[15], bd);
+  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 }
 
 void vp10_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1874,22 +1874,22 @@ void vp10_highbd_iadst16_c(const tran_low_t *input,
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
-  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
-  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
+  x8  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
+  x9  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
 
   // stage 2
   s0 = x0;
@@ -1909,22 +1909,22 @@ void vp10_highbd_iadst16_c(const tran_low_t *input,
   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, bd);
-  x1 = WRAPLOW(s1 + s5, bd);
-  x2 = WRAPLOW(s2 + s6, bd);
-  x3 = WRAPLOW(s3 + s7, bd);
-  x4 = WRAPLOW(s0 - s4, bd);
-  x5 = WRAPLOW(s1 - s5, bd);
-  x6 = WRAPLOW(s2 - s6, bd);
-  x7 = WRAPLOW(s3 - s7, bd);
-  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
-  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
+  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
+  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
+  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
+  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
+  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
+  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
+  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
+  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
 
   // stage 3
   s0 = x0;
@@ -1944,22 +1944,22 @@ void vp10_highbd_iadst16_c(const tran_low_t *input,
   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
-  x8 = WRAPLOW(s8 + s10, bd);
-  x9 = WRAPLOW(s9 + s11, bd);
-  x10 = WRAPLOW(s8 - s10, bd);
-  x11 = WRAPLOW(s9 - s11, bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
+  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
+  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
+  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -1971,31 +1971,31 @@ void vp10_highbd_iadst16_c(const tran_low_t *input,
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
-
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x8, bd);
-  output[2] = WRAPLOW(x12, bd);
-  output[3] = WRAPLOW(-x4, bd);
-  output[4] = WRAPLOW(x6, bd);
-  output[5] = WRAPLOW(x14, bd);
-  output[6] = WRAPLOW(x10, bd);
-  output[7] = WRAPLOW(x2, bd);
-  output[8] = WRAPLOW(x3, bd);
-  output[9] = WRAPLOW(x11, bd);
-  output[10] = WRAPLOW(x15, bd);
-  output[11] = WRAPLOW(x7, bd);
-  output[12] = WRAPLOW(x5, bd);
-  output[13] = WRAPLOW(-x13, bd);
-  output[14] = WRAPLOW(x9, bd);
-  output[15] = WRAPLOW(-x1, bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
+
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x8, bd);
+  output[2] = HIGHBD_WRAPLOW(x12, bd);
+  output[3] = HIGHBD_WRAPLOW(-x4, bd);
+  output[4] = HIGHBD_WRAPLOW(x6, bd);
+  output[5] = HIGHBD_WRAPLOW(x14, bd);
+  output[6] = HIGHBD_WRAPLOW(x10, bd);
+  output[7] = HIGHBD_WRAPLOW(x2, bd);
+  output[8] = HIGHBD_WRAPLOW(x3, bd);
+  output[9] = HIGHBD_WRAPLOW(x11, bd);
+  output[10] = HIGHBD_WRAPLOW(x15, bd);
+  output[11] = HIGHBD_WRAPLOW(x7, bd);
+  output[12] = HIGHBD_WRAPLOW(x5, bd);
+  output[13] = HIGHBD_WRAPLOW(-x13, bd);
+  output[14] = HIGHBD_WRAPLOW(x9, bd);
+  output[15] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vp10_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2030,11 +2030,11 @@ void vp10_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -2069,43 +2069,43 @@ static void highbd_idct32_c(const tran_low_t *input,
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
   step2[0] = step1[0];
@@ -2119,40 +2119,40 @@ static void highbd_idct32_c(const tran_low_t *input,
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
-  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
-  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
-  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
-  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
-  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
-  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
-  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
-  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
-  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
-  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
-  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -2162,42 +2162,42 @@ static void highbd_idct32_c(const tran_low_t *input,
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -2206,87 +2206,87 @@ static void highbd_idct32_c(const tran_low_t *input,
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
-  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
-  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
-  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
-  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
-  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
-  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
-  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
-  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
+
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -2295,62 +2295,62 @@ static void highbd_idct32_c(const tran_low_t *input,
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
-  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
-  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
-  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
-  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
-  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
-  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
-  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
-  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
-  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
+
+  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
-  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
-  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
-  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
-  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
-  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
-  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
-  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
-  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
-  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
-  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
-  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
-  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
-  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -2358,58 +2358,58 @@ static void highbd_idct32_c(const tran_low_t *input,
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], bd);
-  output[1] = WRAPLOW(step1[1] + step1[30], bd);
-  output[2] = WRAPLOW(step1[2] + step1[29], bd);
-  output[3] = WRAPLOW(step1[3] + step1[28], bd);
-  output[4] = WRAPLOW(step1[4] + step1[27], bd);
-  output[5] = WRAPLOW(step1[5] + step1[26], bd);
-  output[6] = WRAPLOW(step1[6] + step1[25], bd);
-  output[7] = WRAPLOW(step1[7] + step1[24], bd);
-  output[8] = WRAPLOW(step1[8] + step1[23], bd);
-  output[9] = WRAPLOW(step1[9] + step1[22], bd);
-  output[10] = WRAPLOW(step1[10] + step1[21], bd);
-  output[11] = WRAPLOW(step1[11] + step1[20], bd);
-  output[12] = WRAPLOW(step1[12] + step1[19], bd);
-  output[13] = WRAPLOW(step1[13] + step1[18], bd);
-  output[14] = WRAPLOW(step1[14] + step1[17], bd);
-  output[15] = WRAPLOW(step1[15] + step1[16], bd);
-  output[16] = WRAPLOW(step1[15] - step1[16], bd);
-  output[17] = WRAPLOW(step1[14] - step1[17], bd);
-  output[18] = WRAPLOW(step1[13] - step1[18], bd);
-  output[19] = WRAPLOW(step1[12] - step1[19], bd);
-  output[20] = WRAPLOW(step1[11] - step1[20], bd);
-  output[21] = WRAPLOW(step1[10] - step1[21], bd);
-  output[22] = WRAPLOW(step1[9] - step1[22], bd);
-  output[23] = WRAPLOW(step1[8] - step1[23], bd);
-  output[24] = WRAPLOW(step1[7] - step1[24], bd);
-  output[25] = WRAPLOW(step1[6] - step1[25], bd);
-  output[26] = WRAPLOW(step1[5] - step1[26], bd);
-  output[27] = WRAPLOW(step1[4] - step1[27], bd);
-  output[28] = WRAPLOW(step1[3] - step1[28], bd);
-  output[29] = WRAPLOW(step1[2] - step1[29], bd);
-  output[30] = WRAPLOW(step1[1] - step1[30], bd);
-  output[31] = WRAPLOW(step1[0] - step1[31], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
 }
 
 void vp10_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2485,9 +2485,9 @@ void vp10_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
   int a1;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
diff --git a/vp10/common/vp10_inv_txfm.h b/vp10/common/vp10_inv_txfm.h
index 52611acbd..1751f62c3 100644
--- a/vp10/common/vp10_inv_txfm.h
+++ b/vp10/common/vp10_inv_txfm.h
@@ -15,13 +15,14 @@
 
 #include "./vpx_config.h"
 #include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-static INLINE tran_low_t check_range(tran_high_t input) {
+static INLINE tran_high_t check_range(tran_high_t input) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid VP9 input streams, intermediate stage coefficients should always
   // stay within the range of a signed 16 bit integer. Coefficients can go out
@@ -32,17 +33,17 @@ static INLINE tran_low_t check_range(tran_high_t input) {
   assert(INT16_MIN <= input);
   assert(input <= INT16_MAX);
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return check_range(rv);
+  return rv;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_low_t highbd_check_range(tran_high_t input,
-                                            int bd) {
+static INLINE tran_high_t highbd_check_range(tran_high_t input,
+                                             int bd) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid highbitdepth VP9 streams, intermediate stage coefficients will
   // stay within the ranges:
@@ -56,13 +57,12 @@ static INLINE tran_low_t highbd_check_range(tran_high_t input,
   (void) int_min;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   (void) bd;
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
-                                                      int bd) {
+static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return highbd_check_range(rv, bd);
+  return rv;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -83,9 +83,21 @@ static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
 // bd of 10 uses trans_low with 18bits, need to remove 14bits
 // bd of 12 uses trans_low with 20bits, need to remove 12bits
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) ((int32_t)(x))
+
+#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+        ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#else   // CONFIG_EMULATE_HARDWARE
+
+#define WRAPLOW(x) ((int32_t)check_range(x))
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+        ((int32_t)highbd_check_range((x), bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // CONFIG_EMULATE_HARDWARE
 
 void vp10_idct4_c(const tran_low_t *input, tran_low_t *output);
@@ -107,14 +119,14 @@ void vp10_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
 
 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                              int bd) {
-  trans = WRAPLOW(trans, bd);
-  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+  trans = HIGHBD_WRAPLOW(trans, bd);
+  return clip_pixel_highbd(dest + trans, bd);
 }
 #endif
 
 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans, 8);
-  return clip_pixel(WRAPLOW(dest + trans, 8));
+  trans = WRAPLOW(trans);
+  return clip_pixel(dest + trans);
 }
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index f2414f811..c8a10e57f 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -398,7 +398,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp10_fht16x16 sse2/;
 
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
+  specialize qw/vp10_fwht4x4/, "$sse2_x86inc";
 } else {
   add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht4x4 sse2 msa/;
@@ -410,7 +410,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp10_fht16x16 sse2 msa/;
 
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
+  specialize qw/vp10_fwht4x4 msa/, "$sse2_x86inc";
 }
 
 # Inverse transform
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_intrin_sse2.c
index e1111570a..e1111570a 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_intrin_sse2.c
diff --git a/vp10/encoder/x86/dct_mmx.asm b/vp10/encoder/x86/dct_mmx.asm
deleted file mode 100644
index 2327fe9e6..000000000
--- a/vp10/encoder/x86/dct_mmx.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
-  paddw           m0,        m1
-  movq            m4,        m0
-  psubw           m3,        m2
-  psubw           m4,        m3
-  psraw           m4,        1
-  movq            m5,        m4
-  psubw           m5,        m1 ;b1
-  psubw           m4,        m2 ;c1
-  psubw           m0,        m4
-  paddw           m3,        m5
-                                ; m0 a0
-  SWAP            1,         4  ; m1 c1
-  SWAP            2,         3  ; m2 d1
-  SWAP            3,         5  ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  movq            m4,        m0
-  movq            m5,        m2
-  punpcklwd       m4,        m1
-  punpckhwd       m0,        m1
-  punpcklwd       m5,        m3
-  punpckhwd       m2,        m3
-  movq            m1,        m4
-  movq            m3,        m0
-  punpckldq       m1,        m5
-  punpckhdq       m4,        m5
-  punpckldq       m3,        m2
-  punpckhdq       m0,        m2
-  SWAP            2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
-  lea             r3q,       [inputq + strideq*4]
-  movq            m0,        [inputq] ;a1
-  movq            m1,        [inputq + strideq*2] ;b1
-  movq            m2,        [r3q] ;c1
-  movq            m3,        [r3q + strideq*2] ;d1
-
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-
-  psllw           m0,        2
-  psllw           m1,        2
-  psllw           m2,        2
-  psllw           m3,        2
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m0
-  pcmpgtw         m5,             m1
-  movq            m6,             m0
-  movq            m7,             m1
-  punpcklwd       m0,             m4
-  punpcklwd       m1,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m6
-  movq            [outputq + 16], m1
-  movq            [outputq + 24], m7
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m2
-  pcmpgtw         m5,             m3
-  movq            m6,             m2
-  movq            m7,             m3
-  punpcklwd       m2,             m4
-  punpcklwd       m3,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq + 32], m2
-  movq            [outputq + 40], m6
-  movq            [outputq + 48], m3
-  movq            [outputq + 56], m7
-%else
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m1
-  movq            [outputq + 16], m2
-  movq            [outputq + 24], m3
-%endif
-
-  RET
diff --git a/vp10/encoder/x86/dct_sse2.asm b/vp10/encoder/x86/dct_sse2.asm
new file mode 100644
index 000000000..c3a5fb552
--- /dev/null
+++ b/vp10/encoder/x86/dct_sse2.asm
@@ -0,0 +1,86 @@
+;
+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp10
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1
+  movq            m4,        m0
+  psubw           m3,        m2
+  psubw           m4,        m3
+  psraw           m4,        1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4
+  paddw           m3,        m5
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+                                ; 00 01 02 03
+                                ; 10 11 12 13
+                                ; 20 21 22 23
+                                ; 30 31 32 33
+  punpcklwd       m0,        m1 ; 00 10 01 11  02 12 03 13
+  punpcklwd       m2,        m3 ; 20 30 21 31  22 32 23 33
+  mova            m1,        m0
+  punpckldq       m0,        m2 ; 00 10 20 30  01 11 21 31
+  punpckhdq       m1,        m2 ; 02 12 22 32  03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  SWAP            1,         2
+  psrldq          m1,        m0, 8
+  psrldq          m3,        m2, 8
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2
+  psllw           m1,        2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; sign extension
+  mova            m2,             m0
+  mova            m3,             m1
+  punpcklwd       m0,             m0
+  punpcklwd       m1,             m1
+  punpckhwd       m2,             m2
+  punpckhwd       m3,             m3
+  psrad           m0,             16
+  psrad           m1,             16
+  psrad           m2,             16
+  psrad           m3,             16
+  mova            [outputq],      m0
+  mova            [outputq + 16], m2
+  mova            [outputq + 32], m1
+  mova            [outputq + 48], m3
+%else
+  mova            [outputq],      m0
+  mova            [outputq + 16], m1
+%endif
+
+  RET
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index dc3b27139..4f265b539 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -93,7 +93,7 @@ VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
 endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
-VP10_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
 endif
 
@@ -103,7 +103,7 @@ VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm
 endif
 endif
 
-VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
 VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
diff --git a/vp8/decoder/dboolhuff.c b/vp8/decoder/dboolhuff.c
index 8a7e33205..5cdd2a249 100644
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@@ -44,7 +44,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
     int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
     size_t bytes_left = br->user_buffer_end - bufptr;
     size_t bits_left = bytes_left * CHAR_BIT;
-    int x = (int)(shift + CHAR_BIT - bits_left);
+    int x = shift + CHAR_BIT - (int)bits_left;
     int loop_end = 0;
     unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
 
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index cc9eaaf43..1b1bbf868 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -83,7 +83,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
     }
 
     {
-        register unsigned int shift = vp8_norm[range];
+        register int shift = vp8_norm[range];
         range <<= shift;
         value <<= shift;
         count -= shift;
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index f3d91b552..3196422c2 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -163,7 +163,7 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount)
 {
     const TOKENEXTRA *stop = p + xcount;
     unsigned int split;
-    unsigned int shift;
+    int shift;
     int count = w->count;
     unsigned int range = w->range;
     unsigned int lowvalue = w->lowvalue;
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
index 7c012a829..e66a2dbd8 100644
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -65,7 +65,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability)
     int count = br->count;
     unsigned int range = br->range;
     unsigned int lowvalue = br->lowvalue;
-    register unsigned int shift;
+    register int shift;
 
 #ifdef VP8_ENTROPY_STATS
 #if defined(SECTIONBITS_OUTPUT)
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 4c2acc774..95bb39400 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -2808,7 +2808,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
              * static scene.
              */
             if ( detect_transition_to_still( cpi, i,
-                                             (cpi->key_frame_frequency-i),
+                                             ((int)(cpi->key_frame_frequency) -
+                                              (int)i),
                                              loop_decay_rate,
                                              decay_accumulator ) )
             {
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 9063cea76..6507ae9f1 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1899,7 +1899,8 @@ static int calculate_final_rd_costs(int this_rd,
                     int prob_skip_cost;
 
                     prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
-                    prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
+                    prob_skip_cost -=
+                        (int)vp8_cost_bit(cpi->prob_skip_false, 0);
                     rd->rate2 += prob_skip_cost;
                     *other_cost += prob_skip_cost;
                 }
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 461462552..183dec4e7 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -298,196 +298,168 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
 
 static void filter_selectively_vert_row2(int subsampling_factor,
                                          uint8_t *s, int pitch,
-                                         unsigned int mask_16x16_l,
-                                         unsigned int mask_8x8_l,
-                                         unsigned int mask_4x4_l,
-                                         unsigned int mask_4x4_int_l,
-                                         const loop_filter_info_n *lfi_n,
+                                         unsigned int mask_16x16,
+                                         unsigned int mask_8x8,
+                                         unsigned int mask_4x4,
+                                         unsigned int mask_4x4_int,
+                                         const loop_filter_thresh *lfthr,
                                          const uint8_t *lfl) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff;
   const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  const unsigned int dual_one = 1 | (1 << lfl_forward);
   unsigned int mask;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-              mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
-
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                   lfi0->hev_thr);
-        } else if (mask_16x16_0 & 1) {
-          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
-                              lfi0->hev_thr);
+  uint8_t *ss[2];
+  ss[0] = s;
+
+  for (mask =
+           (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff;
+       mask; mask = (mask & ~dual_one) >> 1) {
+    if (mask & dual_one) {
+      const loop_filter_thresh *lfis[2];
+      lfis[0] = lfthr + *lfl;
+      lfis[1] = lfthr + *(lfl + lfl_forward);
+      ss[1] = ss[0] + 8 * pitch;
+
+      if (mask_16x16 & dual_one) {
+        if ((mask_16x16 & dual_one) == dual_one) {
+          vpx_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                   lfis[0]->hev_thr);
         } else {
-          vpx_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
-                              lfi1->lim, lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)];
+          vpx_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim,
+                              lfi->lim, lfi->hev_thr);
         }
       }
 
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_8x8_0 & 1) {
-          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+      if (mask_8x8 & dual_one) {
+        if ((mask_8x8 & dual_one) == dual_one) {
+          vpx_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                  lfis[0]->hev_thr, lfis[1]->mblim,
+                                  lfis[1]->lim, lfis[1]->hev_thr);
         } else {
-          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)];
+          vpx_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr);
         }
       }
 
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_0 & 1) {
-          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+      if (mask_4x4 & dual_one) {
+        if ((mask_4x4 & dual_one) == dual_one) {
+          vpx_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                  lfis[0]->hev_thr, lfis[1]->mblim,
+                                  lfis[1]->lim, lfis[1]->hev_thr);
         } else {
-          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)];
+          vpx_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr);
         }
       }
 
-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_int_0 & 1) {
-          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                             lfi0->hev_thr);
+      if (mask_4x4_int & dual_one) {
+        if ((mask_4x4_int & dual_one) == dual_one) {
+          vpx_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim,
+                                  lfis[0]->lim, lfis[0]->hev_thr,
+                                  lfis[1]->mblim, lfis[1]->lim,
+                                  lfis[1]->hev_thr);
         } else {
-          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)];
+          vpx_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, lfi->mblim,
+                             lfi->lim, lfi->hev_thr);
         }
       }
     }
 
-    s += 8;
+    ss[0] += 8;
     lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                                 uint16_t *s, int pitch,
-                                                unsigned int mask_16x16_l,
-                                                unsigned int mask_8x8_l,
-                                                unsigned int mask_4x4_l,
-                                                unsigned int mask_4x4_int_l,
-                                                const loop_filter_info_n *lfi_n,
+                                                unsigned int mask_16x16,
+                                                unsigned int mask_8x8,
+                                                unsigned int mask_4x4,
+                                                unsigned int mask_4x4_int,
+                                                const loop_filter_thresh *lfthr,
                                                 const uint8_t *lfl, int bd) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff;
   const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  const unsigned int dual_one = 1 | (1 << lfl_forward);
   unsigned int mask;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-       mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
-
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                          lfi0->hev_thr, bd);
-        } else if (mask_16x16_0 & 1) {
-          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr, bd);
+  uint16_t *ss[2];
+  ss[0] = s;
+
+  for (mask =
+           (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff;
+       mask; mask = (mask & ~dual_one) >> 1) {
+    if (mask & dual_one) {
+      const loop_filter_thresh *lfis[2];
+      lfis[0] = lfthr + *lfl;
+      lfis[1] = lfthr + *(lfl + lfl_forward);
+      ss[1] = ss[0] + 8 * pitch;
+
+      if (mask_16x16 & dual_one) {
+        if ((mask_16x16 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim,
+                                          lfis[0]->lim, lfis[0]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)];
+          vpx_highbd_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim,
+                                     lfi->lim, lfi->hev_thr, bd);
         }
       }
 
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_8x8_0 & 1) {
-          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+      if (mask_8x8 & dual_one) {
+        if ((mask_8x8 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)];
+          vpx_highbd_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim,
+                                    lfi->lim, lfi->hev_thr, bd);
         }
       }
 
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_0 & 1) {
-          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+      if (mask_4x4 & dual_one) {
+        if ((mask_4x4 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)];
+          vpx_highbd_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim,
+                                    lfi->lim, lfi->hev_thr, bd);
         }
       }
 
-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_int_0 & 1) {
-          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+      if (mask_4x4_int & dual_one) {
+        if ((mask_4x4_int & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)];
+          vpx_highbd_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch,
+                                    lfi->mblim, lfi->lim, lfi->hev_thr, bd);
         }
       }
     }
 
-    s += 8;
+    ss[0] += 8;
     lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -497,17 +469,17 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
                                      unsigned int mask_8x8,
                                      unsigned int mask_4x4,
                                      unsigned int mask_4x4_int,
-                                     const loop_filter_info_n *lfi_n,
+                                     const loop_filter_thresh *lfthr,
                                      const uint8_t *lfl) {
   unsigned int mask;
   int count;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
     count = 1;
     if (mask & 1) {
+      const loop_filter_thresh *lfi = lfthr + *lfl;
+
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
           vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
@@ -520,7 +492,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -549,7 +521,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -574,7 +546,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr);
         }
-      } else if (mask_4x4_int & 1) {
+      } else {
         vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                              lfi->hev_thr);
       }
@@ -594,17 +566,17 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
                                             unsigned int mask_8x8,
                                             unsigned int mask_4x4,
                                             unsigned int mask_4x4_int,
-                                            const loop_filter_info_n *lfi_n,
+                                            const loop_filter_thresh *lfthr,
                                             const uint8_t *lfl, int bd) {
   unsigned int mask;
   int count;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
     count = 1;
     if (mask & 1) {
+      const loop_filter_thresh *lfi = lfthr + *lfl;
+
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
           vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
@@ -617,7 +589,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -650,7 +622,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -679,7 +651,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
                                         lfi->lim, lfi->hev_thr, bd);
           }
         }
-      } else if (mask_4x4_int & 1) {
+      } else {
         vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, bd);
       }
@@ -1079,13 +1051,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
                                     unsigned int mask_8x8,
                                     unsigned int mask_4x4,
                                     unsigned int mask_4x4_int,
-                                    const loop_filter_info_n *lfi_n,
+                                    const loop_filter_thresh *lfthr,
                                     const uint8_t *lfl) {
   unsigned int mask;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi = lfthr + *lfl;
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
@@ -1113,13 +1085,13 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
                                            unsigned int mask_8x8,
                                            unsigned int mask_4x4,
                                            unsigned int mask_4x4_int,
-                                           const loop_filter_info_n *lfi_n,
+                                           const loop_filter_thresh *lfthr,
                                            const uint8_t *lfl, int bd) {
   unsigned int mask;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi = lfthr + *lfl;
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
@@ -1250,23 +1222,18 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
                                      mask_8x8_c & border_mask,
                                      mask_4x4_c & border_mask,
                                      mask_4x4_int[r],
-                                     &cm->lf_info, &lfl[r << 3],
+                                     cm->lf_info.lfthr, &lfl[r << 3],
                                      (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_vert(dst->buf, dst->stride,
                               mask_16x16_c & border_mask,
                               mask_8x8_c & border_mask,
                               mask_4x4_c & border_mask,
                               mask_4x4_int[r],
-                              &cm->lf_info, &lfl[r << 3]);
+                              cm->lf_info.lfthr, &lfl[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_vert(dst->buf, dst->stride,
-                            mask_16x16_c & border_mask,
-                            mask_8x8_c & border_mask,
-                            mask_4x4_c & border_mask,
-                            mask_4x4_int[r],
-                            &cm->lf_info, &lfl[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
     mi_8x8 += row_step_stride;
@@ -1299,23 +1266,18 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
                                       mask_8x8_r,
                                       mask_4x4_r,
                                       mask_4x4_int_r,
-                                      &cm->lf_info, &lfl[r << 3],
+                                      cm->lf_info.lfthr, &lfl[r << 3],
                                       (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_horiz(dst->buf, dst->stride,
                                mask_16x16_r,
                                mask_8x8_r,
                                mask_4x4_r,
                                mask_4x4_int_r,
-                               &cm->lf_info, &lfl[r << 3]);
+                               cm->lf_info.lfthr, &lfl[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride,
-                             mask_16x16_r,
-                             mask_8x8_r,
-                             mask_4x4_r,
-                             mask_4x4_int_r,
-                             &cm->lf_info, &lfl[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
   }
@@ -1337,27 +1299,29 @@ void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,
 
   // Vertical pass: do 2 rows at one time
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
-    unsigned int mask_16x16_l = mask_16x16 & 0xffff;
-    unsigned int mask_8x8_l = mask_8x8 & 0xffff;
-    unsigned int mask_4x4_l = mask_4x4 & 0xffff;
-    unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
-
-// Disable filtering on the leftmost column.
+    // Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      highbd_filter_selectively_vert_row2(
-          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+      highbd_filter_selectively_vert_row2(plane->subsampling_x,
+                                          CONVERT_TO_SHORTPTR(dst->buf),
+                                          dst->stride,
+                                          (unsigned int)mask_16x16,
+                                          (unsigned int)mask_8x8,
+                                          (unsigned int)mask_4x4,
+                                          (unsigned int)mask_4x4_int,
+                                          cm->lf_info.lfthr,
+                                          &lfm->lfl_y[r << 3],
+                                          (int)cm->bit_depth);
     } else {
-      filter_selectively_vert_row2(
-          plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-          mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride,
+                                   (unsigned int)mask_16x16,
+                                   (unsigned int)mask_8x8,
+                                   (unsigned int)mask_4x4,
+                                   (unsigned int)mask_4x4_int,
+                                   cm->lf_info.lfthr, &lfm->lfl_y[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_vert_row2(
-        plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-        mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 16 * dst->stride;
     mask_16x16 >>= 16;
@@ -1390,19 +1354,18 @@ void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,
 
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
-          mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
-          (int)cm->bit_depth);
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride, mask_16x16_r, mask_8x8_r,
+                                      mask_4x4_r, mask_4x4_int & 0xff,
+                                      cm->lf_info.lfthr, &lfm->lfl_y[r << 3],
+                                      (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                               &lfm->lfl_y[r << 3]);
+                               mask_4x4_r, mask_4x4_int & 0xff,
+                               cm->lf_info.lfthr, &lfm->lfl_y[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                             mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                             &lfm->lfl_y[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     dst->buf += 8 * dst->stride;
@@ -1436,38 +1399,35 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
       lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
     }
 
-    {
-      unsigned int mask_16x16_l = mask_16x16 & 0xff;
-      unsigned int mask_8x8_l = mask_8x8 & 0xff;
-      unsigned int mask_4x4_l = mask_4x4 & 0xff;
-      unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
-
-// Disable filtering on the leftmost column.
+    // Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        highbd_filter_selectively_vert_row2(
-            plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfl_uv[r << 1], (int)cm->bit_depth);
-      } else {
-        filter_selectively_vert_row2(
-            plane->subsampling_x, dst->buf, dst->stride,
-            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfl_uv[r << 1]);
-      }
-#else
-      filter_selectively_vert_row2(
-          plane->subsampling_x, dst->buf, dst->stride,
-          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfl_uv[r << 1]);
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert_row2(plane->subsampling_x,
+                                          CONVERT_TO_SHORTPTR(dst->buf),
+                                          dst->stride,
+                                          (unsigned int)mask_16x16,
+                                          (unsigned int)mask_8x8,
+                                          (unsigned int)mask_4x4,
+                                          (unsigned int)mask_4x4_int,
+                                          cm->lf_info.lfthr, &lfl_uv[r << 1],
+                                          (int)cm->bit_depth);
+    } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-
-      dst->buf += 16 * dst->stride;
-      mask_16x16 >>= 8;
-      mask_8x8 >>= 8;
-      mask_4x4 >>= 8;
-      mask_4x4_int >>= 8;
+      filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride,
+                                   (unsigned int)mask_16x16,
+                                   (unsigned int)mask_8x8,
+                                   (unsigned int)mask_4x4,
+                                   (unsigned int)mask_4x4_int,
+                                   cm->lf_info.lfthr, &lfl_uv[r << 1]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    dst->buf += 16 * dst->stride;
+    mask_16x16 >>= 8;
+    mask_8x8 >>= 8;
+    mask_4x4 >>= 8;
+    mask_4x4_int >>= 8;
   }
 
   // Horizontal pass
@@ -1499,17 +1459,16 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
     if (cm->use_highbitdepth) {
       highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                       dst->stride, mask_16x16_r, mask_8x8_r,
-                                      mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfl_uv[r << 1], (int)cm->bit_depth);
+                                      mask_4x4_r, mask_4x4_int_r,
+                                      cm->lf_info.lfthr, &lfl_uv[r << 1],
+                                      (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                               mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr,
                                &lfl_uv[r << 1]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                             mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     dst->buf += 8 * dst->stride;
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index c4d91c825..445785835 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -142,6 +142,7 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
   // 129  C   D  ..  W   X
   // 129  E   F  ..  U   V
   // 129  G   H  ..  S   T   T   T   T   T
+  // For 10 bit and 12 bit, 127 and 129 are replaced by base -1 and base + 1.
 
   // Get current frame pointer, width and height.
   if (plane == 0) {
@@ -177,7 +178,6 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
           left_col[i] = ref[i * ref_stride - 1];
       }
     } else {
-      // TODO(Peter): this value should probably change for high bitdepth
       vpx_memset16(left_col, base + 1, bs);
     }
   }
@@ -239,7 +239,6 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
           vpx_memset16(above_row + r, above_row[r - 1],
                        x0 + 2 * bs - frame_width);
         }
-        // TODO(Peter) this value should probably change for high bitdepth
         above_row[-1] = left_available ? above_ref[-1] : (base + 1);
       } else {
         /* faster path if the block does not need extension */
@@ -251,13 +250,11 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
             memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
           else
             vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-          // TODO(Peter): this value should probably change for high bitdepth
           above_row[-1] = left_available ? above_ref[-1] : (base + 1);
         }
       }
     } else {
       vpx_memset16(above_row, base - 1, bs * 2);
-      // TODO(Peter): this value should probably change for high bitdepth
       above_row[-1] = base - 1;
     }
   }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index d7f5a2113..7b9869b52 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -245,7 +245,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_fht16x16 sse2/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+  specialize qw/vp9_fwht4x4/, "$sse2_x86inc";
 } else {
   add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp9_fht4x4 sse2 msa/;
@@ -257,7 +257,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_fht16x16 sse2 msa/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
+  specialize qw/vp9_fwht4x4 msa/, "$sse2_x86inc";
 }
 
 #
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 6e21bb194..d63912932 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1339,22 +1339,23 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
   // has valid dimensions.
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
-    has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width,
-                                                ref_frame->buf->y_crop_height,
-                                                width, height);
+    has_valid_ref_frame |= (ref_frame->idx != INVALID_IDX &&
+                            valid_ref_frame_size(ref_frame->buf->y_crop_width,
+                                                 ref_frame->buf->y_crop_height,
+                                                 width, height));
   }
   if (!has_valid_ref_frame)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Referenced frame has invalid size");
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
-    if (!valid_ref_frame_img_fmt(
-            ref_frame->buf->bit_depth,
-            ref_frame->buf->subsampling_x,
-            ref_frame->buf->subsampling_y,
-            cm->bit_depth,
-            cm->subsampling_x,
-            cm->subsampling_y))
+    if (ref_frame->idx == INVALID_IDX ||
+        !valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
+                                 ref_frame->buf->subsampling_x,
+                                 ref_frame->buf->subsampling_y,
+                                 cm->bit_depth,
+                                 cm->subsampling_x,
+                                 cm->subsampling_y))
       vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                          "Referenced frame has incompatible color format");
   }
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 147743e8d..bbdfbb823 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -145,6 +145,11 @@ struct macroblock {
 
   uint8_t sb_is_skin;
 
+  // Used to save the status of whether a block has a low variance in
+  // choose_partitioning. 0 for 64x64, 1 2 for 64x32, 3 4 for 32x64, 5~8 for
+  // 32x32.
+  uint8_t variance_low[9];
+
   void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
   void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 67069e7c1..e3570504e 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -747,6 +747,8 @@ static int choose_partitioning(VP9_COMP *cpi,
   const uint8_t *d;
   int sp;
   int dp;
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
   int pixels_wide = 64, pixels_high = 64;
   int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
       cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
@@ -771,6 +773,10 @@ static int choose_partitioning(VP9_COMP *cpi,
     }
   }
 
+  for (i = 0; i < 9; i++) {
+    x->variance_low[i] = 0;
+  }
+
   if (xd->mb_to_right_edge < 0)
     pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0)
@@ -831,8 +837,10 @@ static int choose_partitioning(VP9_COMP *cpi,
       mi->ref_frame[0] = GOLDEN_FRAME;
       mi->mv[0].as_int = 0;
       y_sad = y_sad_g;
+      ref_frame_partition = GOLDEN_FRAME;
     } else {
       x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+      ref_frame_partition = LAST_FRAME;
     }
 
     set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
@@ -1073,6 +1081,34 @@ static int choose_partitioning(VP9_COMP *cpi,
       }
     }
   }
+
+  if (cpi->sf.short_circuit_low_temp_var) {
+    // Set low variance flag, only for blocks >= 32x32 and if LAST_FRAME was
+    // selected.
+    if (ref_frame_partition == LAST_FRAME) {
+      if (xd->mi[0]->sb_type == BLOCK_64X64 &&
+          vt.part_variances.none.variance < (thresholds[0] >> 1)) {
+        x->variance_low[0] = 1;
+      } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
+        if (vt.part_variances.horz[0].variance < (thresholds[0] >> 2))
+          x->variance_low[1] = 1;
+        if (vt.part_variances.horz[1].variance < (thresholds[0] >> 2))
+          x->variance_low[2] = 1;
+      } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
+        if (vt.part_variances.vert[0].variance < (thresholds[0] >> 2))
+          x->variance_low[3] = 1;
+        if (vt.part_variances.vert[1].variance < (thresholds[0] >> 2))
+          x->variance_low[4] = 1;
+      } else {
+        // 32x32
+        for (i = 0; i < 4; i++) {
+          if (!force_split[i + 1] &&
+              vt.split[i].part_variances.none.variance < (thresholds[1] >> 1))
+            x->variance_low[i + 5] = 1;
+        }
+      }
+    }
+  }
   return 0;
 }
 
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index f456f37a1..a70eaea3e 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -236,13 +236,6 @@ static void subtract_stats(FIRSTPASS_STATS *section,
   section->duration   -= frame->duration;
 }
 
-// Calculate the linear size relative to a baseline of 1080P
-#define BASE_SIZE 2073600.0  // 1920x1080
-static double get_linear_size_factor(const VP9_COMP *cpi) {
-  const double this_area = cpi->initial_width * cpi->initial_height;
-  return pow(this_area / BASE_SIZE, 0.5);
-}
-
 // Calculate an active area of the image that discounts formatting
 // bars and partially discounts other 0 energy areas.
 #define MIN_ACTIVE_AREA 0.5
@@ -1247,14 +1240,15 @@ static double calc_correction_factor(double err_per_mb,
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-#define ERR_DIVISOR         100.0
-static int get_twopass_worst_quality(const VP9_COMP *cpi,
+#define ERR_DIVISOR         115.0
+static int get_twopass_worst_quality(VP9_COMP *cpi,
                                      const double section_err,
                                      double inactive_zone,
-                                     int section_target_bandwidth,
-                                     double group_weight_factor) {
+                                     int section_target_bandwidth) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+
   // Clamp the target rate to VBR min / max limts.
   const int target_rate =
       vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
@@ -1269,7 +1263,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
     const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
     const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
-    double ediv_size_correction;
+    double last_group_rate_err;
     const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
                                          BPER_MB_NORMBITS) / active_mbs;
     int q;
@@ -1278,29 +1272,27 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
     if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
       is_svc_upper_layer = 1;
 
-    // Larger image formats are expected to be a little harder to code
-    // relatively given the same prediction error score. This in part at
-    // least relates to the increased size and hence coding overheads of
-    // motion vectors. Some account of this is made through adjustment of
-    // the error divisor.
-    ediv_size_correction =
-        VPXMAX(0.2, VPXMIN(5.0, get_linear_size_factor(cpi)));
-    if (ediv_size_correction < 1.0)
-      ediv_size_correction = -(1.0 / ediv_size_correction);
-    ediv_size_correction *= 4.0;
+    // based on recent history adjust expectations of bits per macroblock.
+    last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits /
+        DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
+    last_group_rate_err =
+        VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err));
+    twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
+    twopass->bpm_factor =
+        VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor));
 
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
       const double factor =
           calc_correction_factor(av_err_per_mb,
-                                 ERR_DIVISOR - ediv_size_correction,
+                                 ERR_DIVISOR,
                                  is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
                                  FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
                                  cpi->common.bit_depth);
       const int bits_per_mb =
         vp9_rc_bits_per_mb(INTER_FRAME, q,
-                           factor * speed_term * group_weight_factor,
+                           factor * speed_term * cpi->twopass.bpm_factor,
                            cpi->common.bit_depth);
       if (bits_per_mb <= target_norm_bits_per_mb)
         break;
@@ -2115,8 +2107,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     old_boost_score = boost_score;
   }
 
-  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
-
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
@@ -2184,24 +2174,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     const double group_av_inactive_zone =
       ((gf_group_inactive_zone_rows * 2) /
        (rc->baseline_gf_interval * (double)cm->mb_rows));
-
-    int tmp_q;
-    // rc factor is a weight factor that corrects for local rate control drift.
-    double rc_factor = 1.0;
-    if (rc->rate_error_estimate > 0) {
-      rc_factor = VPXMAX(RC_FACTOR_MIN,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    } else {
-      rc_factor = VPXMIN(RC_FACTOR_MAX,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    }
-    tmp_q =
-      get_twopass_worst_quality(cpi, group_av_err,
-                                (group_av_skip_pct + group_av_inactive_zone),
-                                vbr_group_bits_per_frame,
-                                twopass->kfgroup_inter_fraction * rc_factor);
+    int tmp_q =
+        get_twopass_worst_quality(cpi, group_av_err,
+                                  (group_av_skip_pct + group_av_inactive_zone),
+                                  vbr_group_bits_per_frame);
     twopass->active_worst_quality =
-        VPXMAX(tmp_q, twopass->active_worst_quality >> 1);
+        (tmp_q + (twopass->active_worst_quality * 3)) >> 2;
   }
 #endif
 
@@ -2243,6 +2221,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Default to starting GF groups at normal frame size.
     cpi->rc.next_frame_size_selector = UNSCALED;
   }
+
+  // Reset rolling actual and target bits counters for ARF groups.
+  twopass->rolling_arf_group_target_bits = 0;
+  twopass->rolling_arf_group_actual_bits = 0;
 }
 
 // Threshold for use of the lagging second reference frame. High second ref
@@ -2580,16 +2562,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
                                   rc->kf_boost, twopass->kf_group_bits);
 
-  // Work out the fraction of the kf group bits reserved for the inter frames
-  // within the group after discounting the bits for the kf itself.
-  if (twopass->kf_group_bits) {
-    twopass->kfgroup_inter_fraction =
-      (double)(twopass->kf_group_bits - kf_bits) /
-      (double)twopass->kf_group_bits;
-  } else {
-    twopass->kfgroup_inter_fraction = 1.0;
-  }
-
   twopass->kf_group_bits -= kf_bits;
 
   // Save the bits to spend on the key frame.
@@ -2683,21 +2655,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
-  int frames_left;
   FIRSTPASS_STATS this_frame;
 
   int target_rate;
   LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
         &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
 
-  if (lc != NULL) {
-    frames_left = (int)(twopass->total_stats.count -
-                  lc->current_video_frame_in_layer);
-  } else {
-    frames_left = (int)(twopass->total_stats.count -
-                  cm->current_video_frame);
-  }
-
   if (!twopass->stats_in)
     return;
 
@@ -2739,6 +2702,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     twopass->active_worst_quality = cpi->oxcf.cq_level;
   } else if (cm->current_video_frame == 0 ||
              (lc != NULL && lc->current_video_frame_in_layer == 0)) {
+    const int frames_left = (int)(twopass->total_stats.count -
+        ((lc != NULL) ? lc->current_video_frame_in_layer
+                      : cm->current_video_frame));
     // Special case code for first frame.
     const int section_target_bandwidth = (int)(twopass->bits_left /
                                                frames_left);
@@ -2750,10 +2716,17 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     const double section_inactive_zone =
       (twopass->total_left_stats.inactive_zone_rows * 2) /
       ((double)cm->mb_rows * section_length);
-    const int tmp_q =
-      get_twopass_worst_quality(cpi, section_error,
-                                section_intra_skip + section_inactive_zone,
-                                section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+    int tmp_q;
+
+    // Initialize bits per macro_block estimate correction factor.
+    twopass->bpm_factor = 1.0;
+    // Initiallize actual and target bits counters for ARF groups so that
+    // at the start we have a neutral bpm adjustment.
+    twopass->rolling_arf_group_target_bits = 1;
+    twopass->rolling_arf_group_actual_bits = 1;
+
+    tmp_q = get_twopass_worst_quality(cpi, section_error,
+        section_intra_skip + section_inactive_zone, section_target_bandwidth);
 
     twopass->active_worst_quality = tmp_q;
     twopass->baseline_active_worst_quality = tmp_q;
@@ -2871,6 +2844,10 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
   twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0);
 
+  // Target vs actual bits for this arf group.
+  twopass->rolling_arf_group_target_bits += rc->this_frame_target;
+  twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
+
   // Calculate the pct rc error.
   if (rc->total_actual_bits) {
     rc->rate_error_estimate =
@@ -2892,7 +2869,6 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
 
   // If the rate control is drifting consider adjustment to min or maxq.
   if ((cpi->oxcf.rc_mode != VPX_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
       !cpi->rc.is_src_frame_alt_ref) {
     const int maxq_adj_limit =
       rc->worst_quality - twopass->active_worst_quality;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 7eb44fa13..76072884d 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -39,8 +39,6 @@ typedef struct {
 } FIRSTPASS_MB_STATS;
 #endif
 
-#define VLOW_MOTION_THRESHOLD 950
-
 typedef struct {
   double frame;
   double weight;
@@ -124,14 +122,13 @@ typedef struct {
   // Error score of frames still to be coded in kf group
   int64_t kf_group_error_left;
 
-  // The fraction for a kf groups total bits allocated to the inter frames
-  double kfgroup_inter_fraction;
+  double bpm_factor;
+  int rolling_arf_group_target_bits;
+  int rolling_arf_group_actual_bits;
 
   int sr_update_lag;
-
   int kf_zeromotion_pct;
   int last_kfgroup_zeromotion_pct;
-  int gf_zeromotion_pct;
   int active_worst_quality;
   int baseline_active_worst_quality;
   int extend_minq;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 918b3b10b..554409b74 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1126,34 +1126,38 @@ static INLINE void find_predictors(VP9_COMP *cpi, MACROBLOCK *x,
                                  TileDataEnc *tile_data,
                                  int mi_row, int mi_col,
                                  struct buf_2d yv12_mb[4][MAX_MB_PLANE],
-                                 BLOCK_SIZE bsize) {
+                                 BLOCK_SIZE bsize,
+                                 int force_skip_low_temp_var) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   TileInfo *const tile_info = &tile_data->tile_info;
-// TODO(jingning) placeholder for inter-frame non-RD mode decision.
+  // TODO(jingning) placeholder for inter-frame non-RD mode decision.
   x->pred_mv_sad[ref_frame] = INT_MAX;
   frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   frame_mv[ZEROMV][ref_frame].as_int = 0;
-// this needs various further optimizations. to be continued..
+  // this needs various further optimizations. to be continued..
   if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
     int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
     const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
     vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
                          sf, sf);
-    if (cm->use_prev_frame_mvs)
+    if (cm->use_prev_frame_mvs) {
       vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
                        candidates, mi_row, mi_col,
                        x->mbmi_ext->mode_context);
-    else
-    const_motion[ref_frame] =
-        mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame,
-            candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col,
-            (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id));
+    } else {
+      const_motion[ref_frame] =
+          mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame,
+                     candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col,
+                     (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id));
+    }
     vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
                           &frame_mv[NEARESTMV][ref_frame],
                           &frame_mv[NEARMV][ref_frame]);
-    if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8) {
+    // Early exit for golden frame if force_skip_low_temp_var is set.
+    if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8 &&
+        !(force_skip_low_temp_var && ref_frame == GOLDEN_FRAME)) {
       vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
                   ref_frame, bsize);
     }
@@ -1266,6 +1270,39 @@ static void recheck_zeromv_after_denoising(
 }
 #endif  // CONFIG_VP9_TEMPORAL_DENOISING
 
+static INLINE int set_force_skip_low_temp_var(uint8_t *variance_low,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize) {
+  int force_skip_low_temp_var = 0;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  if (bsize == BLOCK_64X64) {
+    force_skip_low_temp_var = variance_low[0];
+  } else if (bsize == BLOCK_64X32) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[1];
+    } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[2];
+    }
+  } else if (bsize == BLOCK_32X64) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[3];
+    } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[4];
+    }
+  } else if (bsize == BLOCK_32X32) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[5];
+    } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[6];
+    } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[7];
+    } else if ((mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[8];
+    }
+  }
+  return force_skip_low_temp_var;
+}
+
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                          TileDataEnc *tile_data,
                          int mi_row, int mi_col, RD_COST *rd_cost,
@@ -1323,6 +1360,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int ref_frame_cost[MAX_REF_FRAMES];
   int svc_force_zero_mode[3] = {0};
   int perform_intra_pred = 1;
+  int use_golden_nonzeromv = 1;
+  int force_skip_low_temp_var = 0;
 #if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_PICKMODE_CTX_DEN ctx_den;
   int64_t zero_last_cost_orig = INT64_MAX;
@@ -1409,10 +1448,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
+  if (cpi->sf.short_circuit_low_temp_var) {
+    force_skip_low_temp_var =
+        set_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+  }
+
+  if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+      !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var))
+    use_golden_nonzeromv = 0;
+
   for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
     find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
                     &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col,
-                    yv12_mb, bsize);
+                    yv12_mb, bsize, force_skip_low_temp_var);
   }
 
   for (idx = 0; idx < RT_INTER_MODES; ++idx) {
@@ -1424,6 +1472,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int is_skippable;
     int this_early_term = 0;
     PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+
     if (cpi->use_svc)
       this_mode = ref_mode_set_svc[idx].pred_mode;
 
@@ -1442,17 +1491,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
       continue;
+
     if (const_motion[ref_frame] && this_mode == NEARMV)
       continue;
 
+    // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
+    // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+    // later.
+    if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
+        frame_mv[this_mode][ref_frame].as_int != 0) {
+      continue;
+    }
+
     if (cpi->use_svc) {
       if (svc_force_zero_mode[ref_frame - 1] &&
           frame_mv[this_mode][ref_frame].as_int != 0)
         continue;
     }
 
-    if (!(frame_mv[this_mode][ref_frame].as_int == 0 &&
-        ref_frame == LAST_FRAME)) {
+    if (!force_skip_low_temp_var &&
+        !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+          ref_frame == LAST_FRAME)) {
       i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
       if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
         if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
@@ -1543,7 +1602,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       }
     }
 
-    if (this_mode == NEWMV && ref_frame == LAST_FRAME &&
+    // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no
+    // need to compute best_pred_sad which is only used to skip golden NEWMV.
+    if (use_golden_nonzeromv && this_mode == NEWMV &&
+        ref_frame == LAST_FRAME &&
         frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) {
       const int pre_stride = xd->plane[0].pre[0].stride;
       const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
@@ -1555,21 +1617,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
     }
 
-    if (cpi->use_svc) {
-      if (this_mode == NEWMV && ref_frame == GOLDEN_FRAME &&
-          frame_mv[NEWMV][GOLDEN_FRAME].as_int != INVALID_MV) {
-        const int pre_stride = xd->plane[0].pre[0].stride;
-        const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
-            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.row >> 3) * pre_stride +
-            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.col >> 3);
-        best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
-                                               x->plane[0].src.stride,
-                                               pre_buf, pre_stride);
-        x->pred_mv_sad[GOLDEN_FRAME] = best_pred_sad;
-      }
-    }
-
-
     if (this_mode != NEARESTMV &&
         frame_mv[this_mode][ref_frame].as_int ==
             frame_mv[NEARESTMV][ref_frame].as_int)
@@ -1795,11 +1842,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
   }
   // Perform intra prediction search, if the best SAD is above a certain
-  // threshold.
-  if (perform_intra_pred &&
-      ((best_rdc.rdcost == INT64_MAX ||
-      (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
-       bsize <= cpi->sf.max_intra_bsize)))) {
+  // threshold. Skip intra prediction if force_skip_low_temp_var is set.
+  if (!force_skip_low_temp_var && perform_intra_pred &&
+      (best_rdc.rdcost == INT64_MAX ||
+       (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+        bsize <= cpi->sf.max_intra_bsize))) {
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0, 0 };
     int i;
     TX_SIZE best_intra_tx_size = TX_SIZES;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index b8a5e6e7d..6c3f91951 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1160,8 +1160,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
 
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
-  if ((cpi->oxcf.rc_mode != VPX_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+  if (cpi->oxcf.rc_mode != VPX_Q) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
          (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
@@ -1559,12 +1558,13 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
     if (cm->current_video_frame > 30 &&
         rc->avg_frame_qindex[INTER_FRAME] > (7 * rc->worst_quality) >> 3 &&
         rc->avg_size_inter > (5 * rc->avg_frame_bandwidth) >> 1) {
-        rc->baseline_gf_interval = (3 * rc->baseline_gf_interval) >> 1;
+        rc->baseline_gf_interval =
+            VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1);
     } else if (cm->current_video_frame > 30 &&
                rc->avg_frame_low_motion < 20) {
       // Decrease boost and gf interval for high motion case.
       rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
-      rc->baseline_gf_interval = VPXMIN(6, rc->baseline_gf_interval >> 1);
+      rc->baseline_gf_interval = VPXMAX(5, rc->baseline_gf_interval >> 1);
     }
     adjust_gf_key_frame(cpi);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
@@ -1890,27 +1890,28 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
   RATE_CONTROL *const rc = &cpi->rc;
   int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
   int max_delta;
-  double position_factor = 1.0;
-
-  // How far through the clip are we.
-  // This number is used to damp the per frame rate correction.
-  // Range 0 - 1.0
-  if (cpi->twopass.total_stats.count) {
-    position_factor = sqrt((double)cpi->common.current_video_frame /
-                           cpi->twopass.total_stats.count);
-  }
-  max_delta = (int)(position_factor *
-                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
-
-  // vbr_bits_off_target > 0 means we have extra bits to spend
-  if (vbr_bits_off_target > 0) {
-    *this_frame_target +=
-      (vbr_bits_off_target > max_delta) ? max_delta
-                                        : (int)vbr_bits_off_target;
-  } else {
-    *this_frame_target -=
-      (vbr_bits_off_target < -max_delta) ? max_delta
-                                         : (int)-vbr_bits_off_target;
+  int frame_window = VPXMIN(16,
+      ((int)cpi->twopass.total_stats.count - cpi->common.current_video_frame));
+
+  // Calcluate the adjustment to rate for this frame.
+  if (frame_window > 0) {
+    max_delta = (vbr_bits_off_target > 0)
+        ? (int)(vbr_bits_off_target / frame_window)
+        : (int)(-vbr_bits_off_target / frame_window);
+
+    max_delta = VPXMIN(max_delta,
+        ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+    // vbr_bits_off_target > 0 means we have extra bits to spend
+    if (vbr_bits_off_target > 0) {
+      *this_frame_target +=
+        (vbr_bits_off_target > max_delta) ? max_delta
+                                          : (int)vbr_bits_off_target;
+    } else {
+      *this_frame_target -=
+        (vbr_bits_off_target < -max_delta) ? max_delta
+                                           : (int)-vbr_bits_off_target;
+    }
   }
 
   // Fast redistribution of bits arising from massive local undershoot.
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 02be3c3f9..0090b4f40 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -429,6 +429,11 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->mv.search_method = NSTEP;
     sf->mv.reduce_first_step_size = 1;
     sf->skip_encode_sb = 0;
+    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.pass == 0 &&
+        content != VP9E_CONTENT_SCREEN) {
+      // Enable short circuit for low temporal variance.
+      sf->short_circuit_low_temp_var = 1;
+    }
   }
 
   if (speed >= 7) {
@@ -554,6 +559,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->default_interp_filter = SWITCHABLE;
   sf->simple_model_rd_from_var = 0;
   sf->short_circuit_flat_blocks = 0;
+  sf->short_circuit_low_temp_var = 0;
 
   // Some speed-up features even for best quality as minimal impact on quality.
   sf->adaptive_rd_thresh = 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 90b32164b..71ff0ac10 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -446,6 +446,10 @@ typedef struct SPEED_FEATURES {
   // Skip a number of expensive mode evaluations for blocks with zero source
   // variance.
   int short_circuit_flat_blocks;
+
+  // Skip a number of expensive mode evaluations for blocks with very low
+  // temporal variance.
+  int short_circuit_low_temp_var;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index fa37b6fed..fa37b6fed 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm
deleted file mode 100644
index 7a7a6b655..000000000
--- a/vp9/encoder/x86/vp9_dct_mmx.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
-  paddw           m0,        m1
-  movq            m4,        m0
-  psubw           m3,        m2
-  psubw           m4,        m3
-  psraw           m4,        1
-  movq            m5,        m4
-  psubw           m5,        m1 ;b1
-  psubw           m4,        m2 ;c1
-  psubw           m0,        m4
-  paddw           m3,        m5
-                                ; m0 a0
-  SWAP            1,         4  ; m1 c1
-  SWAP            2,         3  ; m2 d1
-  SWAP            3,         5  ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  movq            m4,        m0
-  movq            m5,        m2
-  punpcklwd       m4,        m1
-  punpckhwd       m0,        m1
-  punpcklwd       m5,        m3
-  punpckhwd       m2,        m3
-  movq            m1,        m4
-  movq            m3,        m0
-  punpckldq       m1,        m5
-  punpckhdq       m4,        m5
-  punpckldq       m3,        m2
-  punpckhdq       m0,        m2
-  SWAP            2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
-  lea             r3q,       [inputq + strideq*4]
-  movq            m0,        [inputq] ;a1
-  movq            m1,        [inputq + strideq*2] ;b1
-  movq            m2,        [r3q] ;c1
-  movq            m3,        [r3q + strideq*2] ;d1
-
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-
-  psllw           m0,        2
-  psllw           m1,        2
-  psllw           m2,        2
-  psllw           m3,        2
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m0
-  pcmpgtw         m5,             m1
-  movq            m6,             m0
-  movq            m7,             m1
-  punpcklwd       m0,             m4
-  punpcklwd       m1,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m6
-  movq            [outputq + 16], m1
-  movq            [outputq + 24], m7
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m2
-  pcmpgtw         m5,             m3
-  movq            m6,             m2
-  movq            m7,             m3
-  punpcklwd       m2,             m4
-  punpcklwd       m3,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq + 32], m2
-  movq            [outputq + 40], m6
-  movq            [outputq + 48], m3
-  movq            [outputq + 56], m7
-%else
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m1
-  movq            [outputq + 16], m2
-  movq            [outputq + 24], m3
-%endif
-
-  RET
diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm
new file mode 100644
index 000000000..d3b2a271b
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -0,0 +1,87 @@
+;
+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1
+  movq            m4,        m0
+  psubw           m3,        m2
+  psubw           m4,        m3
+  psraw           m4,        1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4
+  paddw           m3,        m5
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+                                ; 00 01 02 03
+                                ; 10 11 12 13
+                                ; 20 21 22 23
+                                ; 30 31 32 33
+  punpcklwd       m0,        m1 ; 00 10 01 11  02 12 03 13
+  punpcklwd       m2,        m3 ; 20 30 21 31  22 32 23 33
+  mova            m1,        m0
+  punpckldq       m0,        m2 ; 00 10 20 30  01 11 21 31
+  punpckhdq       m1,        m2 ; 02 12 22 32  03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+; TODO(linfeng): The duplication with vp10 should be resolved.
+  lea             r3q,       [inputq + strideq*4]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  SWAP            1,         2
+  psrldq          m1,        m0, 8
+  psrldq          m3,        m2, 8
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2
+  psllw           m1,        2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; sign extension
+  mova            m2,             m0
+  mova            m3,             m1
+  punpcklwd       m0,             m0
+  punpcklwd       m1,             m1
+  punpckhwd       m2,             m2
+  punpckhwd       m3,             m3
+  psrad           m0,             16
+  psrad           m1,             16
+  psrad           m2,             16
+  psrad           m3,             16
+  mova            [outputq],      m0
+  mova            [outputq + 16], m2
+  mova            [outputq + 32], m1
+  mova            [outputq + 48], m3
+%else
+  mova            [outputq],      m0
+  mova            [outputq + 16], m1
+%endif
+
+  RET
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
deleted file mode 100644
index 0bc417fc1..000000000
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#if defined(_MSC_VER)
-# include <intrin.h>
-#endif
-#include <emmintrin.h>
-#include <smmintrin.h>
-
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __GNUC__
-# define LIKELY(v)    __builtin_expect(v, 1)
-# define UNLIKELY(v)  __builtin_expect(v, 0)
-#else
-# define LIKELY(v)    (v)
-# define UNLIKELY(v)  (v)
-#endif
-
-static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
-  int_mv result;
-  result.as_mv.row = row;
-  result.as_mv.col = col;
-  return result;
-}
-
-static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
-  // This is simplified from the C implementation to utilise that
-  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[2]  and
-  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
-  return mv.as_int == 0 ? 0 : 1;
-}
-
-static INLINE int mv_cost(const int_mv mv,
-                          const int *joint_cost, int *const comp_cost[2]) {
-  return joint_cost[get_mv_joint(mv)] +
-         comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col];
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
-                          int sad_per_bit) {
-  const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row,
-                                  mv.as_mv.col - ref->col);
-  return ROUND_POWER_OF_TWO((unsigned)mv_cost(diff, x->nmvjointsadcost,
-                                              x->nmvsadcost) *
-                                              sad_per_bit, VP9_PROB_COST_SHIFT);
-}
-
-/*****************************************************************************
- * This function utilises 3 properties of the cost function lookup tables,   *
- * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
- * vp9_encoder.c.                                                            *
- * For the joint cost:                                                       *
- *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
- * For the component costs:                                                  *
- *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
- *         (Equal costs for both components)                                 *
- *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
- *         (Cost function is even)                                           *
- * If these do not hold, then this function cannot be used without           *
- * modification, in which case you can revert to using the C implementation, *
- * which does not rely on these properties.                                  *
- *****************************************************************************/
-int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
-                               const search_site_config *cfg,
-                               MV *ref_mv, MV *best_mv, int search_param,
-                               int sad_per_bit, int *num00,
-                               const vp9_variance_fn_ptr_t *fn_ptr,
-                               const MV *center_mv) {
-  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
-  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
-  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
-  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
-
-  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
-
-  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
-  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
-
-  // search_param determines the length of the initial step and hence the number
-  // of iterations.
-  // 0 = initial step (MAX_FIRST_STEP) pel
-  // 1 = (MAX_FIRST_STEP/2) pel,
-  // 2 = (MAX_FIRST_STEP/4) pel...
-  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
-  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
-  const int tot_steps = cfg->total_steps - search_param;
-
-  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
-                                        center_mv->col >> 3);
-  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
-
-  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
-  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
-
-  int_mv bmv = pack_int_mv(ref_row, ref_col);
-  int_mv new_bmv = bmv;
-  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
-
-  const int what_stride = x->plane[0].src.stride;
-  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
-                                 ref_row * in_what_stride + ref_col;
-
-  // Work out the start point for the search
-  const uint8_t *best_address = in_what;
-  const uint8_t *new_best_address = best_address;
-#if ARCH_X86_64
-  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
-  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
-  unsigned int best_sad;
-
-  int i;
-  int j;
-  int step;
-
-  // Check the prerequisite cost function properties that are easy to check
-  // in an assert. See the function-level documentation for details on all
-  // prerequisites.
-  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
-  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
-
-  // Check the starting position
-  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
-  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
-
-  *num00 = 0;
-
-  for (i = 0, step = 0; step < tot_steps; step++) {
-    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
-      __m128i v_sad_d;
-      __m128i v_cost_d;
-      __m128i v_outside_d;
-      __m128i v_inside_d;
-      __m128i v_diff_mv_w;
-#if ARCH_X86_64
-      __m128i v_blocka[2];
-#else
-      __m128i v_blocka[1];
-#endif
-
-      // Compute the candidate motion vectors
-      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
-      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
-      // Clamp them to the search bounds
-      __m128i v_these_mv_clamp_w = v_these_mv_w;
-      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
-      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
-      // The ones that did not change are inside the search area
-      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
-
-      // If none of them are inside, then move on
-      if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
-        continue;
-      }
-
-      // The inverse mask indicates which of the MVs are outside
-      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
-      // Shift right to keep the sign bit clear, we will use this later
-      // to set the cost to the maximum value.
-      v_outside_d = _mm_srli_epi32(v_outside_d, 1);
-
-      // Compute the difference MV
-      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
-      // We utilise the fact that the cost function is even, and use the
-      // absolute difference. This allows us to use unsigned indexes later
-      // and reduces cache pressure somewhat as only a half of the table
-      // is ever referenced.
-      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
-
-      // Compute the SIMD pointer offsets.
-      {
-#if ARCH_X86_64  //  sizeof(intptr_t) == 8
-        // Load the offsets
-        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
-        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
-        // Set the ones falling outside to zero
-        v_bo10_q = _mm_and_si128(v_bo10_q,
-                                 _mm_cvtepi32_epi64(v_inside_d));
-        v_bo32_q = _mm_and_si128(v_bo32_q,
-                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
-        // Compute the candidate addresses
-        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
-        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
-#else  // ARCH_X86 //  sizeof(intptr_t) == 4
-        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
-        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
-        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
-#endif
-      }
-
-      fn_ptr->sdx4df(what, what_stride,
-                     (const uint8_t **)&v_blocka[0], in_what_stride,
-                     (uint32_t*)&v_sad_d);
-
-      // Look up the component cost of the residual motion vector
-      {
-        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
-        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
-        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
-        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
-        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
-        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
-        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
-        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
-
-        // Note: This is a use case for vpgather in AVX2
-        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
-        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
-        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
-        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
-
-        __m128i v_cost_10_d, v_cost_32_d;
-
-        v_cost_10_d = _mm_cvtsi32_si128(cost0);
-        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
-
-        v_cost_32_d = _mm_cvtsi32_si128(cost2);
-        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
-
-        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
-      }
-
-      // Now add in the joint cost
-      {
-        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
-                                                _mm_setzero_si128());
-        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
-                                                       v_joint_cost_0_d,
-                                                       v_sel_d);
-        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
-      }
-
-      // Multiply by sad_per_bit
-      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
-      // ROUND_POWER_OF_TWO(v_cost_d, 8)
-      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
-      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
-      // Add the cost to the sad
-      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
-
-      // Make the motion vectors outside the search area have max cost
-      // by or'ing in the comparison mask, this way the minimum search won't
-      // pick them.
-      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
-
-      // Find the minimum value and index horizontally in v_sad_d
-      {
-        // Try speculatively on 16 bits, so we can use the minpos intrinsic
-        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
-        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
-
-        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
-        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
-
-        // If the local best value is not saturated, just use it, otherwise
-        // find the horizontal minimum again the hard way on 32 bits.
-        // This is executed rarely.
-        if (UNLIKELY(local_best_sad == 0xffff)) {
-          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
-
-          v_loval_d = v_sad_d;
-          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
-          v_hival_d = _mm_srli_si128(v_loval_d, 8);
-          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
-
-          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
-          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
-          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-          v_hival_d = _mm_srli_si128(v_loval_d, 4);
-          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
-
-          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
-          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
-          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-
-          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
-          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
-        }
-
-        // Update the global minimum if the local minimum is smaller
-        if (LIKELY(local_best_sad < best_sad)) {
-          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
-          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
-
-          best_sad = local_best_sad;
-        }
-      }
-    }
-
-    bmv = new_bmv;
-    best_address = new_best_address;
-
-    v_bmv_w = _mm_set1_epi32(bmv.as_int);
-#if ARCH_X86_64
-    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
-    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
-    if (UNLIKELY(best_address == in_what)) {
-      (*num00)++;
-    }
-  }
-
-  *best_mv = bmv.as_mv;
-  return best_sad;
-}
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 2930c23dd..7643b48df 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -96,13 +96,12 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
@@ -117,7 +116,7 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
 endif
 endif
 
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index ff7533745..ad91eadbe 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -35,10 +35,10 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, 8);
-    op[1] = WRAPLOW(b1, 8);
-    op[2] = WRAPLOW(c1, 8);
-    op[3] = WRAPLOW(d1, 8);
+    op[0] = WRAPLOW(a1);
+    op[1] = WRAPLOW(b1);
+    op[2] = WRAPLOW(c1);
+    op[3] = WRAPLOW(d1);
     ip += 4;
     op += 4;
   }
@@ -76,8 +76,8 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, 8);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+  op[0] = WRAPLOW(a1);
+  op[1] = op[2] = op[3] = WRAPLOW(e1);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -98,18 +98,18 @@ void idct4_c(const tran_low_t *input, tran_low_t *output) {
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step[3] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], 8);
-  output[1] = WRAPLOW(step[1] + step[2], 8);
-  output[2] = WRAPLOW(step[1] - step[2], 8);
-  output[3] = WRAPLOW(step[0] - step[3], 8);
+  output[0] = WRAPLOW(step[0] + step[3]);
+  output[1] = WRAPLOW(step[1] + step[2]);
+  output[2] = WRAPLOW(step[1] - step[2]);
+  output[3] = WRAPLOW(step[0] - step[3]);
 }
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -141,8 +141,8 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                          int dest_stride) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -164,48 +164,48 @@ void idct8_c(const tran_low_t *input, tran_low_t *output) {
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   temp1 = (step1[0] + step1[2]) * cospi_16_64;
   temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], 8);
-  output[1] = WRAPLOW(step1[1] + step1[6], 8);
-  output[2] = WRAPLOW(step1[2] + step1[5], 8);
-  output[3] = WRAPLOW(step1[3] + step1[4], 8);
-  output[4] = WRAPLOW(step1[3] - step1[4], 8);
-  output[5] = WRAPLOW(step1[2] - step1[5], 8);
-  output[6] = WRAPLOW(step1[1] - step1[6], 8);
-  output[7] = WRAPLOW(step1[0] - step1[7], 8);
+  output[0] = WRAPLOW(step1[0] + step1[7]);
+  output[1] = WRAPLOW(step1[1] + step1[6]);
+  output[2] = WRAPLOW(step1[2] + step1[5]);
+  output[3] = WRAPLOW(step1[3] + step1[4]);
+  output[4] = WRAPLOW(step1[3] - step1[4]);
+  output[5] = WRAPLOW(step1[2] - step1[5]);
+  output[6] = WRAPLOW(step1[1] - step1[6]);
+  output[7] = WRAPLOW(step1[0] - step1[7]);
 }
 
 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -236,8 +236,8 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -277,10 +277,10 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) {
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
-  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+  output[2] = WRAPLOW(dct_const_round_shift(s2));
+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }
 
 void iadst8_c(const tran_low_t *input, tran_low_t *output) {
@@ -311,14 +311,14 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
   s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
 
   // stage 2
   s0 = (int)x0;
@@ -330,14 +330,14 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
   s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
 
-  x0 = WRAPLOW(s0 + s2, 8);
-  x1 = WRAPLOW(s1 + s3, 8);
-  x2 = WRAPLOW(s0 - s2, 8);
-  x3 = WRAPLOW(s1 - s3, 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
 
   // stage 3
   s2 = (int)(cospi_16_64 * (x2 + x3));
@@ -345,19 +345,19 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   s6 = (int)(cospi_16_64 * (x6 + x7));
   s7 = (int)(cospi_16_64 * (x6 - x7));
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x4, 8);
-  output[2] = WRAPLOW(x6, 8);
-  output[3] = WRAPLOW(-x2, 8);
-  output[4] = WRAPLOW(x3, 8);
-  output[5] = WRAPLOW(-x7, 8);
-  output[6] = WRAPLOW(x5, 8);
-  output[7] = WRAPLOW(-x1, 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x4);
+  output[2] = WRAPLOW(x6);
+  output[3] = WRAPLOW(-x2);
+  output[4] = WRAPLOW(x3);
+  output[5] = WRAPLOW(-x7);
+  output[6] = WRAPLOW(x5);
+  output[7] = WRAPLOW(-x1);
 }
 
 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -420,23 +420,23 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 3
   step1[0] = step2[0];
@@ -446,109 +446,109 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], 8);
-  output[1] = WRAPLOW(step2[1] + step2[14], 8);
-  output[2] = WRAPLOW(step2[2] + step2[13], 8);
-  output[3] = WRAPLOW(step2[3] + step2[12], 8);
-  output[4] = WRAPLOW(step2[4] + step2[11], 8);
-  output[5] = WRAPLOW(step2[5] + step2[10], 8);
-  output[6] = WRAPLOW(step2[6] + step2[9], 8);
-  output[7] = WRAPLOW(step2[7] + step2[8], 8);
-  output[8] = WRAPLOW(step2[7] - step2[8], 8);
-  output[9] = WRAPLOW(step2[6] - step2[9], 8);
-  output[10] = WRAPLOW(step2[5] - step2[10], 8);
-  output[11] = WRAPLOW(step2[4] - step2[11], 8);
-  output[12] = WRAPLOW(step2[3] - step2[12], 8);
-  output[13] = WRAPLOW(step2[2] - step2[13], 8);
-  output[14] = WRAPLOW(step2[1] - step2[14], 8);
-  output[15] = WRAPLOW(step2[0] - step2[15], 8);
+  output[0] = WRAPLOW(step2[0] + step2[15]);
+  output[1] = WRAPLOW(step2[1] + step2[14]);
+  output[2] = WRAPLOW(step2[2] + step2[13]);
+  output[3] = WRAPLOW(step2[3] + step2[12]);
+  output[4] = WRAPLOW(step2[4] + step2[11]);
+  output[5] = WRAPLOW(step2[5] + step2[10]);
+  output[6] = WRAPLOW(step2[6] + step2[9]);
+  output[7] = WRAPLOW(step2[7] + step2[8]);
+  output[8] = WRAPLOW(step2[7] - step2[8]);
+  output[9] = WRAPLOW(step2[6] - step2[9]);
+  output[10] = WRAPLOW(step2[5] - step2[10]);
+  output[11] = WRAPLOW(step2[4] - step2[11]);
+  output[12] = WRAPLOW(step2[3] - step2[12]);
+  output[13] = WRAPLOW(step2[2] - step2[13]);
+  output[14] = WRAPLOW(step2[1] - step2[14]);
+  output[15] = WRAPLOW(step2[0] - step2[15]);
 }
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
@@ -625,22 +625,22 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
 
   // stage 2
   s0 = x0;
@@ -660,22 +660,22 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, 8);
-  x1 = WRAPLOW(s1 + s5, 8);
-  x2 = WRAPLOW(s2 + s6, 8);
-  x3 = WRAPLOW(s3 + s7, 8);
-  x4 = WRAPLOW(s0 - s4, 8);
-  x5 = WRAPLOW(s1 - s5, 8);
-  x6 = WRAPLOW(s2 - s6, 8);
-  x7 = WRAPLOW(s3 - s7, 8);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+  x0 = WRAPLOW(s0 + s4);
+  x1 = WRAPLOW(s1 + s5);
+  x2 = WRAPLOW(s2 + s6);
+  x3 = WRAPLOW(s3 + s7);
+  x4 = WRAPLOW(s0 - s4);
+  x5 = WRAPLOW(s1 - s5);
+  x6 = WRAPLOW(s2 - s6);
+  x7 = WRAPLOW(s3 - s7);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
 
   // stage 3
   s0 = x0;
@@ -695,22 +695,22 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(check_range(s0 + s2), 8);
-  x1 = WRAPLOW(check_range(s1 + s3), 8);
-  x2 = WRAPLOW(check_range(s0 - s2), 8);
-  x3 = WRAPLOW(check_range(s1 - s3), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
-  x8 = WRAPLOW(check_range(s8 + s10), 8);
-  x9 = WRAPLOW(check_range(s9 + s11), 8);
-  x10 = WRAPLOW(check_range(s8 - s10), 8);
-  x11 = WRAPLOW(check_range(s9 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+  x8 = WRAPLOW(s8 + s10);
+  x9 = WRAPLOW(s9 + s11);
+  x10 = WRAPLOW(s8 - s10);
+  x11 = WRAPLOW(s9 - s11);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -722,31 +722,31 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
-
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x8, 8);
-  output[2] = WRAPLOW(x12, 8);
-  output[3] = WRAPLOW(-x4, 8);
-  output[4] = WRAPLOW(x6, 8);
-  output[5] = WRAPLOW(x14, 8);
-  output[6] = WRAPLOW(x10, 8);
-  output[7] = WRAPLOW(x2, 8);
-  output[8] = WRAPLOW(x3, 8);
-  output[9] = WRAPLOW(x11, 8);
-  output[10] = WRAPLOW(x15, 8);
-  output[11] = WRAPLOW(x7, 8);
-  output[12] = WRAPLOW(x5, 8);
-  output[13] = WRAPLOW(-x13, 8);
-  output[14] = WRAPLOW(x9, 8);
-  output[15] = WRAPLOW(-x1, 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+  x10 = WRAPLOW(dct_const_round_shift(s10));
+  x11 = WRAPLOW(dct_const_round_shift(s11));
+  x14 = WRAPLOW(dct_const_round_shift(s14));
+  x15 = WRAPLOW(dct_const_round_shift(s15));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x8);
+  output[2] = WRAPLOW(x12);
+  output[3] = WRAPLOW(-x4);
+  output[4] = WRAPLOW(x6);
+  output[5] = WRAPLOW(x14);
+  output[6] = WRAPLOW(x10);
+  output[7] = WRAPLOW(x2);
+  output[8] = WRAPLOW(x3);
+  output[9] = WRAPLOW(x11);
+  output[10] = WRAPLOW(x15);
+  output[11] = WRAPLOW(x7);
+  output[12] = WRAPLOW(x5);
+  output[13] = WRAPLOW(-x13);
+  output[14] = WRAPLOW(x9);
+  output[15] = WRAPLOW(-x1);
 }
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
@@ -779,8 +779,8 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -813,43 +813,43 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   step2[0] = step1[0];
@@ -863,40 +863,40 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
-  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
-  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
-  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
-  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
-  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
-  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
-  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
-  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
-  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
-  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
-  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step2[16] = WRAPLOW(step1[16] + step1[17]);
+  step2[17] = WRAPLOW(step1[16] - step1[17]);
+  step2[18] = WRAPLOW(-step1[18] + step1[19]);
+  step2[19] = WRAPLOW(step1[18] + step1[19]);
+  step2[20] = WRAPLOW(step1[20] + step1[21]);
+  step2[21] = WRAPLOW(step1[20] - step1[21]);
+  step2[22] = WRAPLOW(-step1[22] + step1[23]);
+  step2[23] = WRAPLOW(step1[22] + step1[23]);
+  step2[24] = WRAPLOW(step1[24] + step1[25]);
+  step2[25] = WRAPLOW(step1[24] - step1[25]);
+  step2[26] = WRAPLOW(-step1[26] + step1[27]);
+  step2[27] = WRAPLOW(step1[26] + step1[27]);
+  step2[28] = WRAPLOW(step1[28] + step1[29]);
+  step2[29] = WRAPLOW(step1[28] - step1[29]);
+  step2[30] = WRAPLOW(-step1[30] + step1[31]);
+  step2[31] = WRAPLOW(step1[30] + step1[31]);
 
   // stage 3
   step1[0] = step2[0];
@@ -906,42 +906,42 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -950,87 +950,87 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
-  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
-  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
-  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
-  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
-  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
-  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
-  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
-  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[19]);
+  step2[17] = WRAPLOW(step1[17] + step1[18]);
+  step2[18] = WRAPLOW(step1[17] - step1[18]);
+  step2[19] = WRAPLOW(step1[16] - step1[19]);
+  step2[20] = WRAPLOW(-step1[20] + step1[23]);
+  step2[21] = WRAPLOW(-step1[21] + step1[22]);
+  step2[22] = WRAPLOW(step1[21] + step1[22]);
+  step2[23] = WRAPLOW(step1[20] + step1[23]);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27]);
+  step2[25] = WRAPLOW(step1[25] + step1[26]);
+  step2[26] = WRAPLOW(step1[25] - step1[26]);
+  step2[27] = WRAPLOW(step1[24] - step1[27]);
+  step2[28] = WRAPLOW(-step1[28] + step1[31]);
+  step2[29] = WRAPLOW(-step1[29] + step1[30]);
+  step2[30] = WRAPLOW(step1[29] + step1[30]);
+  step2[31] = WRAPLOW(step1[28] + step1[31]);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -1039,62 +1039,62 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
-  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
-  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
-  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
-  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
-  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
-  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
-  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
-  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
-  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[23]);
+  step2[17] = WRAPLOW(step1[17] + step1[22]);
+  step2[18] = WRAPLOW(step1[18] + step1[21]);
+  step2[19] = WRAPLOW(step1[19] + step1[20]);
+  step2[20] = WRAPLOW(step1[19] - step1[20]);
+  step2[21] = WRAPLOW(step1[18] - step1[21]);
+  step2[22] = WRAPLOW(step1[17] - step1[22]);
+  step2[23] = WRAPLOW(step1[16] - step1[23]);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31]);
+  step2[25] = WRAPLOW(-step1[25] + step1[30]);
+  step2[26] = WRAPLOW(-step1[26] + step1[29]);
+  step2[27] = WRAPLOW(-step1[27] + step1[28]);
+  step2[28] = WRAPLOW(step1[27] + step1[28]);
+  step2[29] = WRAPLOW(step1[26] + step1[29]);
+  step2[30] = WRAPLOW(step1[25] + step1[30]);
+  step2[31] = WRAPLOW(step1[24] + step1[31]);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
-  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
-  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
-  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
-  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
-  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
-  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
-  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
-  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
-  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
-  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
-  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
-  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
-  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[15]);
+  step1[1] = WRAPLOW(step2[1] + step2[14]);
+  step1[2] = WRAPLOW(step2[2] + step2[13]);
+  step1[3] = WRAPLOW(step2[3] + step2[12]);
+  step1[4] = WRAPLOW(step2[4] + step2[11]);
+  step1[5] = WRAPLOW(step2[5] + step2[10]);
+  step1[6] = WRAPLOW(step2[6] + step2[9]);
+  step1[7] = WRAPLOW(step2[7] + step2[8]);
+  step1[8] = WRAPLOW(step2[7] - step2[8]);
+  step1[9] = WRAPLOW(step2[6] - step2[9]);
+  step1[10] = WRAPLOW(step2[5] - step2[10]);
+  step1[11] = WRAPLOW(step2[4] - step2[11]);
+  step1[12] = WRAPLOW(step2[3] - step2[12]);
+  step1[13] = WRAPLOW(step2[2] - step2[13]);
+  step1[14] = WRAPLOW(step2[1] - step2[14]);
+  step1[15] = WRAPLOW(step2[0] - step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -1102,58 +1102,58 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], 8);
-  output[1] = WRAPLOW(step1[1] + step1[30], 8);
-  output[2] = WRAPLOW(step1[2] + step1[29], 8);
-  output[3] = WRAPLOW(step1[3] + step1[28], 8);
-  output[4] = WRAPLOW(step1[4] + step1[27], 8);
-  output[5] = WRAPLOW(step1[5] + step1[26], 8);
-  output[6] = WRAPLOW(step1[6] + step1[25], 8);
-  output[7] = WRAPLOW(step1[7] + step1[24], 8);
-  output[8] = WRAPLOW(step1[8] + step1[23], 8);
-  output[9] = WRAPLOW(step1[9] + step1[22], 8);
-  output[10] = WRAPLOW(step1[10] + step1[21], 8);
-  output[11] = WRAPLOW(step1[11] + step1[20], 8);
-  output[12] = WRAPLOW(step1[12] + step1[19], 8);
-  output[13] = WRAPLOW(step1[13] + step1[18], 8);
-  output[14] = WRAPLOW(step1[14] + step1[17], 8);
-  output[15] = WRAPLOW(step1[15] + step1[16], 8);
-  output[16] = WRAPLOW(step1[15] - step1[16], 8);
-  output[17] = WRAPLOW(step1[14] - step1[17], 8);
-  output[18] = WRAPLOW(step1[13] - step1[18], 8);
-  output[19] = WRAPLOW(step1[12] - step1[19], 8);
-  output[20] = WRAPLOW(step1[11] - step1[20], 8);
-  output[21] = WRAPLOW(step1[10] - step1[21], 8);
-  output[22] = WRAPLOW(step1[9] - step1[22], 8);
-  output[23] = WRAPLOW(step1[8] - step1[23], 8);
-  output[24] = WRAPLOW(step1[7] - step1[24], 8);
-  output[25] = WRAPLOW(step1[6] - step1[25], 8);
-  output[26] = WRAPLOW(step1[5] - step1[26], 8);
-  output[27] = WRAPLOW(step1[4] - step1[27], 8);
-  output[28] = WRAPLOW(step1[3] - step1[28], 8);
-  output[29] = WRAPLOW(step1[2] - step1[29], 8);
-  output[30] = WRAPLOW(step1[1] - step1[30], 8);
-  output[31] = WRAPLOW(step1[0] - step1[31], 8);
+  output[0] = WRAPLOW(step1[0] + step1[31]);
+  output[1] = WRAPLOW(step1[1] + step1[30]);
+  output[2] = WRAPLOW(step1[2] + step1[29]);
+  output[3] = WRAPLOW(step1[3] + step1[28]);
+  output[4] = WRAPLOW(step1[4] + step1[27]);
+  output[5] = WRAPLOW(step1[5] + step1[26]);
+  output[6] = WRAPLOW(step1[6] + step1[25]);
+  output[7] = WRAPLOW(step1[7] + step1[24]);
+  output[8] = WRAPLOW(step1[8] + step1[23]);
+  output[9] = WRAPLOW(step1[9] + step1[22]);
+  output[10] = WRAPLOW(step1[10] + step1[21]);
+  output[11] = WRAPLOW(step1[11] + step1[20]);
+  output[12] = WRAPLOW(step1[12] + step1[19]);
+  output[13] = WRAPLOW(step1[13] + step1[18]);
+  output[14] = WRAPLOW(step1[14] + step1[17]);
+  output[15] = WRAPLOW(step1[15] + step1[16]);
+  output[16] = WRAPLOW(step1[15] - step1[16]);
+  output[17] = WRAPLOW(step1[14] - step1[17]);
+  output[18] = WRAPLOW(step1[13] - step1[18]);
+  output[19] = WRAPLOW(step1[12] - step1[19]);
+  output[20] = WRAPLOW(step1[11] - step1[20]);
+  output[21] = WRAPLOW(step1[10] - step1[21]);
+  output[22] = WRAPLOW(step1[9] - step1[22]);
+  output[23] = WRAPLOW(step1[8] - step1[23]);
+  output[24] = WRAPLOW(step1[7] - step1[24]);
+  output[25] = WRAPLOW(step1[6] - step1[25]);
+  output[26] = WRAPLOW(step1[5] - step1[26]);
+  output[27] = WRAPLOW(step1[4] - step1[27]);
+  output[28] = WRAPLOW(step1[3] - step1[28]);
+  output[29] = WRAPLOW(step1[2] - step1[29]);
+  output[30] = WRAPLOW(step1[1] - step1[30]);
+  output[31] = WRAPLOW(step1[0] - step1[31]);
 }
 
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
@@ -1253,8 +1253,8 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
 
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
@@ -1288,10 +1288,10 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, bd);
-    op[1] = WRAPLOW(b1, bd);
-    op[2] = WRAPLOW(c1, bd);
-    op[3] = WRAPLOW(d1, bd);
+    op[0] = HIGHBD_WRAPLOW(a1, bd);
+    op[1] = HIGHBD_WRAPLOW(b1, bd);
+    op[2] = HIGHBD_WRAPLOW(c1, bd);
+    op[3] = HIGHBD_WRAPLOW(d1, bd);
     ip += 4;
     op += 4;
   }
@@ -1332,8 +1332,8 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, bd);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+  op[0] = HIGHBD_WRAPLOW(a1, bd);
+  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -1359,18 +1359,18 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], bd);
-  output[1] = WRAPLOW(step[1] + step[2], bd);
-  output[2] = WRAPLOW(step[1] - step[2], bd);
-  output[3] = WRAPLOW(step[0] - step[3], bd);
+  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
+  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
 }
 
 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1404,11 +1404,11 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int dest_stride, int bd) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -1430,39 +1430,39 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2 & stage 3 - even half
   vpx_highbd_idct4_c(step1, step1, bd);
 
   // stage 2 - odd half
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   // stage 3 - odd half
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], bd);
-  output[1] = WRAPLOW(step1[1] + step1[6], bd);
-  output[2] = WRAPLOW(step1[2] + step1[5], bd);
-  output[3] = WRAPLOW(step1[3] + step1[4], bd);
-  output[4] = WRAPLOW(step1[3] - step1[4], bd);
-  output[5] = WRAPLOW(step1[2] - step1[5], bd);
-  output[6] = WRAPLOW(step1[1] - step1[6], bd);
-  output[7] = WRAPLOW(step1[0] - step1[7], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
 }
 
 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1496,10 +1496,10 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -1540,10 +1540,10 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
-  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
-  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+  output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
+  output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
+  output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
 }
 
 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1574,14 +1574,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
 
   // stage 2
   s0 = x0;
@@ -1593,14 +1593,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -1608,19 +1608,19 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x4, bd);
-  output[2] = WRAPLOW(x6, bd);
-  output[3] = WRAPLOW(-x2, bd);
-  output[4] = WRAPLOW(x3, bd);
-  output[5] = WRAPLOW(-x7, bd);
-  output[6] = WRAPLOW(x5, bd);
-  output[7] = WRAPLOW(-x1, bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x4, bd);
+  output[2] = HIGHBD_WRAPLOW(x6, bd);
+  output[3] = HIGHBD_WRAPLOW(-x2, bd);
+  output[4] = HIGHBD_WRAPLOW(x3, bd);
+  output[5] = HIGHBD_WRAPLOW(-x7, bd);
+  output[6] = HIGHBD_WRAPLOW(x5, bd);
+  output[7] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1685,23 +1685,23 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -1711,109 +1711,109 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], bd);
-  output[1] = WRAPLOW(step2[1] + step2[14], bd);
-  output[2] = WRAPLOW(step2[2] + step2[13], bd);
-  output[3] = WRAPLOW(step2[3] + step2[12], bd);
-  output[4] = WRAPLOW(step2[4] + step2[11], bd);
-  output[5] = WRAPLOW(step2[5] + step2[10], bd);
-  output[6] = WRAPLOW(step2[6] + step2[9], bd);
-  output[7] = WRAPLOW(step2[7] + step2[8], bd);
-  output[8] = WRAPLOW(step2[7] - step2[8], bd);
-  output[9] = WRAPLOW(step2[6] - step2[9], bd);
-  output[10] = WRAPLOW(step2[5] - step2[10], bd);
-  output[11] = WRAPLOW(step2[4] - step2[11], bd);
-  output[12] = WRAPLOW(step2[3] - step2[12], bd);
-  output[13] = WRAPLOW(step2[2] - step2[13], bd);
-  output[14] = WRAPLOW(step2[1] - step2[14], bd);
-  output[15] = WRAPLOW(step2[0] - step2[15], bd);
+  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 }
 
 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1889,22 +1889,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
-  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
-  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
+  x8  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
+  x9  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
 
   // stage 2
   s0 = x0;
@@ -1924,22 +1924,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, bd);
-  x1 = WRAPLOW(s1 + s5, bd);
-  x2 = WRAPLOW(s2 + s6, bd);
-  x3 = WRAPLOW(s3 + s7, bd);
-  x4 = WRAPLOW(s0 - s4, bd);
-  x5 = WRAPLOW(s1 - s5, bd);
-  x6 = WRAPLOW(s2 - s6, bd);
-  x7 = WRAPLOW(s3 - s7, bd);
-  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
-  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
+  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
+  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
+  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
+  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
+  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
+  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
+  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
+  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
 
   // stage 3
   s0 = x0;
@@ -1959,22 +1959,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
-  x8 = WRAPLOW(s8 + s10, bd);
-  x9 = WRAPLOW(s9 + s11, bd);
-  x10 = WRAPLOW(s8 - s10, bd);
-  x11 = WRAPLOW(s9 - s11, bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
+  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
+  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
+  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -1986,31 +1986,31 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
-
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x8, bd);
-  output[2] = WRAPLOW(x12, bd);
-  output[3] = WRAPLOW(-x4, bd);
-  output[4] = WRAPLOW(x6, bd);
-  output[5] = WRAPLOW(x14, bd);
-  output[6] = WRAPLOW(x10, bd);
-  output[7] = WRAPLOW(x2, bd);
-  output[8] = WRAPLOW(x3, bd);
-  output[9] = WRAPLOW(x11, bd);
-  output[10] = WRAPLOW(x15, bd);
-  output[11] = WRAPLOW(x7, bd);
-  output[12] = WRAPLOW(x5, bd);
-  output[13] = WRAPLOW(-x13, bd);
-  output[14] = WRAPLOW(x9, bd);
-  output[15] = WRAPLOW(-x1, bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
+
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x8, bd);
+  output[2] = HIGHBD_WRAPLOW(x12, bd);
+  output[3] = HIGHBD_WRAPLOW(-x4, bd);
+  output[4] = HIGHBD_WRAPLOW(x6, bd);
+  output[5] = HIGHBD_WRAPLOW(x14, bd);
+  output[6] = HIGHBD_WRAPLOW(x10, bd);
+  output[7] = HIGHBD_WRAPLOW(x2, bd);
+  output[8] = HIGHBD_WRAPLOW(x3, bd);
+  output[9] = HIGHBD_WRAPLOW(x11, bd);
+  output[10] = HIGHBD_WRAPLOW(x15, bd);
+  output[11] = HIGHBD_WRAPLOW(x7, bd);
+  output[12] = HIGHBD_WRAPLOW(x5, bd);
+  output[13] = HIGHBD_WRAPLOW(-x13, bd);
+  output[14] = HIGHBD_WRAPLOW(x9, bd);
+  output[15] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2045,11 +2045,11 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -2084,43 +2084,43 @@ static void highbd_idct32_c(const tran_low_t *input,
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
   step2[0] = step1[0];
@@ -2134,40 +2134,40 @@ static void highbd_idct32_c(const tran_low_t *input,
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
-  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
-  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
-  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
-  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
-  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
-  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
-  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
-  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
-  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
-  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
-  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -2177,42 +2177,42 @@ static void highbd_idct32_c(const tran_low_t *input,
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -2221,87 +2221,87 @@ static void highbd_idct32_c(const tran_low_t *input,
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
-  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
-  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
-  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
-  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
-  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
-  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
-  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
-  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
+
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -2310,62 +2310,62 @@ static void highbd_idct32_c(const tran_low_t *input,
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
-  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
-  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
-  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
-  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
-  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
-  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
-  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
-  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
-  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
+
+  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
-  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
-  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
-  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
-  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
-  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
-  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
-  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
-  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
-  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
-  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
-  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
-  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
-  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -2373,58 +2373,58 @@ static void highbd_idct32_c(const tran_low_t *input,
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], bd);
-  output[1] = WRAPLOW(step1[1] + step1[30], bd);
-  output[2] = WRAPLOW(step1[2] + step1[29], bd);
-  output[3] = WRAPLOW(step1[3] + step1[28], bd);
-  output[4] = WRAPLOW(step1[4] + step1[27], bd);
-  output[5] = WRAPLOW(step1[5] + step1[26], bd);
-  output[6] = WRAPLOW(step1[6] + step1[25], bd);
-  output[7] = WRAPLOW(step1[7] + step1[24], bd);
-  output[8] = WRAPLOW(step1[8] + step1[23], bd);
-  output[9] = WRAPLOW(step1[9] + step1[22], bd);
-  output[10] = WRAPLOW(step1[10] + step1[21], bd);
-  output[11] = WRAPLOW(step1[11] + step1[20], bd);
-  output[12] = WRAPLOW(step1[12] + step1[19], bd);
-  output[13] = WRAPLOW(step1[13] + step1[18], bd);
-  output[14] = WRAPLOW(step1[14] + step1[17], bd);
-  output[15] = WRAPLOW(step1[15] + step1[16], bd);
-  output[16] = WRAPLOW(step1[15] - step1[16], bd);
-  output[17] = WRAPLOW(step1[14] - step1[17], bd);
-  output[18] = WRAPLOW(step1[13] - step1[18], bd);
-  output[19] = WRAPLOW(step1[12] - step1[19], bd);
-  output[20] = WRAPLOW(step1[11] - step1[20], bd);
-  output[21] = WRAPLOW(step1[10] - step1[21], bd);
-  output[22] = WRAPLOW(step1[9] - step1[22], bd);
-  output[23] = WRAPLOW(step1[8] - step1[23], bd);
-  output[24] = WRAPLOW(step1[7] - step1[24], bd);
-  output[25] = WRAPLOW(step1[6] - step1[25], bd);
-  output[26] = WRAPLOW(step1[5] - step1[26], bd);
-  output[27] = WRAPLOW(step1[4] - step1[27], bd);
-  output[28] = WRAPLOW(step1[3] - step1[28], bd);
-  output[29] = WRAPLOW(step1[2] - step1[29], bd);
-  output[30] = WRAPLOW(step1[1] - step1[30], bd);
-  output[31] = WRAPLOW(step1[0] - step1[31], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
 }
 
 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2500,9 +2500,9 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
   int a1;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index 23588139e..c407dd896 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -21,7 +21,7 @@
 extern "C" {
 #endif
 
-static INLINE tran_low_t check_range(tran_high_t input) {
+static INLINE tran_high_t check_range(tran_high_t input) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid VP9 input streams, intermediate stage coefficients should always
   // stay within the range of a signed 16 bit integer. Coefficients can go out
@@ -32,17 +32,17 @@ static INLINE tran_low_t check_range(tran_high_t input) {
   assert(INT16_MIN <= input);
   assert(input <= INT16_MAX);
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return check_range(rv);
+  return (tran_high_t)rv;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_low_t highbd_check_range(tran_high_t input,
-                                            int bd) {
+static INLINE tran_high_t highbd_check_range(tran_high_t input,
+                                             int bd) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid highbitdepth VP9 streams, intermediate stage coefficients will
   // stay within the ranges:
@@ -56,13 +56,12 @@ static INLINE tran_low_t highbd_check_range(tran_high_t input,
   (void) int_min;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   (void) bd;
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
-                                                      int bd) {
+static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return highbd_check_range(rv, bd);
+  return (tran_high_t)rv;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -83,9 +82,20 @@ static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
 // bd of 10 uses trans_low with 18bits, need to remove 14bits
 // bd of 12 uses trans_low with 20bits, need to remove 12bits
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) ((int32_t)(x))
+
+#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+    ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#else   // CONFIG_EMULATE_HARDWARE
+
+#define WRAPLOW(x) ((int32_t)check_range(x))
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+    ((int32_t)highbd_check_range((x), bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_EMULATE_HARDWARE
 
 void idct4_c(const tran_low_t *input, tran_low_t *output);
@@ -107,14 +117,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
 
 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                              int bd) {
-  trans = WRAPLOW(trans, bd);
-  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+  trans = HIGHBD_WRAPLOW(trans, bd);
+  return clip_pixel_highbd(dest + trans, bd);
 }
 #endif
 
 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans, 8);
-  return clip_pixel(WRAPLOW(dest + trans, 8));
+  trans = WRAPLOW(trans);
+  return clip_pixel(dest + trans);
 }
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index f812d4499..0e2b1a850 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -39,7 +39,6 @@ endif
 DSP_SRCS-yes += intrapred.c
 
 ifeq ($(CONFIG_USE_X86INC),yes)
-DSP_SRCS-$(HAVE_MMX) += x86/loopfilter_mmx.asm
 DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
@@ -56,7 +55,6 @@ endif  # CONFIG_VP9_HIGHBITDEPTH
 ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
 DSP_SRCS-yes += add_noise.c
 DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
-DSP_SRCS-$(HAVE_MMX) += x86/add_noise_mmx.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
 endif # CONFIG_POSTPROC
 
@@ -323,8 +321,6 @@ DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
 
-DSP_SRCS-$(HAVE_MMX)    += x86/variance_mmx.c
-DSP_SRCS-$(HAVE_MMX)    += x86/variance_impl_mmx.asm
 DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
 DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_sse2.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d5561173b..414428127 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -55,13 +55,13 @@ if ($opts{arch} eq "x86_64") {
 #
 
 add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vpx_d207_predictor_4x4/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207e_predictor_4x4/;
 
 add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_4x4 neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_4x4/;
@@ -118,7 +118,7 @@ add_proto qw/void vpx_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, c
 specialize qw/vpx_d207e_predictor_8x8/;
 
 add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_8x8 neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_8x8/;
@@ -543,7 +543,7 @@ specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
 
 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_4 neon dspr2 msa/, "$mmx_x86inc";
+specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
@@ -564,7 +564,7 @@ specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
 
 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_4 neon dspr2 msa/, "$mmx_x86inc";
+specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
@@ -1478,25 +1478,25 @@ add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int
   specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance16x16 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance8x8 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1532,22 +1532,22 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, i
   specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 #
 # Specialty Subpixel
 #
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
+  specialize qw/vpx_variance_halfpixvar16x16_h sse2 media/;
 
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
+  specialize qw/vpx_variance_halfpixvar16x16_v sse2 media/;
 
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
+  specialize qw/vpx_variance_halfpixvar16x16_hv sse2 media/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
@@ -1913,7 +1913,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
     add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-    specialize qw/vpx_plane_add_noise mmx sse2 msa/;
+    specialize qw/vpx_plane_add_noise sse2 msa/;
 }
 
 }  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/vpx_dsp/x86/add_noise_mmx.asm b/vpx_dsp/x86/add_noise_mmx.asm
deleted file mode 100644
index 8c2623db4..000000000
--- a/vpx_dsp/x86/add_noise_mmx.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-;
-;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vpx_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-global sym(vpx_plane_add_noise_mmx) PRIVATE
-sym(vpx_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; get the clamps in registers
-    mov     rdx, arg(2) ; blackclamp
-    movq    mm3, [rdx]
-    mov     rdx, arg(3) ; whiteclamp
-    movq    mm4, [rdx]
-    mov     rdx, arg(4) ; bothclamp
-    movq    mm5, [rdx]
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, mm3 ; subtract black clamp
-            paddusb     mm1, mm5 ; add both clamp
-            psubusb     mm1, mm4 ; subtract whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-Blur:
-    times 16 dw 16
-    times  8 dw 64
-    times 16 dw 16
-    times  8 dw  0
-
-rd:
-    times 4 dw 0x40
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index c24d53686..cd6a6ae98 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -11,6 +11,7 @@
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION_RODATA
+pb_1: times 16 db 1
 pw_4:  times 8 dw 4
 pw_8:  times 8 dw 8
 pw_16: times 8 dw 16
@@ -23,6 +24,115 @@ pw2_32:  times 8 dw 16
 
 SECTION .text
 
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+  pavgb               %4, %1, %3
+  pxor                %3, %1
+  pand                %3, [GLOBAL(pb_1)]
+  psubb               %4, %3
+  pavgb               %4, %2
+%endmacro
+
+INIT_XMM sse2
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movq                 m0, [aboveq]
+  DEFINE_ARGS dst, stride, temp
+  psrldq               m1, m0, 1
+  psrldq               m2, m0, 2
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+
+  ; store 4 lines
+  movd   [dstq          ], m3
+  psrlq                m3, 8
+  movd   [dstq+strideq  ], m3
+  lea                dstq, [dstq+strideq*2]
+  psrlq                m3, 8
+  movd   [dstq          ], m3
+  psrlq                m3, 8
+  movd   [dstq+strideq  ], m3
+  psrlq                m0, 56
+  movd              tempq, m0
+  mov    [dstq+strideq+3], tempb
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movu                m1, [aboveq]
+  pslldq              m0, m1, 1
+  psrldq              m2, m1, 1
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+  punpckhbw           m0, m0 ; 7 7
+  punpcklwd           m0, m0 ; 7 7 7 7
+  punpckldq           m0, m0 ; 7 7 7 7 7 7 7 7
+  punpcklqdq          m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7
+
+ ; store 4 lines
+  psrldq                m3, 1
+  movq    [dstq          ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq  ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq*2], m3
+  psrldq                m3, 1
+  movq    [dstq+stride3q ], m3
+  lea                 dstq, [dstq+strideq*4]
+
+  ; store next 4 lines
+  psrldq                m3, 1
+  movq    [dstq          ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq  ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq*2], m3
+  psrldq                m3, 1
+  movq    [dstq+stride3q ], m3
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
+  GET_GOT     goffsetq
+
+  movd                m0, [leftq]                ; abcd [byte]
+  punpcklbw           m4, m0, m0                 ; aabb ccdd
+  punpcklwd           m4, m4                     ; aaaa bbbb cccc dddd
+  psrldq              m4, 12                     ; dddd
+  punpckldq           m0, m4                     ; abcd dddd
+  psrldq              m1, m0, 1                  ; bcdd
+  psrldq              m2, m0, 2                  ; cddd
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3   ; a2bc b2cd c3d d
+  pavgb               m1, m0                     ; ab, bc, cd, d [byte]
+
+  punpcklbw           m1, m3             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+  movd    [dstq        ], m1
+  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
+  movd    [dstq+strideq], m1
+
+  lea               dstq, [dstq+strideq*2]
+  psrlq               m1, 16             ; cd, c3d, d, d
+  movd    [dstq        ], m1
+  movd    [dstq+strideq], m4             ; d, d, d, d
+  RESTORE_GOT
+  RET
+
 INIT_XMM sse2
 cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
diff --git a/vpx_dsp/x86/intrapred_ssse3.asm b/vpx_dsp/x86/intrapred_ssse3.asm
index d061278c7..5e0139fa8 100644
--- a/vpx_dsp/x86/intrapred_ssse3.asm
+++ b/vpx_dsp/x86/intrapred_ssse3.asm
@@ -13,7 +13,6 @@
 SECTION_RODATA
 
 pb_1: times 16 db 1
-sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
@@ -28,77 +27,9 @@ sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
 sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 
 SECTION .text
 
-INIT_MMX ssse3
-cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
-  GET_GOT     goffsetq
-
-  movq                m0, [aboveq]
-  pshufb              m2, m0, [GLOBAL(sh_b23456777)]
-  pshufb              m1, m0, [GLOBAL(sh_b01234577)]
-  pshufb              m0, [GLOBAL(sh_b12345677)]
-  pavgb               m3, m2, m1
-  pxor                m2, m1
-  pand                m2, [GLOBAL(pb_1)]
-  psubb               m3, m2
-  pavgb               m0, m3
-
-  ; store 4 lines
-  movd    [dstq        ], m0
-  psrlq               m0, 8
-  movd    [dstq+strideq], m0
-  lea               dstq, [dstq+strideq*2]
-  psrlq               m0, 8
-  movd    [dstq        ], m0
-  psrlq               m0, 8
-  movd    [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_MMX ssse3
-cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
-  GET_GOT     goffsetq
-
-  movq                m0, [aboveq]
-  mova                m1, [GLOBAL(sh_b12345677)]
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-  pshufb              m2, m0, [GLOBAL(sh_b23456777)]
-  pavgb               m3, m2, m0
-  pxor                m2, m0
-  pshufb              m0, m1
-  pand                m2, [GLOBAL(pb_1)]
-  psubb               m3, m2
-  pavgb               m0, m3
-
-  ; store 4 lines
-  movq  [dstq          ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq  ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq*2], m0
-  pshufb              m0, m1
-  movq  [dstq+stride3q ], m0
-  pshufb              m0, m1
-  lea               dstq, [dstq+strideq*4]
-
-  ; store next 4 lines
-  movq  [dstq          ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq  ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq*2], m0
-  pshufb              m0, m1
-  movq  [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
 INIT_XMM ssse3
 cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
   GET_GOT     goffsetq
@@ -715,28 +646,6 @@ cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX ssse3
-cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
-  GET_GOT     goffsetq
-  movd                m0, [leftq]                ; abcd [byte]
-  pshufb              m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
-  pshufb              m3, m0, [GLOBAL(sh_b2333)] ; cddd
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
-  pavgb               m1, m0             ; ab, bc, cd, d [byte]
-
-  punpcklbw           m1, m2             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
-  movd    [dstq        ], m1
-  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
-  movd    [dstq+strideq], m1
-  lea               dstq, [dstq+strideq*2]
-  psrlq               m1, 16             ; cd, c3d, d, d
-  movd    [dstq        ], m1
-  pshufw              m1, m1, q1111      ; d, d, d, d
-  movd    [dstq+strideq], m1
-  RESTORE_GOT
-  RET
-
 INIT_XMM ssse3
 cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
   GET_GOT     goffsetq
diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
deleted file mode 100644
index 45d0ecc0d..000000000
--- a/vpx_dsp/x86/loopfilter_mmx.asm
+++ /dev/null
@@ -1,436 +0,0 @@
-;
-;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-tfe:
-    times 8 db 0xfe
-t80:
-    times 8 db 0x80
-t3:
-    times 8 db 0x03
-t4:
-    times 8 db 0x04
-ones:
-    times 4 dw 0x0001
-
-SECTION .text
-
-%define stkreg rsp
-
-%define t0                  0
-%define t1            t0 + 16
-%define p1            t1 + 16
-%define p0            p1 + 16
-%define q0            p0 + 16
-%define q1            q0 + 16
-%define lstacksize    q1 + 16
-
-%define goffsetq _limitq
-
-;void vpx_lpf_horizontal_4_mmx(unsigned char *src_ptr, int  src_pixel_step,
-;                              const char *blimit, const char *limit,
-;                              const char *thresh);
-INIT_MMX mmx
-cglobal lpf_horizontal_4, 5, 6, 8, 0 - lstacksize, \
-                                s, p, _blimit, _limit, _thresh, s1
-    movq                  m7, [_limitq]
-    GET_GOT         goffsetq
-%if GET_GOT_DEFINED=1
-    add rsp, gprsize                          ; restore stack
-%endif
-    lea                  s1q, [sq + pq]       ; s1q points to row +1
-
-    ; calculate breakout conditions
-    movq                  m2, [s1q + 2 * pq]  ; q3
-    movq                  m1, [ sq + 2 * pq]  ; q2
-    movq                  m6, m1              ; q2
-    psubusb               m1, m2              ; q2-=q3
-    psubusb               m2, m6              ; q3-=q2
-    por                   m1, m2              ; abs(q3-q2)
-    psubusb               m1, m7
-    movq                  m4, [sq + pq]       ; q1
-    movq                  m3, m4              ; q1
-    psubusb               m4, m6              ; q1-=q2
-    psubusb               m6, m3              ; q2-=q1
-    por                   m4, m6              ; abs(q2-q1)
-    psubusb               m4, m7
-    por                   m1, m4
-    movq                  m4, [sq]            ; q0
-    movq                  m0, m4              ; q0
-    psubusb               m4, m3              ; q0-=q1
-    psubusb               m3, m0              ; q1-=q0
-    por                   m4, m3              ; abs(q0-q1)
-    movq       [stkreg + t0], m4              ; save to t0
-    psubusb               m4, m7
-    por                   m1, m4
-    neg                   pq                  ; negate pitch to deal with
-                                              ; above border
-    movq                  m2, [ sq + 4 * pq]  ; p3
-    movq                  m4, [s1q + 4 * pq]  ; p2
-    movq                  m5, m4              ; p2
-    psubusb               m4, m2              ; p2-=p3
-    psubusb               m2, m5              ; p3-=p2
-    por                   m4, m2              ; abs(p3 - p2)
-    psubusb               m4, m7
-    por                   m1, m4
-    movq                  m4, [sq + 2 * pq]   ; p1
-    movq                  m3, m4              ; p1
-    psubusb               m4, m5              ; p1-=p2
-    psubusb               m5, m3              ; p2-=p1
-    por                   m4, m5              ; abs(p2 - p1)
-    psubusb               m4, m7
-    por                   m1, m4
-    movq                  m2, m3              ; p1
-    movq                  m4, [sq + pq]       ; p0
-    movq                  m5, m4              ; p0
-    psubusb               m4, m3              ; p0-=p1
-    psubusb               m3, m5              ; p1-=p0
-    por                   m4, m3              ; abs(p1 - p0)
-    movq       [stkreg + t1], m4              ; save to t1
-    psubusb               m4, m7
-    por                   m1, m4
-    movq                  m3, [s1q]           ; q1
-    movq                  m4, m3              ; q1
-    psubusb               m3, m2              ; q1-=p1
-    psubusb               m2, m4              ; p1-=q1
-    por                   m2, m3              ; abs(p1-q1)
-    pand                  m2, [GLOBAL(tfe)]   ; set lsb of each byte to zero
-    psrlw                 m2, 1               ; abs(p1-q1)/2
-    movq                  m6, m5              ; p0
-    movq                  m3, [sq]            ; q0
-    psubusb               m5, m3              ; p0-=q0
-    psubusb               m3, m6              ; q0-=p0
-    por                   m5, m3              ; abs(p0 - q0)
-    paddusb               m5, m5              ; abs(p0-q0)*2
-    paddusb               m5, m2              ; abs (p0 - q0) * 2 + abs(p1-q1)/2
-    movq                  m7, [_blimitq]            ; blimit
-    psubusb               m5, m7              ; abs (p0 - q0) * 2 +
-                                              ; abs(p1-q1)/2  > blimit
-    por                   m1, m5
-    pxor                  m5, m5
-    pcmpeqb               m1, m5              ; mask m1
-
-    ; calculate high edge variance
-    movq                  m7, [_threshq]
-    movq                  m4, [stkreg + t0]   ; get abs (q1 - q0)
-    psubusb               m4, m7
-    movq                  m3, [stkreg + t1]   ; get abs (p1 - p0)
-    psubusb               m3, m7
-    paddb                 m4, m3              ; abs(q1 - q0) > thresh ||
-                                              ; abs(p1 - p0) > thresh
-    pcmpeqb               m4, m5
-    pcmpeqb               m5, m5
-    movq                  m3, [GLOBAL(t80)]
-    pxor                  m4, m5
-
-    ; start work on filters
-    movq                  m2, [sq + 2 * pq]   ; p1
-    movq                  m7, [s1q]           ; q1
-    pxor                  m2, m3              ; p1 converted to signed values
-    pxor                  m7, m3              ; q1 converted to signed values
-    psubsb                m2, m7              ; p1 - q1
-    pand                  m2, m4              ; high var mask (hvm)(p1 - q1)
-    pxor                  m6, m3              ; p0 converted to signed values
-    pxor                  m0, m3              ; q0 converted to signed values
-    movq                  m3, m0              ; q0
-    psubsb                m0, m6              ; q0 - p0
-    paddsb                m2, m0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-    paddsb                m2, m0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-    paddsb                m2, m0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-    pand                  m1, m2              ; mask filter values we don't
-                                              ; care about
-    movq                  m2, m1
-    paddsb                m1, [GLOBAL(t4)]    ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-    paddsb                m2, [GLOBAL(t3)]    ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-    pxor                  m0, m0
-    pxor                  m5, m5
-    punpcklbw             m0, m2
-    punpckhbw             m5, m2
-    psraw                 m0, 11
-    psraw                 m5, 11
-    packsswb              m0, m5
-    movq                  m2, m0              ; (3* (q0 - p0) + hvm(p1 - q1)
-                                              ; + 3) >> 3;
-    pxor                  m0, m0
-    movq                  m5, m1              ; abcdefgh
-    punpcklbw             m0, m1              ; e0f0g0h0
-    psraw                 m0, 11              ; sign extended shift right by 3
-    pxor                  m1, m1
-    punpckhbw             m1, m5              ; a0b0c0d0
-    psraw                 m1, 11              ; sign extended shift right by 3
-    movq                  m5, m0              ; save results
-
-    packsswb              m0, m1              ; (3* (q0 - p0) + hvm(p1 - q1)
-                                              ; + 4) >>3
-    paddsw                m5, [GLOBAL(ones)]
-    paddsw                m1, [GLOBAL(ones)]
-    psraw                 m5, 1
-    psraw                 m1, 1
-    packsswb              m5, m1              ; (3* (q0 - p0) + hvm(p1 - q1)
-                                              ; + 4) >>4
-    movq                  m1, [GLOBAL(t80)]
-    pandn                 m4, m5              ; high edge variance additive
-    paddsb                m6, m2              ; p0+= p0 add
-    pxor                  m6, m1              ; unoffset
-    movq           [sq + pq], m6              ; write back
-    movq                  m6, [sq + 2 * pq]   ; p1
-    pxor                  m6, m1              ; reoffset
-    paddsb                m6, m4              ; p1+= p1 add
-    pxor                  m6, m1              ; unoffset
-    movq       [sq + 2 * pq], m6              ; write back
-    psubsb                m3, m0              ; q0-= q0 add
-    pxor                  m3, m1              ; unoffset
-    movq                [sq], m3              ; write back
-    psubsb                m7, m4              ; q1-= q1 add
-    pxor                  m7, m1              ; unoffset
-    movq               [s1q], m7              ; write back
-    RET
-
-;void vpx_lpf_vertical_4_mmx(unsigned char *src_ptr, int  src_pixel_step,
-;                            const char *blimit, const char *limit,
-;                            const char *thresh);
-INIT_MMX mmx
-cglobal lpf_vertical_4, 5, 6, 8, 0 - lstacksize, \
-                              s, p, _blimit, _limit, _thresh, s1
-    lea                   sq, [sq + pq * 4 - 4]
-    lea                  s1q, [sq + pq]       ; s1q points to row +1
-    ;transpose
-    movq                  m6, [ sq + 2 * pq]  ; 67 66 65 64 63 62 61 60
-    movq                  m7, m6              ; 77 76 75 74 73 72 71 70
-    punpckhbw             m7, [s1q + 2 * pq]  ; 77 67 76 66 75 65 74 64
-    punpcklbw             m6, [s1q + 2 * pq]  ; 73 63 72 62 71 61 70 60
-    movq                  m4, [sq]            ; 47 46 45 44 43 42 41 40
-    movq                  m5, m4              ; 47 46 45 44 43 42 41 40
-    punpckhbw             m5, [sq + pq]       ; 57 47 56 46 55 45 54 44
-    punpcklbw             m4, [sq + pq]       ; 53 43 52 42 51 41 50 40
-    movq                  m3, m5              ; 57 47 56 46 55 45 54 44
-    punpckhwd             m5, m7              ; 77 67 57 47 76 66 56 46
-    punpcklwd             m3, m7              ; 75 65 55 45 74 64 54 44
-    movq                  m2, m4              ; 53 43 52 42 51 41 50 40
-    punpckhwd             m4, m6              ; 73 63 53 43 72 62 52 42
-    punpcklwd             m2, m6              ; 71 61 51 41 70 60 50 40
-    neg                   pq
-    movq                  m6, [ sq + pq * 2]  ; 27 26 25 24 23 22 21 20
-    movq                  m1, m6              ; 27 26 25 24 23 22 21 20
-    punpckhbw             m6, [ sq + pq    ]  ; 37 27 36 36 35 25 34 24
-    punpcklbw             m1, [ sq + pq    ]  ; 33 23 32 22 31 21 30 20
-    movq                  m7, [ sq + pq * 4]; ; 07 06 05 04 03 02 01 00
-    punpckhbw             m7, [s1q + pq * 4]  ; 17 07 16 06 15 05 14 04
-    movq                  m0, m7              ; 17 07 16 06 15 05 14 04
-    punpckhwd             m7, m6              ; 37 27 17 07 36 26 16 06
-    punpcklwd             m0, m6              ; 35 25 15 05 34 24 14 04
-    movq                  m6, m7              ; 37 27 17 07 36 26 16 06
-    punpckhdq             m7, m5              ; 77 67 57 47 37 27 17 07  = q3
-    punpckldq             m6, m5              ; 76 66 56 46 36 26 16 06  = q2
-    movq                  m5, m6              ; 76 66 56 46 36 26 16 06
-    psubusb               m5, m7              ; q2-q3
-    psubusb               m7, m6              ; q3-q2
-    por                   m7, m5;             ; m7=abs (q3-q2)
-    movq                  m5, m0              ; 35 25 15 05 34 24 14 04
-    punpckhdq             m5, m3              ; 75 65 55 45 35 25 15 05 = q1
-    punpckldq             m0, m3              ; 74 64 54 44 34 24 15 04 = q0
-    movq                  m3, m5              ; 75 65 55 45 35 25 15 05 = q1
-    psubusb               m3, m6              ; q1-q2
-    psubusb               m6, m5              ; q2-q1
-    por                   m6, m3              ; m6=abs(q2-q1)
-
-    movq       [stkreg + q1], m5              ; save q1
-    movq       [stkreg + q0], m0              ; save q0
-
-    movq                  m3, [ sq + pq * 4]  ; 07 06 05 04 03 02 01 00
-    punpcklbw             m3, [s1q + pq * 4]  ; 13 03 12 02 11 01 10 00
-    movq                  m0, m3              ; 13 03 12 02 11 01 10 00
-    punpcklwd             m0, m1              ; 31 21 11 01 30 20 10 00
-    punpckhwd             m3, m1              ; 33 23 13 03 32 22 12 02
-    movq                  m1, m0              ; 31 21 11 01 30 20 10 00
-    punpckldq             m0, m2              ; 70 60 50 40 30 20 10 00  =p3
-    punpckhdq             m1, m2              ; 71 61 51 41 31 21 11 01  =p2
-    movq                  m2, m1              ; 71 61 51 41 31 21 11 01  =p2
-    psubusb               m2, m0              ; p2-p3
-    psubusb               m0, m1              ; p3-p2
-    por                   m0, m2              ; m0=abs(p3-p2)
-    movq                  m2, m3              ; 33 23 13 03 32 22 12 02
-    punpckldq             m2, m4              ; 72 62 52 42 32 22 12 02 = p1
-    punpckhdq             m3, m4              ; 73 63 53 43 33 23 13 03 = p0
-
-    movq       [stkreg + p0], m3              ; save p0
-    movq       [stkreg + p1], m2              ; save p1
-    movq                  m5, m2              ; m5 = p1
-    psubusb               m2, m1              ; p1-p2
-    psubusb               m1, m5              ; p2-p1
-    por                   m1, m2              ; m1=abs(p2-p1)
-    movq                  m4, [_limitq]
-    GET_GOT         goffsetq
-%if GET_GOT_DEFINED=1
-    add rsp, gprsize                          ; restore stack
-%endif
-    psubusb               m7, m4
-    psubusb               m0, m4
-    psubusb               m1, m4
-    psubusb               m6, m4
-    por                   m7, m6
-    por                   m0, m1
-    por                   m0, m7              ; abs(q3-q2) > limit ||
-                                              ; abs(p3-p2) > limit ||
-                                              ; abs(p2-p1) > limit ||
-                                              ; abs(q2-q1) > limit
-    movq                  m1, m5              ; p1
-    movq                  m7, m3              ; m3=m7=p0
-    psubusb               m7, m5              ; p0 - p1
-    psubusb               m5, m3              ; p1 - p0
-    por                   m5, m7              ; abs(p1-p0)
-    movq       [stkreg + t0], m5              ; save abs(p1-p0)
-    psubusb               m5, m4
-    por                   m0, m5              ; m0=mask
-    movq                  m5, [stkreg + q0]   ; m5=q0
-    movq                  m7, [stkreg + q1]   ; m7=q1
-    movq                  m6, m5              ; m6=q0
-    movq                  m2, m7              ; q1
-    psubusb               m5, m7              ; q0-q1
-    psubusb               m7, m6              ; q1-q0
-    por                   m7, m5              ; abs(q1-q0)
-    movq       [stkreg + t1], m7              ; save abs(q1-q0)
-    psubusb               m7, m4
-    por                   m0, m7              ; mask
-    movq                  m5, m2              ; q1
-    psubusb               m5, m1              ; q1-=p1
-    psubusb               m1, m2              ; p1-=q1
-    por                   m5, m1              ; abs(p1-q1)
-    pand                  m5, [GLOBAL(tfe)]   ; set lsb of each byte to zero
-    psrlw                 m5, 1               ; abs(p1-q1)/2
-    movq                  m4, [_blimitq]
-    movq                  m1, m3              ; m1=m3=p0
-    movq                  m7, m6              ; m7=m6=q0
-    psubusb               m1, m7              ; p0-q0
-    psubusb               m7, m3              ; q0-p0
-    por                   m1, m7              ; abs(q0-p0)
-    paddusb               m1, m1              ; abs(q0-p0)*2
-    paddusb               m1, m5              ; abs(p0 - q0)*2 + abs(p1-q1)/2
-    psubusb               m1, m4              ; abs(p0 - q0)*2 + abs(p1-q1)/2
-                                              ; > blimit
-    por                   m1, m0;             ; mask
-    pxor                  m0, m0
-    pcmpeqb               m1, m0
-
-    ; calculate high edge variance
-    movq                  m7, [_threshq]
-    movq                  m4, [stkreg + t0]   ; get abs (q1 - q0)
-    psubusb               m4, m7
-    movq                  m3, [stkreg + t1]   ; get abs (p1 - p0)
-    psubusb               m3, m7
-    por                   m4, m3              ; abs(q1 - q0) > thresh ||
-                                              ; abs(p1 - p0) > thresh
-    pcmpeqb               m4, m0
-    pcmpeqb               m0, m0
-    movq                  m3, [GLOBAL(t80)]
-    pxor                  m4, m0
-
-    ; start work on filters
-    movq                  m2, [stkreg + p1]
-    movq                  m7, [stkreg + q1]
-    movq                  m6, [stkreg + p0]
-    movq                  m0, [stkreg + q0]
-    pxor                  m2, m3
-    pxor                  m7, m3
-    psubsb                m2, m7              ; p1 - q1
-    pand                  m2, m4              ; high var mask (hvm)(p1 - q1)
-    pxor                  m6, m3
-    pxor                  m0, m3
-    movq                  m3, m0              ; q0
-    psubsb                m0, m6              ; q0 - p0
-    paddsb                m2, m0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-    paddsb                m2, m0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-    paddsb                m2, m0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-    pand                  m1, m2              ; mask filter values we don't
-                                              ; care about
-    movq                  m2, m1
-    paddsb                m1, [GLOBAL(t4)]    ; 3*(q0 - p0) + hvm(p1 - q1) + 4
-    paddsb                m2, [GLOBAL(t3)]    ; 3*(q0 - p0) + hvm(p1 - q1) + 3
-    pxor                  m0, m0
-    pxor                  m5, m5
-    punpcklbw             m0, m2
-    punpckhbw             m5, m2
-    psraw                 m0, 11
-    psraw                 m5, 11
-    packsswb              m0, m5
-    movq                  m2, m0              ; (3*(q0 - p0) + hvm(p1 - q1)
-                                              ; + 3) >> 3;
-    pxor                  m0, m0
-    movq                  m5, m1              ; abcdefgh
-    punpcklbw             m0, m1              ; e0f0g0h0
-    psraw                 m0, 11              ; sign extended shift right by 3
-    pxor                  m1, m1
-    punpckhbw             m1, m5              ; a0b0c0d0
-    psraw                 m1, 11              ; sign extended shift right by 3
-    movq                  m5, m0              ; save results
-    packsswb              m0, m1              ; (3*(q0 - p0) + hvm(p1 - q1)
-                                              ; + 4) >>3
-    paddsw                m5, [GLOBAL(ones)]
-    paddsw                m1, [GLOBAL(ones)]
-    psraw                 m5, 1
-    psraw                 m1, 1
-    packsswb              m5, m1              ; (3* (q0 - p0) + hvm(p1 - q1)
-                                              ; + 4) >>4
-    pandn                 m4, m5              ; high edge variance additive
-    movq                  m5, [GLOBAL(t80)]
-    paddsb                m6, m2              ; p0+= p0 add
-    pxor                  m6, m5              ; unoffset
-    ; m6=p0
-    movq                  m1, [stkreg + p1]
-    pxor                  m1, m5              ; reoffset
-    paddsb                m1, m4              ; p1+= p1 add
-    pxor                  m1, m5              ; unoffset
-    ; m6 = p0 m1 = p1
-    psubsb                m3, m0              ; q0-= q0 add
-    pxor                  m3, m5              ; unoffset
-    ; m3 = q0
-    psubsb                m7, m4              ; q1-= q1 add
-    pxor                  m7, m5              ; unoffset
-    ; m7 = q1
-    ; transpose and write back
-    ; m1 =    72 62 52 42 32 22 12 02
-    ; m6 =    73 63 53 43 33 23 13 03
-    ; m3 =    74 64 54 44 34 24 14 04
-    ; m7 =    75 65 55 45 35 25 15 05
-    movq                  m2, m1              ; 72 62 52 42 32 22 12 02
-    punpcklbw             m2, m6              ; 33 32 23 22 13 12 03 02
-    movq                  m4, m3              ; 74 64 54 44 34 24 14 04
-    punpckhbw             m1, m6              ; 73 72 63 62 53 52 43 42
-    punpcklbw             m4, m7              ; 35 34 25 24 15 14 05 04
-    punpckhbw             m3, m7              ; 75 74 65 64 55 54 45 44
-    movq                  m6, m2              ; 33 32 23 22 13 12 03 02
-    punpcklwd             m2, m4              ; 15 14 13 12 05 04 03 02
-    punpckhwd             m6, m4              ; 35 34 33 32 25 24 23 22
-    movq                  m5, m1              ; 73 72 63 62 53 52 43 42
-    punpcklwd             m1, m3              ; 55 54 53 52 45 44 43 42
-    punpckhwd             m5, m3              ; 75 74 73 72 65 64 63 62
-
-    ; m2 = 15 14 13 12 05 04 03 02
-    ; m6 = 35 34 33 32 25 24 23 22
-    ; m5 = 55 54 53 52 45 44 43 42
-    ; m1 = 75 74 73 72 65 64 63 62
-    movd   [sq + pq * 4 + 2], m2
-    psrlq                 m2, 32
-    movd  [s1q + pq * 4 + 2], m2
-    movd   [sq + pq * 2 + 2], m6
-    psrlq                 m6, 32
-    movd       [sq + pq + 2], m6
-    movd            [sq + 2], m1
-    psrlq                 m1, 32
-    movd           [s1q + 2], m1
-    neg                   pq
-    movd      [s1q + pq + 2], m5
-    psrlq                 m5, 32
-    movd  [s1q + pq * 2 + 2], m5
-    RET
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index e03508a03..39a6ae3a8 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -18,6 +18,213 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
+// filter_mask and hev_mask
+#define FILTER_HEV_MASK do {                                                   \
+  /* (abs(q1 - q0), abs(p1 - p0) */                                            \
+  __m128i flat = abs_diff(q1p1, q0p0);                                         \
+  /* abs(p1 - q1), abs(p0 - q0) */                                             \
+  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                           \
+  __m128i abs_p0q0, abs_p1q1, work;                                            \
+                                                                               \
+  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */          \
+  hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero);  \
+  hev = _mm_cmpgt_epi16(hev, thresh);                                          \
+  hev = _mm_packs_epi16(hev, hev);                                             \
+                                                                               \
+  /* const int8_t mask = filter_mask(*limit, *blimit, */                       \
+  /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */       \
+  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0);  /* abs(p0 - q0) * 2 */\
+  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0);  /* abs(p1 - q1) */\
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                      \
+  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1);  /* abs(p1 - q1) / 2 */      \
+  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                    \
+  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                    \
+  /* abs(p3 - p2), abs(p2 - p1) */                                             \
+  work = abs_diff(p3p2, p2p1);                                                 \
+  flat = _mm_max_epu8(work, flat);                                             \
+  /* abs(q3 - q2), abs(q2 - q1) */                                             \
+  work = abs_diff(q3q2, q2q1);                                                 \
+  flat = _mm_max_epu8(work, flat);                                             \
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                          \
+  mask = _mm_unpacklo_epi64(mask, flat);                                       \
+  mask = _mm_subs_epu8(mask, limit);                                           \
+  mask = _mm_cmpeq_epi8(mask, zero);                                           \
+  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                         \
+} while (0)
+
+#define FILTER4 do {                                                           \
+  const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3,                    \
+                                    4, 4, 4, 4, 4, 4, 4, 4);                   \
+  const __m128i t80 = _mm_set1_epi8(0x80);                                     \
+  __m128i filter, filter2filter1, work;                                        \
+                                                                               \
+  ps1ps0 = _mm_xor_si128(p1p0, t80);  /* ^ 0x80 */                             \
+  qs1qs0 = _mm_xor_si128(q1q0, t80);                                           \
+                                                                               \
+  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */                    \
+  work = _mm_subs_epi8(ps1ps0, qs1qs0);                                        \
+  filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                        \
+  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */           \
+  filter = _mm_subs_epi8(filter, work);                                        \
+  filter = _mm_subs_epi8(filter, work);                                        \
+  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */               \
+  filter = _mm_and_si128(filter, mask);  /* & mask */                          \
+  filter = _mm_unpacklo_epi64(filter, filter);                                 \
+                                                                               \
+  /* filter1 = signed_char_clamp(filter + 4) >> 3; */                          \
+  /* filter2 = signed_char_clamp(filter + 3) >> 3; */                          \
+  filter2filter1 = _mm_adds_epi8(filter, t3t4);  /* signed_char_clamp */       \
+  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);                  \
+  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);          \
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);  /* >> 3 */             \
+  filter = _mm_srai_epi16(filter, 11);  /* >> 3 */                             \
+  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);                    \
+                                                                               \
+  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                        \
+  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */                       \
+  filter = _mm_unpacklo_epi8(filter, filter);                                  \
+  filter = _mm_srai_epi16(filter, 9);  /* round */                             \
+  filter = _mm_packs_epi16(filter, filter);                                    \
+  filter = _mm_andnot_si128(hev, filter);                                      \
+                                                                               \
+  hev = _mm_unpackhi_epi64(filter2filter1, filter);                            \
+  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);                 \
+                                                                               \
+  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */      \
+  qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                              \
+  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */      \
+  ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                         \
+  qs1qs0 = _mm_xor_si128(qs1qs0, t80);  /* ^ 0x80 */                           \
+  ps1ps0 = _mm_xor_si128(ps1ps0, t80);  /* ^ 0x80 */                           \
+} while (0)
+
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
+                               const uint8_t *_blimit, const uint8_t *_limit,
+                               const uint8_t *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                         _mm_loadl_epi64((const __m128i *)_limit));
+  const __m128i thresh =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+  __m128i mask, hev;
+
+  p3p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
+  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+  FILTER_HEV_MASK;
+  FILTER4;
+
+  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
+  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);  // *op0
+  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);  // *oq0
+  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
+}
+
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
+                             const uint8_t *_blimit, const uint8_t *_limit,
+                             const uint8_t *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                         _mm_loadl_epi64((const __m128i *)_limit));
+  const __m128i thresh =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+  __m128i x0, x1, x2, x3;
+  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+  __m128i mask, hev;
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
+                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
+
+  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
+
+  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
+
+  // Transpose 8x8
+  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
+  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+  x0 = _mm_unpacklo_epi16(x2, x3);
+  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
+  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
+  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
+  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
+
+  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
+  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
+  x2 = _mm_unpackhi_epi16(x2, x3);
+  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
+  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  q1q0 = _mm_unpacklo_epi32(q1q0, x2);
+
+  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
+  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+  FILTER_HEV_MASK;
+  FILTER4;
+
+  // Transpose 8x4 to 4x8
+  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
+  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
+  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
+  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
+  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
+  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
+  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
+  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
+  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
+
+  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+
+  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+}
+
 void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index be359759c..cee4468c1 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -57,8 +57,8 @@ SECTION .text
   paddd                %6, %1
 %endmacro
 
-%macro STORE_AND_RET 0
-%if mmsize == 16
+%macro STORE_AND_RET 1
+%if %1 > 4
   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
   ; We have to sign-extend it before adding the words within the register
@@ -78,16 +78,16 @@ SECTION .text
   movd               [r1], m7           ; store sse
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
-%else ; mmsize == 8
-  pshufw               m4, m6, 0xe
-  pshufw               m3, m7, 0xe
+%else ; 4xh
+  pshuflw              m4, m6, 0xe
+  pshuflw              m3, m7, 0xe
   paddw                m6, m4
   paddd                m7, m3
   pcmpgtw              m5, m6           ; mask for 0 > x
   mov                  r1, ssem         ; r1 = unsigned int *sse
   punpcklwd            m6, m5           ; sign-extend m6 word->dword
   movd               [r1], m7           ; store sse
-  pshufw               m4, m6, 0xe
+  pshuflw              m4, m6, 0xe
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
 %endif
@@ -196,6 +196,12 @@ SECTION .text
   %endif
 %endif
 
+%if %1 == 4
+  %define movx movd
+%else
+  %define movx movh
+%endif
+
   ASSERT               %1 <= 16         ; m6 overflows if w > 16
   pxor                 m6, m6           ; sum
   pxor                 m7, m7           ; sse
@@ -228,6 +234,7 @@ SECTION .text
 %endif
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+
 %if %2 == 0 ; !avg
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
@@ -237,24 +244,37 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
+  movx                 m0, [srcq]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m0, [srcq+src_strideq]
-%else ; mmsize == 8
-  punpckldq            m0, [srcq+src_strideq]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq]
+  punpckldq            m0, m1
 %endif
 %else ; !avg
-  movh                 m2, [srcq+src_strideq]
+  movx                 m2, [srcq+src_strideq]
 %endif
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
+
 %if %2 == 1 ; avg
+%if %1 > 4
   pavgb                m0, [secq]
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -271,7 +291,7 @@ SECTION .text
 %endif
   dec                   block_height
   jg .x_zero_y_zero_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_zero_y_nonzero:
   cmp           y_offsetd, 4
@@ -296,37 +316,41 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m2, [srcq+src_strideq]
+  movx                 m0, [srcq]
+  movx                 m2, [srcq+src_strideq]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m2, [srcq+src_strideq*2]
-%else ; mmsize == 8
-%if %1 == 4
-  movh                 m1, [srcq+src_strideq*2]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq*2]
   punpckldq            m2, m1
-%else
-  punpckldq            m2, [srcq+src_strideq*2]
-%endif
 %endif
-  movh                 m1, [dstq]
-%if mmsize == 16
+  movx                 m1, [dstq]
+%if %1 > 4
   movlhps              m0, m2
-%else ; mmsize == 8
+%else ; 4xh
   punpckldq            m0, m2
 %endif
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m2
   punpcklbw            m1, m5
+%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m3, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m4, [secq]
+  pavgb                m0, m4
+  punpcklbw            m3, m5
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m4, [srcq+src_strideq*2]
-  movh                 m1, [dstq]
+  movx                 m4, [srcq+src_strideq*2]
+  movx                 m1, [dstq]
   pavgb                m0, m2
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -343,7 +367,7 @@ SECTION .text
 %endif
   dec                   block_height
   jg .x_zero_y_half_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
@@ -351,7 +375,7 @@ SECTION .text
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -424,12 +448,12 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m2, [srcq+src_strideq]
-  movh                 m4, [srcq+src_strideq*2]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m0, [srcq]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m4, [srcq+src_strideq*2]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -449,17 +473,27 @@ SECTION .text
   pmullw               m4, filter_y_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -475,7 +509,7 @@ SECTION .text
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonzero:
   cmp           x_offsetd, 4
@@ -503,30 +537,40 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m4, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m4, [srcq+1]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m0, [srcq+src_strideq]
   movhps               m4, [srcq+src_strideq+1]
-%else ; mmsize == 8
-  punpckldq            m0, [srcq+src_strideq]
-  punpckldq            m4, [srcq+src_strideq+1]
-%endif
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq]
+  punpckldq            m0, m1
+  movx                 m2, [srcq+src_strideq+1]
+  punpckldq            m4, m2
+%endif
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m4
   punpcklbw            m3, m5
+%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m1, m5
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m2, [srcq+src_strideq]
-  movh                 m1, [dstq]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m1, [dstq]
   pavgb                m0, m4
-  movh                 m4, [srcq+src_strideq+1]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m4, [srcq+src_strideq+1]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -543,7 +587,7 @@ SECTION .text
 %endif
   dec                   block_height
   jg .x_half_y_zero_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_half_y_nonzero:
   cmp           y_offsetd, 4
@@ -578,53 +622,58 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 .x_half_y_half_loop:
-  movh                 m2, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m2, [srcq]
+  movx                 m3, [srcq+1]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m2, [srcq+src_strideq]
   movhps               m3, [srcq+src_strideq+1]
 %else
-%if %1 == 4
-  movh                 m1, [srcq+src_strideq]
+  movx                 m1, [srcq+src_strideq]
   punpckldq            m2, m1
-  movh                 m1, [srcq+src_strideq+1]
+  movx                 m1, [srcq+src_strideq+1]
   punpckldq            m3, m1
-%else
-  punpckldq            m2, [srcq+src_strideq]
-  punpckldq            m3, [srcq+src_strideq+1]
-%endif
 %endif
   pavgb                m2, m3
-%if mmsize == 16
+%if %1 > 4
   movlhps              m0, m2
   movhlps              m4, m2
-%else ; mmsize == 8
+%else ; 4xh
   punpckldq            m0, m2
-  pshufw               m4, m2, 0xe
+  pshuflw              m4, m2, 0xe
 %endif
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   pavgb                m0, m2
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
+%if %1 > 4
   pavgb                m0, [secq]
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m4, [srcq+src_strideq]
-  movh                 m1, [srcq+src_strideq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m1, [srcq+src_strideq+1]
   pavgb                m2, m3
   pavgb                m4, m1
   pavgb                m0, m2
   pavgb                m2, m4
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
@@ -641,7 +690,7 @@ SECTION .text
 %endif
   dec                   block_height
   jg .x_half_y_half_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
@@ -649,7 +698,7 @@ SECTION .text
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -724,23 +773,23 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 %if notcpuflag(ssse3)
   punpcklbw            m0, m5
 %endif
 .x_half_y_other_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m3, [srcq+src_strideq+1]
   pavgb                m2, m1
   pavgb                m4, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -760,16 +809,26 @@ SECTION .text
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -786,7 +845,7 @@ SECTION .text
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf:
   test          y_offsetd, y_offsetd
@@ -797,7 +856,7 @@ SECTION .text
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -865,14 +924,14 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m2, [srcq+src_strideq]
-  movh                 m4, [srcq+src_strideq+1]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m4, [srcq+src_strideq+1]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_x_a
   pmaddubsw            m2, filter_x_a
@@ -892,17 +951,27 @@ SECTION .text
   pmullw               m4, filter_x_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -918,7 +987,7 @@ SECTION .text
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf_y_nonzero:
   cmp           y_offsetd, 4
@@ -929,7 +998,7 @@ SECTION .text
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1037,8 +1106,8 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
@@ -1054,17 +1123,17 @@ SECTION .text
   add                srcq, src_strideq
   psraw                m0, 4
 .x_other_y_half_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m3, [srcq+src_strideq+1]
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
 %else
@@ -1079,9 +1148,9 @@ SECTION .text
   pmullw               m3, filter_x_b
   paddw                m4, filter_rnd
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m4, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
 %endif
   psraw                m2, 4
   psraw                m4, 4
@@ -1089,10 +1158,20 @@ SECTION .text
   pavgw                m2, m4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
@@ -1110,7 +1189,7 @@ SECTION .text
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf_y_nonhalf:
 %ifdef PIC
@@ -1118,7 +1197,7 @@ SECTION .text
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1261,8 +1340,8 @@ SECTION .text
   INC_SRC_BY_SRC_STRIDE
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
@@ -1283,20 +1362,20 @@ SECTION .text
   INC_SRC_BY_SRC_STRIDE
 
 .x_other_y_other_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
 
   INC_SRC_BY_SRC_STRIDE
-  movh                 m4, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m4, [srcq]
+  movx                 m3, [srcq+1]
 
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movh                 m3, [dstq+dst_strideq]
-  movh                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
   psraw                m2, 4
@@ -1335,9 +1414,9 @@ SECTION .text
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m0, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   psraw                m0, 4
   psraw                m2, 4
   punpcklbw            m3, m5
@@ -1345,10 +1424,20 @@ SECTION .text
 %endif
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
@@ -1366,7 +1455,8 @@ SECTION .text
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+%undef movx
+  STORE_AND_RET %1
 %endmacro
 
 ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
@@ -1375,26 +1465,22 @@ SECTION .text
 ; location in the sse/2 version, rather than duplicating that code in the
 ; binary.
 
-INIT_MMX sse
-SUBPEL_VARIANCE  4
 INIT_XMM sse2
+SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_MMX ssse3
-SUBPEL_VARIANCE  4
 INIT_XMM ssse3
+SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_MMX sse
-SUBPEL_VARIANCE  4, 1
 INIT_XMM sse2
+SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
 
-INIT_MMX ssse3
-SUBPEL_VARIANCE  4, 1
 INIT_XMM ssse3
+SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index 7851a98b1..f8c97117d 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -45,7 +45,7 @@ unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
   int sum;
   variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
                 sse, &sum, vpx_get16x16var_avx2, 16);
-  return *sse - (((unsigned int)sum * sum) >> 8);
+  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
 }
 
 unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm
deleted file mode 100644
index f4de42a24..000000000
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ /dev/null
@@ -1,343 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define mmx_filter_shift            7
-
-;void vpx_filter_block2d_bil4x4_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vpx_filter_block2d_bil4x4_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-
-        mov             rax,            arg(4) ;HFilter             ;
-        mov             rdx,            arg(5) ;VFilter             ;
-
-        mov             rsi,            arg(0) ;ref_ptr              ;
-        mov             rdi,            arg(2) ;src_ptr              ;
-
-        mov             rcx,            4                   ;
-        pxor            mm0,            mm0                 ;
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-%if ABI_IS_32BIT
-        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
-        add             rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm3,            mm5                 ;
-
-        movq            mm5,            mm1                 ;
-        pmullw          mm3,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        movd            mm3,            [rdi]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        paddw           mm6,            mm1                 ;
-
-        pmaddwd         mm1,            mm1                 ;
-        paddd           mm7,            mm1                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(6) ;sum
-        mov             rsi,            arg(7) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_filter_block2d_bil_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
-sym(vpx_filter_block2d_bil_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;
-
-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            mm0,            mm0                 ;
-        movq            mm1,            [rsi]               ;
-
-        movq            mm3,            [rsi+1]             ;
-        movq            mm2,            mm1                 ;
-
-        movq            mm4,            mm3                 ;
-        punpcklbw       mm1,            mm0                 ;
-
-        punpckhbw       mm2,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        pmullw          mm2,            [rax]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        punpckhbw       mm4,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        pmullw          mm4,            [rax+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm2,            mm4                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm2,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-        packuswb        mm5,            mm2                 ;
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        add             rsi,            r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
-        movq            mm1,            [rsi]               ;
-        movq            mm3,            [rsi+1]             ;
-
-        movq            mm2,            mm1                 ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm1,            mm0                 ;
-        punpckhbw       mm2,            mm0                 ;
-
-        pmullw          mm1,            [rax]               ;
-        pmullw          mm2,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        pmullw          mm3,            [rax+8]             ;
-        pmullw          mm4,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            mm5                 ;
-        movq            mm4,            mm5                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        movq            mm5,            mm1                 ;
-        packuswb        mm5,            mm2                 ;
-
-        pmullw          mm3,            [rdx]               ;
-        pmullw          mm4,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        pmullw          mm2,            [rdx+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            [rdi]               ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        psubw           mm2,            mm4                 ;
-
-        paddw           mm6,            mm1                 ;
-        pmaddwd         mm1,            mm1                 ;
-
-        paddw           mm6,            mm2                 ;
-        pmaddwd         mm2,            mm2                 ;
-
-        paddd           mm7,            mm1                 ;
-        paddd           mm7,            mm2                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil_var_mmx_loop       ;
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(7) ;sum
-        mov             rsi,            arg(8) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
-    times 4 dw 64
diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c
deleted file mode 100644
index 636231d1c..000000000
--- a/vpx_dsp/x86/variance_mmx.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-
-#include "vpx_ports/mem.h"
-
-DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
-  { 128, 128, 128, 128,   0,   0,   0,   0 },
-  { 112, 112, 112, 112,  16,  16,  16,  16 },
-  {  96,  96,  96,  96,  32,  32,  32,  32 },
-  {  80,  80,  80,  80,  48,  48,  48,  48 },
-  {  64,  64,  64,  64,  64,  64,  64,  64 },
-  {  48,  48,  48,  48,  80,  80,  80,  80 },
-  {  32,  32,  32,  32,  96,  96,  96,  96 },
-  {  16,  16,  16,  16, 112, 112, 112, 112 }
-};
-
-extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
-                                              int ref_pixels_per_line,
-                                              const unsigned char *src_ptr,
-                                              int src_pixels_per_line,
-                                              const int16_t *HFilter,
-                                              const int16_t *VFilter,
-                                              int *sum,
-                                              unsigned int *sumsquared);
-
-extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
-                                           int ref_pixels_per_line,
-                                           const unsigned char *src_ptr,
-                                           int src_pixels_per_line,
-                                           unsigned int Height,
-                                           const int16_t *HFilter,
-                                           const int16_t *VFilter,
-                                           int *sum,
-                                           unsigned int *sumsquared);
-
-
-uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
-                                       int xoffset, int yoffset,
-                                       const uint8_t *b, int b_stride,
-                                       uint32_t *sse) {
-    int xsum;
-    unsigned int xxsum;
-    vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
-                                      bilinear_filters_mmx[xoffset],
-                                      bilinear_filters_mmx[yoffset],
-                                      &xsum, &xxsum);
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
-                                       int xoffset, int yoffset,
-                                       const uint8_t *b, int b_stride,
-                                       uint32_t *sse) {
-    int xsum;
-    uint32_t xxsum;
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum, &xxsum);
-    *sse = xxsum;
-    return (xxsum - (((uint32_t)xsum * xsum) >> 6));
-}
-
-uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
-                                         int xoffset, int yoffset,
-                                         const uint8_t *b, int b_stride,
-                                         uint32_t *sse) {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum0, &xxsum0);
-
-    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
-                                        int xoffset, int yoffset,
-                                        const uint8_t *b, int b_stride,
-                                        uint32_t *sse) {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum0, &xxsum0);
-
-    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
-}
-
-uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
-                                        int xoffset, int yoffset,
-                                        const uint8_t *b, int b_stride,
-                                        uint32_t *sse) {
-    int xsum;
-    unsigned int xxsum;
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum, &xxsum);
-    *sse = xxsum;
-    return (xxsum - (((uint32_t)xsum * xsum) >> 7));
-}
-
-uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
-                                             const uint8_t *b, int b_stride,
-                                             uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
-}
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 43f4603ca..6987c2e24 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -320,11 +320,11 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                                           int height, unsigned int *sse, \
                                           void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
-  DECL(4, opt2); \
+  DECL(4, opt1); \
   DECL(8, opt1); \
   DECL(16, opt1)
 
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
 DECLS(ssse3, ssse3);
 #undef DECLS
 #undef DECL
@@ -380,10 +380,10 @@ FN(16,  8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
 FN(8,  16,  8, 3, 4, opt1, (int32_t), (int32_t)); \
 FN(8,   8,  8, 3, 3, opt1, (int32_t), (int32_t)); \
 FN(8,   4,  8, 3, 2, opt1, (int32_t), (int32_t)); \
-FN(4,   8,  4, 2, 3, opt2, (int32_t), (int32_t)); \
-FN(4,   4,  4, 2, 2, opt2, (int32_t), (int32_t))
+FN(4,   8,  4, 2, 3, opt1, (int32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt1, (int32_t), (int32_t))
 
-FNS(sse2, sse);
+FNS(sse2, sse2);
 FNS(ssse3, ssse3);
 
 #undef FNS
@@ -401,11 +401,11 @@ int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                             int height, unsigned int *sse, \
                                             void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
-DECL(4, opt2); \
+DECL(4, opt1); \
 DECL(8, opt1); \
 DECL(16, opt1)
 
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
 DECLS(ssse3, ssse3);
 #undef DECL
 #undef DECLS
@@ -466,8 +466,8 @@ FN(16,  8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
 FN(8,  16,  8, 3, 4, opt1, (uint32_t), (int32_t)); \
 FN(8,   8,  8, 3, 3, opt1, (uint32_t), (int32_t)); \
 FN(8,   4,  8, 3, 2, opt1, (uint32_t), (int32_t)); \
-FN(4,   8,  4, 2, 3, opt2, (uint32_t), (int32_t)); \
-FN(4,   4,  4, 2, 2, opt2, (uint32_t), (int32_t))
+FN(4,   8,  4, 2, 3, opt1, (uint32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt1, (uint32_t), (int32_t))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);
diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h
index 1f8f914f1..620df31b2 100644
--- a/vpx_ports/mem_ops.h
+++ b/vpx_ports/mem_ops.h
@@ -89,7 +89,7 @@ static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) {
   unsigned MEM_VALUE_T  val;
   const MAU_T          *mem = (const MAU_T *)vmem;
 
-  val = mem[0] << 24;
+  val = ((unsigned MEM_VALUE_T)mem[0]) << 24;
   val |= mem[1] << 16;
   val |= mem[2] << 8;
   val |= mem[3];
@@ -125,7 +125,7 @@ static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) {
   unsigned MEM_VALUE_T  val;
   const MAU_T          *mem = (const MAU_T *)vmem;
 
-  val = mem[3] << 24;
+  val = ((unsigned MEM_VALUE_T)mem[3]) << 24;
   val |= mem[2] << 16;
   val |= mem[1] << 8;
   val |= mem[0];