11 files changed, 212 insertions, 235 deletions
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index ce0431860..8d115fad3 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -273,7 +273,7 @@ void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
 }
 
 void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
-  vp9_short_fht16x16_c(in, out, stride, tx_type);
+  vp9_fht16x16_c(in, out, stride, tx_type);
 }
 
 class Trans16x16TestBase {
@@ -507,10 +507,10 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
@@ -521,9 +521,9 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
 #endif
 }  // namespace
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 5db5f5cae..dc6668759 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -45,7 +45,7 @@ void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
 }
 
 void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
-  vp9_short_fht4x4_c(in, out, stride, tx_type);
+  vp9_fht4x4_c(in, out, stride, tx_type);
 }
 
 class Trans4x4TestBase {
@@ -281,10 +281,10 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
-        make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
-        make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
-        make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
@@ -295,10 +295,10 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
-        make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
-        make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
-        make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
 #endif
 
 }  // namespace
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index beef98055..98aabe6a2 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -44,7 +44,7 @@ void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
 }
 
 void fht8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
-  vp9_short_fht8x8_c(in, out, stride, tx_type);
+  vp9_fht8x8_c(in, out, stride, tx_type);
 }
 
 class FwdTrans8x8TestBase {
@@ -308,10 +308,10 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
-        make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
-        make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
-        make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
@@ -321,9 +321,9 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
-        make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
-        make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
-        make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
 #endif
 }  // namespace
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 04a40bd58..7bdd11eb0 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -707,14 +707,14 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
 fi
 
 # fdct functions
-prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht4x4 sse2 avx2
+prototype void vp9_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
+specialize vp9_fht4x4 sse2 avx2
 
-prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht8x8 sse2 avx2
+prototype void vp9_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
+specialize vp9_fht8x8 sse2 avx2
 
-prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht16x16 sse2 avx2
+prototype void vp9_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
+specialize vp9_fht16x16 sse2 avx2
 
 prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
 specialize vp9_fwht4x4
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index a840b480a..a9d168cc8 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -18,8 +18,6 @@
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-#include "vp9/encoder/vp9_dct.h"
-
 static INLINE int fdct_round_shift(int input) {
   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
   assert(INT16_MIN <= rv && rv <= INT16_MAX);
@@ -157,32 +155,36 @@ static const transform_2d FHT_4[] = {
   { fadst4, fadst4 }   // ADST_ADST = 3
 };
 
-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
-                        int stride, int tx_type) {
-  int16_t out[4 * 4];
-  int16_t *outptr = &out[0];
-  int i, j;
-  int16_t temp_in[4], temp_out[4];
-  const transform_2d ht = FHT_4[tx_type];
+void vp9_fht4x4_c(const int16_t *input, int16_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct4x4_c(input, output, stride);
+  } else {
+    int16_t out[4 * 4];
+    int16_t *outptr = &out[0];
+    int i, j;
+    int16_t temp_in[4], temp_out[4];
+    const transform_2d ht = FHT_4[tx_type];
 
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = input[j * stride + i] * 16;
-    if (i == 0 && temp_in[0])
-      temp_in[0] += 1;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      outptr[j * 4 + i] = temp_out[j];
-  }
+    // Columns
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = input[j * stride + i] * 16;
+      if (i == 0 && temp_in[0])
+        temp_in[0] += 1;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        outptr[j * 4 + i] = temp_out[j];
+    }
 
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j + i * 4];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      output[j + i * 4] = (temp_out[j] + 1) >> 2;
+    // Rows
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = out[j + i * 4];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (temp_out[j] + 1) >> 2;
+    }
   }
 }
 
@@ -565,30 +567,34 @@ static const transform_2d FHT_8[] = {
   { fadst8, fadst8 }   // ADST_ADST = 3
 };
 
-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
-                        int stride, int tx_type) {
-  int16_t out[64];
-  int16_t *outptr = &out[0];
-  int i, j;
-  int16_t temp_in[8], temp_out[8];
-  const transform_2d ht = FHT_8[tx_type];
-
-  // Columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = input[j * stride + i] * 4;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      outptr[j * 8 + i] = temp_out[j];
-  }
+void vp9_fht8x8_c(const int16_t *input, int16_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct8x8_c(input, output, stride);
+  } else {
+    int16_t out[64];
+    int16_t *outptr = &out[0];
+    int i, j;
+    int16_t temp_in[8], temp_out[8];
+    const transform_2d ht = FHT_8[tx_type];
+
+    // Columns
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        outptr[j * 8 + i] = temp_out[j];
+    }
 
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j + i * 8];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+    // Rows
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j + i * 8];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+    }
   }
 }
 
@@ -958,31 +964,34 @@ static const transform_2d FHT_16[] = {
   { fadst16, fadst16 }   // ADST_ADST = 3
 };
 
-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
-                          int stride, int tx_type) {
-  int16_t out[256];
-  int16_t *outptr = &out[0];
-  int i, j;
-  int16_t temp_in[16], temp_out[16];
-  const transform_2d ht = FHT_16[tx_type];
-
-  // Columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = input[j * stride + i] * 4;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-//      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
+void vp9_fht16x16_c(const int16_t *input, int16_t *output,
+                    int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct16x16_c(input, output, stride);
+  } else {
+    int16_t out[256];
+    int16_t *outptr = &out[0];
+    int i, j;
+    int16_t temp_in[16], temp_out[16];
+    const transform_2d ht = FHT_16[tx_type];
+
+    // Columns
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+    }
 
-  // Rows
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j + i * 16];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      output[j + i * 16] = temp_out[j];
+    // Rows
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j + i * 16];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        output[j + i * 16] = temp_out[j];
+    }
   }
 }
 
@@ -1375,27 +1384,3 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
       out[j + i * 32] = temp_out[j];
   }
 }
-
-void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride) {
-  if (tx_type == DCT_DCT)
-    vp9_fdct4x4(input, output, stride);
-  else
-    vp9_short_fht4x4(input, output, stride, tx_type);
-}
-
-void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride) {
-  if (tx_type == DCT_DCT)
-    vp9_fdct8x8(input, output, stride);
-  else
-    vp9_short_fht8x8(input, output, stride, tx_type);
-}
-
-void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                  int stride) {
-  if (tx_type == DCT_DCT)
-    vp9_fdct16x16(input, output, stride);
-  else
-    vp9_short_fht16x16(input, output, stride, tx_type);
-}
diff --git a/vp9/encoder/vp9_dct.h b/vp9/encoder/vp9_dct.h
deleted file mode 100644
index cf5f001a9..000000000
--- a/vp9/encoder/vp9_dct.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_DCT_H_
-#define VP9_ENCODER_VP9_DCT_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride);
-
-void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride);
-
-void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                  int stride);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_ENCODER_VP9_DCT_H_
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 376a899e0..c7507c13b 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -19,7 +19,6 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-#include "vp9/encoder/vp9_dct.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rdopt.h"
@@ -571,7 +570,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
       if (!x->skip_recode) {
         vp9_subtract_block(16, 16, src_diff, diff_stride,
                            src, p->src.stride, dst, pd->dst.stride);
-        vp9_fht16x16(tx_type, src_diff, coeff, diff_stride);
+        vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                        p->quant, p->quant_shift, qcoeff, dqcoeff,
                        pd->dequant, p->zbin_extra, eob, scan_order->scan,
@@ -591,7 +590,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
       if (!x->skip_recode) {
         vp9_subtract_block(8, 8, src_diff, diff_stride,
                            src, p->src.stride, dst, pd->dst.stride);
-        vp9_fht8x8(tx_type, src_diff, coeff, diff_stride);
+        vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff,
                        pd->dequant, p->zbin_extra, eob, scan_order->scan,
@@ -617,7 +616,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
         vp9_subtract_block(4, 4, src_diff, diff_stride,
                            src, p->src.stride, dst, pd->dst.stride);
         if (tx_type != DCT_DCT)
-          vp9_short_fht4x4(src_diff, coeff, diff_stride, tx_type);
+          vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
         else
           x->fwd_txm4x4(src_diff, coeff, diff_stride);
         vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e5230feb4..7b17b8582 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1064,7 +1064,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         so = &vp9_scan_orders[TX_4X4][tx_type];
 
         if (tx_type != DCT_DCT)
-          vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
+          vp9_fht4x4(src_diff, coeff, 8, tx_type);
         else
           x->fwd_txm4x4(src_diff, coeff, 8);
 
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
index ea031fb07..2b82d9750 100644
--- a/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -244,32 +244,36 @@ void fadst4_avx2(__m128i *in) {
   transpose_4x4_avx2(in);
 }
 
-void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output,
-                           int stride, int tx_type) {
+void vp9_fht4x4_avx2(const int16_t *input, int16_t *output,
+                     int stride, int tx_type) {
   __m128i in[4];
-  load_buffer_4x4_avx2(input, in, stride);
+
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      fdct4_avx2(in);
-      fdct4_avx2(in);
+    case DCT_DCT:
+      vp9_fdct4x4_avx2(input, output, stride);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
+      load_buffer_4x4_avx2(input, in, stride);
       fadst4_avx2(in);
       fdct4_avx2(in);
+      write_buffer_4x4_avx2(output, in);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
+      load_buffer_4x4_avx2(input, in, stride);
       fdct4_avx2(in);
       fadst4_avx2(in);
+      write_buffer_4x4_avx2(output, in);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
+      load_buffer_4x4_avx2(input, in, stride);
       fadst4_avx2(in);
       fadst4_avx2(in);
+      write_buffer_4x4_avx2(output, in);
       break;
     default:
       assert(0);
       break;
   }
-  write_buffer_4x4_avx2(output, in);
 }
 
 void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
@@ -1028,33 +1032,39 @@ void fadst8_avx2(__m128i *in) {
   array_transpose_8x8_avx2(in, in);
 }
 
-void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output,
-                           int stride, int tx_type) {
+void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,
+                     int stride, int tx_type) {
   __m128i in[8];
-  load_buffer_8x8_avx2(input, in, stride);
+
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      fdct8_avx2(in);
-      fdct8_avx2(in);
+    case DCT_DCT:
+      vp9_fdct8x8_avx2(input, output, stride);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
+      load_buffer_8x8_avx2(input, in, stride);
       fadst8_avx2(in);
       fdct8_avx2(in);
+      right_shift_8x8_avx2(in, 1);
+      write_buffer_8x8_avx2(output, in, 8);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
+      load_buffer_8x8_avx2(input, in, stride);
       fdct8_avx2(in);
       fadst8_avx2(in);
+      right_shift_8x8_avx2(in, 1);
+      write_buffer_8x8_avx2(output, in, 8);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
+      load_buffer_8x8_avx2(input, in, stride);
       fadst8_avx2(in);
       fadst8_avx2(in);
+      right_shift_8x8_avx2(in, 1);
+      write_buffer_8x8_avx2(output, in, 8);
       break;
     default:
       assert(0);
       break;
   }
-  right_shift_8x8_avx2(in, 1);
-  write_buffer_8x8_avx2(output, in, 8);
 }
 
 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
@@ -2534,36 +2544,39 @@ void fadst16_avx2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16_avx2(in0, in1);
 }
 
-void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output,
-                             int stride, int tx_type) {
+void vp9_fht16x16_avx2(const int16_t *input, int16_t *output,
+                      int stride, int tx_type) {
   __m128i in0[16], in1[16];
-  load_buffer_16x16_avx2(input, in0, in1, stride);
+
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      fdct16_avx2(in0, in1);
-      right_shift_16x16_avx2(in0, in1);
-      fdct16_avx2(in0, in1);
+    case DCT_DCT:
+      vp9_fdct16x16_avx2(input, output, stride);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
+      load_buffer_16x16_avx2(input, in0, in1, stride);
       fadst16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
       fdct16_avx2(in0, in1);
+      write_buffer_16x16_avx2(output, in0, in1, 16);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
+      load_buffer_16x16_avx2(input, in0, in1, stride);
       fdct16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
       fadst16_avx2(in0, in1);
+      write_buffer_16x16_avx2(output, in0, in1, 16);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
+      load_buffer_16x16_avx2(input, in0, in1, stride);
       fadst16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
       fadst16_avx2(in0, in1);
+      write_buffer_16x16_avx2(output, in0, in1, 16);
       break;
     default:
       assert(0);
       break;
   }
-  write_buffer_16x16_avx2(output, in0, in1, 16);
 }
 
 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index c876cc273..852cf8667 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -242,32 +242,36 @@ void fadst4_sse2(__m128i *in) {
   transpose_4x4(in);
 }
 
-void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
-                           int stride, int tx_type) {
+void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
+                     int stride, int tx_type) {
   __m128i in[4];
-  load_buffer_4x4(input, in, stride);
+
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      fdct4_sse2(in);
-      fdct4_sse2(in);
+    case DCT_DCT:
+      vp9_fdct4x4_sse2(input, output, stride);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
+      load_buffer_4x4(input, in, stride);
       fadst4_sse2(in);
       fdct4_sse2(in);
+      write_buffer_4x4(output, in);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
+      load_buffer_4x4(input, in, stride);
       fdct4_sse2(in);
       fadst4_sse2(in);
+      write_buffer_4x4(output, in);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
+      load_buffer_4x4(input, in, stride);
       fadst4_sse2(in);
       fadst4_sse2(in);
+      write_buffer_4x4(output, in);
       break;
-    default:
-      assert(0);
-      break;
+   default:
+     assert(0);
+     break;
   }
-  write_buffer_4x4(output, in);
 }
 
 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
@@ -1026,33 +1030,39 @@ void fadst8_sse2(__m128i *in) {
   array_transpose_8x8(in, in);
 }
 
-void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
-                           int stride, int tx_type) {
+void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
+                     int stride, int tx_type) {
   __m128i in[8];
-  load_buffer_8x8(input, in, stride);
+
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      fdct8_sse2(in);
-      fdct8_sse2(in);
+    case DCT_DCT:
+      vp9_fdct8x8_sse2(input, output, stride);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
+      load_buffer_8x8(input, in, stride);
       fadst8_sse2(in);
       fdct8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
+      load_buffer_8x8(input, in, stride);
       fdct8_sse2(in);
       fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
+      load_buffer_8x8(input, in, stride);
       fadst8_sse2(in);
       fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
       break;
     default:
       assert(0);
       break;
   }
-  right_shift_8x8(in, 1);
-  write_buffer_8x8(output, in, 8);
 }
 
 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
@@ -2532,36 +2542,39 @@ void fadst16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
 }
 
-void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
-                             int stride, int tx_type) {
+void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
+                       int stride, int tx_type) {
   __m128i in0[16], in1[16];
-  load_buffer_16x16(input, in0, in1, stride);
+
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      fdct16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdct16_sse2(in0, in1);
+    case DCT_DCT:
+      vp9_fdct16x16_sse2(input, output, stride);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
+      load_buffer_16x16(input, in0, in1, stride);
       fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fdct16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
+      load_buffer_16x16(input, in0, in1, stride);
       fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
+      load_buffer_16x16(input, in0, in1, stride);
       fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
       break;
     default:
       assert(0);
       break;
   }
-  write_buffer_16x16(output, in0, in1, 16);
 }
 
 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index c225f54b8..64f9f094c 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -19,7 +19,6 @@ VP9_CX_SRCS-yes += vp9_cx_iface.c
 
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_dct.c
-VP9_CX_SRCS-yes += encoder/vp9_dct.h
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.c