From e20ca4fead6e48c2af1a5cff05b97c4b4cf2526c Mon Sep 17 00:00:00 2001
From: Linfeng Zhang <linfengz@google.com>
Date: Fri, 5 Jan 2018 09:57:56 -0800
Subject: Add vp9_highbd_iht4x4_16_add_sse4_1()

BUG=webm:1413

Change-Id: I14930d0af24370a44ab359de5bba5512eef4e29f
---
 vpx_dsp/x86/highbd_idct4x4_add_sse4.c | 26 ++------------------------
 vpx_dsp/x86/highbd_inv_txfm_sse2.h    |  4 ++++
 vpx_dsp/x86/highbd_inv_txfm_sse4.h    | 22 ++++++++++++++++++++++
 3 files changed, 28 insertions(+), 24 deletions(-)

(limited to 'vpx_dsp')

diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
index 38e64f3bc..fe74d272a 100644
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -16,28 +16,6 @@
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/transpose_sse2.h"
 
-static INLINE void highbd_idct4(__m128i *const io) {
-  __m128i temp[2], step[4];
-
-  transpose_32bit_4x4(io, io);
-
-  // stage 1
-  temp[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
-  extend_64bit(temp[0], temp);
-  step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
-  temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
-  extend_64bit(temp[0], temp);
-  step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
-  highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
-                          &step[3]);
-
-  // stage 2
-  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
-  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
-  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
-  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
-}
-
 void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
   __m128i io[4];
@@ -59,8 +37,8 @@ void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
     io[0] = _mm_srai_epi16(io_short[0], 4);
     io[1] = _mm_srai_epi16(io_short[1], 4);
   } else {
-    highbd_idct4(io);
-    highbd_idct4(io);
+    highbd_idct4_sse4_1(io);
+    highbd_idct4_sse4_1(io);
     io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
     io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
   }
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
index e0f749552..c89666b1e 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -19,6 +19,10 @@
 #include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
+// Note: SSE2/SSE4.1 have no 64-bit arithmetic right shift. Coefficients are
+// pre-shifted left by 2 bits so that dct_const_round_shift() becomes a right
+// shift by 2 whole bytes (16 bits).
+
 static INLINE void extend_64bit(const __m128i in,
                                 __m128i *const out /*out[2]*/) {
   out[0] = _mm_unpacklo_epi32(in, in);  // 0, 0, 1, 1
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
index 9c8eef40f..435934f1b 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -84,4 +84,26 @@ static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in,
   *out1 = multiplication_round_shift_sse4_1(temp, c1);
 }
 
+static INLINE void highbd_idct4_sse4_1(__m128i *const io) {
+  __m128i temp[2], step[4];  // temp: extend_64bit() output; step: stage 1
+  // In-place pass over io[0..3]; the 4x4 caller invokes this twice in a row.
+  transpose_32bit_4x4(io, io);
+  // extend_64bit() takes its input by value, so temp[0] may alias its output.
+  // stage 1
+  temp[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
+  extend_64bit(temp[0], temp);
+  step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+  temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
+  extend_64bit(temp[0], temp);
+  step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+  highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+                          &step[3]);
+  // Combine the stage-1 results by add/subtract and write them back to io.
+  // stage 2
+  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
+  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
+  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
+  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
+}
+
 #endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
-- 
cgit v1.2.3