summary | refs | log | tree | commit | diff
path: root/vp9/common
diff options
context:
space:
mode:
author Yunqing Wang <yunqingwang@google.com> 2013-03-27 12:36:08 -0700
committer Yunqing Wang <yunqingwang@google.com> 2013-03-27 12:36:08 -0700
commit c6c0657c60282395987c0e9b0682ac7d7384205a (patch)
tree ff3bfa8008b5a6378ce61b152746ebe1d97c5ac3 /vp9/common
parent 0e91bec4b54ae31cb3130aa78522b98414e00e98 (diff)
download libvpx-c6c0657c60282395987c0e9b0682ac7d7384205a.tar
libvpx-c6c0657c60282395987c0e9b0682ac7d7384205a.tar.gz
libvpx-c6c0657c60282395987c0e9b0682ac7d7384205a.tar.bz2
libvpx-c6c0657c60282395987c0e9b0682ac7d7384205a.zip
Modify idct code to use macro
Small modification of idct code. Change-Id: I5c4e3223944c68e4ccf762f6cf07c990250e4290
Diffstat (limited to 'vp9/common')
-rw-r--r-- vp9/common/x86/vp9_idct_x86.c 542
1 files changed, 159 insertions, 383 deletions
diff --git a/vp9/common/x86/vp9_idct_x86.c b/vp9/common/x86/vp9_idct_x86.c
index c8a3873f7..811ed9899 100644
--- a/vp9/common/x86/vp9_idct_x86.c
+++ b/vp9/common/x86/vp9_idct_x86.c
@@ -298,129 +298,110 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \
}
-#define IDCT8x8_1D \
- /* Stage1 */ \
- { \
- const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
- const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
- const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
- const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
- \
- tmp0 = _mm_madd_epi16(lo_17, stg1_0); \
- tmp1 = _mm_madd_epi16(hi_17, stg1_0); \
- tmp2 = _mm_madd_epi16(lo_17, stg1_1); \
- tmp3 = _mm_madd_epi16(hi_17, stg1_1); \
- tmp4 = _mm_madd_epi16(lo_35, stg1_2); \
- tmp5 = _mm_madd_epi16(hi_35, stg1_2); \
- tmp6 = _mm_madd_epi16(lo_35, stg1_3); \
- tmp7 = _mm_madd_epi16(hi_35, stg1_3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- stp1_4 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_7 = _mm_packs_epi32(tmp2, tmp3); \
- stp1_5 = _mm_packs_epi32(tmp4, tmp5); \
- stp1_6 = _mm_packs_epi32(tmp6, tmp7); \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
- const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
- const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
- const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
- \
- tmp0 = _mm_madd_epi16(lo_04, stg2_0); \
- tmp1 = _mm_madd_epi16(hi_04, stg2_0); \
- tmp2 = _mm_madd_epi16(lo_04, stg2_1); \
- tmp3 = _mm_madd_epi16(hi_04, stg2_1); \
- tmp4 = _mm_madd_epi16(lo_26, stg2_2); \
- tmp5 = _mm_madd_epi16(hi_26, stg2_2); \
- tmp6 = _mm_madd_epi16(lo_26, stg2_3); \
- tmp7 = _mm_madd_epi16(hi_26, stg2_3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- stp2_0 = _mm_packs_epi32(tmp0, tmp1); \
- stp2_1 = _mm_packs_epi32(tmp2, tmp3); \
- stp2_2 = _mm_packs_epi32(tmp4, tmp5); \
- stp2_3 = _mm_packs_epi32(tmp6, tmp7); \
- \
- stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
- tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
- tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- } \
- \
- /* Stage4 */ \
- in0 = _mm_adds_epi16(stp1_0, stp2_7); \
- in1 = _mm_adds_epi16(stp1_1, stp1_6); \
- in2 = _mm_adds_epi16(stp1_2, stp1_5); \
- in3 = _mm_adds_epi16(stp1_3, stp2_4); \
- in4 = _mm_subs_epi16(stp1_3, stp2_4); \
- in5 = _mm_subs_epi16(stp1_2, stp1_5); \
- in6 = _mm_subs_epi16(stp1_1, stp1_6); \
+// Define Macro for multiplying elements by constants and adding them together.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+ cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ tmp4 = _mm_madd_epi16(lo_1, cst2); \
+ tmp5 = _mm_madd_epi16(hi_1, cst2); \
+ tmp6 = _mm_madd_epi16(lo_1, cst3); \
+ tmp7 = _mm_madd_epi16(hi_1, cst3); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp4 = _mm_add_epi32(tmp4, rounding); \
+ tmp5 = _mm_add_epi32(tmp5, rounding); \
+ tmp6 = _mm_add_epi32(tmp6, rounding); \
+ tmp7 = _mm_add_epi32(tmp7, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ res2 = _mm_packs_epi32(tmp4, tmp5); \
+ res3 = _mm_packs_epi32(tmp6, tmp7); \
+ }
+
+#define IDCT8x8_1D \
+ /* Stage1 */ \
+ { \
+ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+ const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+ const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+ const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_4, \
+ stp1_7, stp1_5, stp1_6) \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+ const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+ const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+ const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_0, \
+ stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+ tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+ tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ } \
+ \
+ /* Stage4 */ \
+ in0 = _mm_adds_epi16(stp1_0, stp2_7); \
+ in1 = _mm_adds_epi16(stp1_1, stp1_6); \
+ in2 = _mm_adds_epi16(stp1_2, stp1_5); \
+ in3 = _mm_adds_epi16(stp1_3, stp2_4); \
+ in4 = _mm_subs_epi16(stp1_3, stp2_4); \
+ in5 = _mm_subs_epi16(stp1_2, stp1_5); \
+ in6 = _mm_subs_epi16(stp1_1, stp1_6); \
in7 = _mm_subs_epi16(stp1_0, stp2_7);
void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
@@ -643,9 +624,9 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
_mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
}
-#define IDCT16x16_1D \
- /* Stage2 */ \
- { \
+#define IDCT16x16_1D \
+ /* Stage2 */ \
+ { \
const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
@@ -654,250 +635,110 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
- \
- tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); \
- tmp1 = _mm_madd_epi16(hi_1_15, stg2_0); \
- tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); \
- tmp3 = _mm_madd_epi16(hi_1_15, stg2_1); \
- tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); \
- tmp5 = _mm_madd_epi16(hi_9_7, stg2_2); \
- tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); \
- tmp7 = _mm_madd_epi16(hi_9_7, stg2_3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- stp2_8 = _mm_packs_epi32(tmp0, tmp1); \
- stp2_15 = _mm_packs_epi32(tmp2, tmp3); \
- stp2_9 = _mm_packs_epi32(tmp4, tmp5); \
- stp2_14 = _mm_packs_epi32(tmp6, tmp7); \
- \
- tmp0 = _mm_madd_epi16(lo_5_11, stg2_4); \
- tmp1 = _mm_madd_epi16(hi_5_11, stg2_4); \
- tmp2 = _mm_madd_epi16(lo_5_11, stg2_5); \
- tmp3 = _mm_madd_epi16(hi_5_11, stg2_5); \
- tmp4 = _mm_madd_epi16(lo_13_3, stg2_6); \
- tmp5 = _mm_madd_epi16(hi_13_3, stg2_6); \
- tmp6 = _mm_madd_epi16(lo_13_3, stg2_7); \
- tmp7 = _mm_madd_epi16(hi_13_3, stg2_7); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- stp2_10 = _mm_packs_epi32(tmp0, tmp1); \
- stp2_13 = _mm_packs_epi32(tmp2, tmp3); \
- stp2_11 = _mm_packs_epi32(tmp4, tmp5); \
- stp2_12 = _mm_packs_epi32(tmp6, tmp7); \
- } \
- \
- /* Stage3 */ \
- { \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+ stg2_0, stg2_1, stg2_2, stg2_3, \
+ stp2_8, stp2_15, stp2_9, stp2_14) \
+ \
+ MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+ stg2_4, stg2_5, stg2_6, stg2_7, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
- \
- tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); \
- tmp1 = _mm_madd_epi16(hi_2_14, stg3_0); \
- tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); \
- tmp3 = _mm_madd_epi16(hi_2_14, stg3_1); \
- tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); \
- tmp5 = _mm_madd_epi16(hi_10_6, stg3_2); \
- tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); \
- tmp7 = _mm_madd_epi16(hi_10_6, stg3_3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- stp1_4 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_7 = _mm_packs_epi32(tmp2, tmp3); \
- stp1_5 = _mm_packs_epi32(tmp4, tmp5); \
- stp1_6 = _mm_packs_epi32(tmp6, tmp7); \
- \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+ stg3_0, stg3_1, stg3_2, stg3_3, \
+ stp1_4, stp1_7, stp1_5, stp1_6) \
+ \
stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- \
+ \
stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- } \
- \
- /* Stage4 */ \
- { \
+ } \
+ \
+ /* Stage4 */ \
+ { \
const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
- \
+ \
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); \
- tmp1 = _mm_madd_epi16(hi_0_8, stg4_0); \
- tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); \
- tmp3 = _mm_madd_epi16(hi_0_8, stg4_1); \
- tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); \
- tmp5 = _mm_madd_epi16(hi_4_12, stg4_2); \
- tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); \
- tmp7 = _mm_madd_epi16(hi_4_12, stg4_3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- stp2_0 = _mm_packs_epi32(tmp0, tmp1); \
- stp2_1 = _mm_packs_epi32(tmp2, tmp3); \
- stp2_2 = _mm_packs_epi32(tmp4, tmp5); \
- stp2_3 = _mm_packs_epi32(tmp6, tmp7); \
- \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+ stg4_0, stg4_1, stg4_2, stg4_3, \
+ stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- tmp0 = _mm_madd_epi16(lo_9_14, stg4_4); \
- tmp1 = _mm_madd_epi16(hi_9_14, stg4_4); \
- tmp2 = _mm_madd_epi16(lo_9_14, stg4_5); \
- tmp3 = _mm_madd_epi16(hi_9_14, stg4_5); \
- tmp4 = _mm_madd_epi16(lo_10_13, stg4_6); \
- tmp5 = _mm_madd_epi16(hi_10_13, stg4_6); \
- tmp6 = _mm_madd_epi16(lo_10_13, stg4_7); \
- tmp7 = _mm_madd_epi16(hi_10_13, stg4_7); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- stp2_9 = _mm_packs_epi32(tmp0, tmp1); \
- stp2_14 = _mm_packs_epi32(tmp2, tmp3); \
- stp2_10 = _mm_packs_epi32(tmp4, tmp5); \
- stp2_13 = _mm_packs_epi32(tmp6, tmp7); \
- } \
- \
- /* Stage5 */ \
- { \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+ stg4_4, stg4_5, stg4_6, stg4_7, \
+ stp2_9, stp2_14, stp2_10, stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
+ \
stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
+ \
tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
+ \
tmp0 = _mm_add_epi32(tmp0, rounding); \
tmp1 = _mm_add_epi32(tmp1, rounding); \
tmp2 = _mm_add_epi32(tmp2, rounding); \
tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
+ \
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
+ \
stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
+ \
stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
+ \
stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
+ } \
+ \
+ /* Stage6 */ \
+ { \
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
+ \
stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
@@ -906,38 +747,10 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); \
- tmp1 = _mm_madd_epi16(hi_10_13, stg6_0); \
- tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_10_13, stg4_0); \
- tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); \
- tmp5 = _mm_madd_epi16(hi_11_12, stg6_0); \
- tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); \
- tmp7 = _mm_madd_epi16(hi_11_12, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- stp2_10 = _mm_packs_epi32(tmp0, tmp1); \
- stp2_13 = _mm_packs_epi32(tmp2, tmp3); \
- stp2_11 = _mm_packs_epi32(tmp4, tmp5); \
- stp2_12 = _mm_packs_epi32(tmp6, tmp7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
}
void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
@@ -1507,43 +1320,6 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
}
}
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
- cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- tmp4 = _mm_madd_epi16(lo_1, cst2); \
- tmp5 = _mm_madd_epi16(hi_1, cst2); \
- tmp6 = _mm_madd_epi16(lo_1, cst3); \
- tmp7 = _mm_madd_epi16(hi_1, cst3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- res2 = _mm_packs_epi32(tmp4, tmp5); \
- res3 = _mm_packs_epi32(tmp6, tmp7); \
- }
-
void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
const int half_pitch = pitch >> 1;
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);