From e8b133c79c6cf99102020f194513fb3e2feaa322 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Thu, 30 Jul 2015 18:53:18 -0700 Subject: Factor inverse transform functions into vpx_dsp This commit moves the module inverse transform functions from vp9 to vpx_dsp folder. The hybrid transform wrapper functions stay in the vp9 folder, since it involves codec-specific data structures. Change-Id: Ib066367c953d3d024c73ba65157bbd70a95c9ef8 --- test/idct8x8_test.cc | 3 +- vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c | 61 - .../arm/neon/vp9_idct16x16_1_add_neon_asm.asm | 198 - vp9/common/arm/neon/vp9_idct16x16_add_neon.c | 1317 ------- vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm | 1179 ------ vp9/common/arm/neon/vp9_idct16x16_neon.c | 186 - vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c | 165 - .../arm/neon/vp9_idct32x32_1_add_neon_asm.asm | 144 - vp9/common/arm/neon/vp9_idct32x32_add_neon.c | 719 ---- vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm | 1299 ------- vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c | 50 - vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm | 68 - vp9/common/arm/neon/vp9_idct4x4_add_neon.c | 151 - vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm | 190 - vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c | 64 - vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm | 88 - vp9/common/arm/neon/vp9_idct8x8_add_neon.c | 540 --- vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm | 519 --- vp9/common/vp9_idct.c | 2670 +------------ vp9/common/vp9_idct.h | 83 +- vp9/common/vp9_rtcd_defs.pl | 188 - vp9/common/x86/vp9_idct_intrin_sse2.c | 4045 +------------------ vp9/common/x86/vp9_idct_intrin_sse2.h | 174 - vp9/common/x86/vp9_idct_sse2.asm | 102 - vp9/common/x86/vp9_idct_ssse3_x86_64.asm | 300 -- vp9/encoder/x86/vp9_dct_ssse3.c | 2 +- vp9/vp9_common.mk | 34 - vpx_dsp/arm/idct16x16_1_add_neon.asm | 198 + vpx_dsp/arm/idct16x16_1_add_neon.c | 61 + vpx_dsp/arm/idct16x16_add_neon.asm | 1179 ++++++ vpx_dsp/arm/idct16x16_add_neon.c | 1317 +++++++ vpx_dsp/arm/idct16x16_neon.c | 185 + vpx_dsp/arm/idct32x32_1_add_neon.asm | 144 + vpx_dsp/arm/idct32x32_1_add_neon.c | 165 + vpx_dsp/arm/idct32x32_add_neon.asm | 1299 +++++++ vpx_dsp/arm/idct32x32_add_neon.c | 719 ++++ vpx_dsp/arm/idct4x4_1_add_neon.asm | 68 + vpx_dsp/arm/idct4x4_1_add_neon.c | 50 + vpx_dsp/arm/idct4x4_add_neon.asm | 190 + vpx_dsp/arm/idct4x4_add_neon.c | 151 + vpx_dsp/arm/idct8x8_1_add_neon.asm | 88 + vpx_dsp/arm/idct8x8_1_add_neon.c | 64 + vpx_dsp/arm/idct8x8_add_neon.asm | 519 +++ vpx_dsp/arm/idct8x8_add_neon.c | 540 +++ vpx_dsp/inv_txfm.c | 2476 ++++++++++++ vpx_dsp/inv_txfm.h | 124 + vpx_dsp/vpx_dsp.mk | 37 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 187 + vpx_dsp/x86/inv_txfm_sse2.asm | 102 + vpx_dsp/x86/inv_txfm_sse2.c | 4053 ++++++++++++++++++++ vpx_dsp/x86/inv_txfm_sse2.h | 184 + vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm | 300 ++ 52 files changed, 14509 insertions(+), 14430 deletions(-) delete mode 100644 vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c delete mode 100644 vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm delete mode 100644 vp9/common/arm/neon/vp9_idct16x16_add_neon.c delete mode 100644 vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm delete mode 100644 vp9/common/arm/neon/vp9_idct16x16_neon.c delete mode 100644 vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c delete mode 100644 vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm delete mode 100644 vp9/common/arm/neon/vp9_idct32x32_add_neon.c delete mode 100644 vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm delete mode 100644 vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c delete mode 100644 vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm delete mode 100644 vp9/common/arm/neon/vp9_idct4x4_add_neon.c delete mode 100644 vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm delete mode 100644 vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c delete mode 100644 vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm delete mode 100644 vp9/common/arm/neon/vp9_idct8x8_add_neon.c delete mode 100644 vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm delete mode 100644 vp9/common/x86/vp9_idct_intrin_sse2.h delete mode 100644 vp9/common/x86/vp9_idct_sse2.asm delete mode 100644 vp9/common/x86/vp9_idct_ssse3_x86_64.asm create mode 100644 vpx_dsp/arm/idct16x16_1_add_neon.asm create mode 100644 vpx_dsp/arm/idct16x16_1_add_neon.c create mode 100644 vpx_dsp/arm/idct16x16_add_neon.asm create mode 100644 vpx_dsp/arm/idct16x16_add_neon.c create mode 100644 vpx_dsp/arm/idct16x16_neon.c create mode 100644 vpx_dsp/arm/idct32x32_1_add_neon.asm create mode 100644 vpx_dsp/arm/idct32x32_1_add_neon.c create mode 100644 vpx_dsp/arm/idct32x32_add_neon.asm create mode 100644 vpx_dsp/arm/idct32x32_add_neon.c create mode 100644 vpx_dsp/arm/idct4x4_1_add_neon.asm create mode 100644 vpx_dsp/arm/idct4x4_1_add_neon.c create mode 100644 vpx_dsp/arm/idct4x4_add_neon.asm create mode 100644 vpx_dsp/arm/idct4x4_add_neon.c create mode 100644 vpx_dsp/arm/idct8x8_1_add_neon.asm create mode 100644 vpx_dsp/arm/idct8x8_1_add_neon.c create mode 100644 vpx_dsp/arm/idct8x8_add_neon.asm create mode 100644 vpx_dsp/arm/idct8x8_add_neon.c create mode 100644 vpx_dsp/inv_txfm.c create mode 100644 vpx_dsp/inv_txfm.h create mode 100644 vpx_dsp/x86/inv_txfm_sse2.asm create mode 100644 vpx_dsp/x86/inv_txfm_sse2.c create mode 100644 vpx_dsp/x86/inv_txfm_sse2.h create mode 100644 vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc index f488cb4af..1a1a2344e 100644 --- a/test/idct8x8_test.cc +++ b/test/idct8x8_test.cc @@ -14,8 +14,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vp9_rtcd.h" - +#include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "vpx/vpx_integer.h" diff --git a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c deleted file mode 100644 index 9b14be1c1..000000000 --- a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "vp9/common/vp9_idct.h" -#include "vpx_ports/mem.h" - -void vp9_idct16x16_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - } - return; -} diff --git a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm deleted file mode 100644 index b1fd21bb6..000000000 --- a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm +++ /dev/null @@ -1,198 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp9_idct16x16_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vp9_idct16x16_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asr r0, r0, #6 ; >> 6 - - vdup.s16 q0, r0 ; duplicate a1 - mov r0, #8 - sub r2, #8 - - ; load destination data row0 - row3 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row4 - row7 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row8 - row11 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row12 - row15 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vp9_idct16x16_1_add_neon| - - END diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c deleted file mode 100644 index 545388ae6..000000000 --- a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c +++ /dev/null @@ -1,1317 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "./vpx_config.h" -#include "vpx_dsp/txfm_common.h" - -static INLINE void TRANSPOSE8X8( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), - vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), - vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), - vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), - vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - -void vp9_idct16x16_256_add_neon_pass1( - int16_t *in, - int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 = vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - d30s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16(cospi_24_64); - d31s16 = vdup_n_s16(cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); - q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void vp9_idct16x16_256_add_neon_pass2( - int16_t *src, - int16_t *out, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d12s16 = vdup_n_s16(cospi_30_64); - d13s16 = vdup_n_s16(cospi_2_64); - - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); - - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d30s16 = vdup_n_s16(cospi_14_64); - d31s16 = vdup_n_s16(cospi_18_64); - - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(cospi_22_64); - d31s16 = vdup_n_s16(cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16(cospi_6_64); - d31s16 = vdup_n_s16(cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); - - // stage 4 - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); - } - return; -} - -void vp9_idct16x16_10_add_neon_pass1( - int16_t *in, - int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // stage 3 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - // stage 4 - q1s16 = vdupq_n_s16(cospi_16_64 * 2); - d4s16 = vdup_n_s16(cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q2s16 = vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void vp9_idct16x16_10_add_neon_pass2( - int16_t *src, - int16_t *out, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // stage 3 - q6s16 = vdupq_n_s16(cospi_30_64 * 2); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16(cospi_2_64 * 2); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16(cospi_6_64 * 2); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); - - // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); - out += 4; - vst1_u64((uint64_t *)out, d17u64); - out += 12; - vst1_u64((uint64_t *)out, d18u64); - out += 4; - vst1_u64((uint64_t *)out, d19u64); - out += 12; - vst1_u64((uint64_t *)out, d4u64); - out += 4; - vst1_u64((uint64_t *)out, d5u64); - out += 12; - vst1_u64((uint64_t *)out, d6u64); - out += 4; - vst1_u64((uint64_t *)out, d7u64); - out += 12; - vst1_u64((uint64_t *)out, d8u64); - out += 4; - vst1_u64((uint64_t *)out, d9u64); - out += 12; - vst1_u64((uint64_t *)out, d10u64); - out += 4; - vst1_u64((uint64_t *)out, d11u64); - out += 12; - vst1_u64((uint64_t *)out, d28u64); - out += 4; - vst1_u64((uint64_t *)out, d29u64); - out += 12; - vst1_u64((uint64_t *)out, d30u64); - out += 4; - vst1_u64((uint64_t *)out, d31u64); - return; -} diff --git a/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm b/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm deleted file mode 100644 index a13c0d04b..000000000 --- a/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm +++ /dev/null @@ -1,1179 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp9_idct16x16_256_add_neon_pass1| - EXPORT |vp9_idct16x16_256_add_neon_pass2| - EXPORT |vp9_idct16x16_10_add_neon_pass1| - EXPORT |vp9_idct16x16_10_add_neon_pass2| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vp9_idct16x16_256_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64 = 3196 - mov r3, #0xc00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r12, #0x3e00 - add r12, #0xc5 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r12 ; duplicate cospi_4_64 - - ; preloading to avoid stall - ; generate cospi_12_64 = 13623 - mov r3, #0x3500 - add r3, #0x37 - - ; generate cospi_20_64 = 9102 - mov r12, #0x2300 - add r12, #0x8e - - ; step2[4] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; step2[4] * cospi_4_64 - vmull.s16 q5, d18, d1 - vmull.s16 q6, d19, d1 - - ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 - vmlal.s16 q5, d30, d0 - vmlal.s16 q6, d31, d0 - - vdup.16 d2, r3 ; duplicate cospi_12_64 - vdup.16 d3, r12 ; duplicate cospi_20_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q5, #14 ; >> 14 - vqrshrn.s32 d15, q6, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; generate cospi_24_64 = 6270 - mov r12, #0x1800 - add r12, #0x7e - - ; step2[5] * cospi_12_64 - vmull.s16 q2, d26, d2 - vmull.s16 q3, d27, d2 - - ; step2[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q15, d27, d3 - - ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q2, d22, d3 - vmlsl.s16 q3, d23, d3 - - ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q15, d23, d2 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q15, #14 ; >> 14 - - ; stage 4 - vdup.16 d30, r3 ; cospi_16_64 - - ; step1[0] * cospi_16_64 - vmull.s16 q2, d16, d30 - vmull.s16 q11, d17, d30 - - ; step1[1] * cospi_16_64 - vmull.s16 q0, d24, d30 - vmull.s16 q1, d25, d30 - - ; generate cospi_8_64 = 15137 - mov r3, #0x3b00 - add r3, #0x21 - - vdup.16 d30, r12 ; duplicate cospi_24_64 - vdup.16 d31, r3 ; duplicate cospi_8_64 - - ; temp1 = (step1[0] + step1[1]) * cospi_16_64 - vadd.s32 q3, q2, q0 - vadd.s32 q12, q11, q1 - - ; temp2 = (step1[0] - step1[1]) * cospi_16_64 - vsub.s32 q13, q2, q0 - vsub.s32 q1, q11, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d16, q3, #14 ; >> 14 - vqrshrn.s32 d17, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d18, q13, #14 ; >> 14 - vqrshrn.s32 d19, q1, #14 ; >> 14 - - ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - ; step1[2] * cospi_8_64 - vmull.s16 q0, d20, d31 - vmull.s16 q1, d21, d31 - - ; step1[2] * cospi_24_64 - vmull.s16 q12, d20, d30 - vmull.s16 q13, d21, d30 - - ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q0, d28, d30 - vmlal.s16 q1, d29, d30 - - ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q12, d28, d31 - vmlsl.s16 q13, d29, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d22, q0, #14 ; >> 14 - vqrshrn.s32 d23, q1, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d20, q12, #14 ; >> 14 - vqrshrn.s32 d21, q13, #14 ; >> 14 - - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; - vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; stage 5 - vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; - vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; - vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; - vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; - - vdup.16 d16, r3; ; duplicate cospi_16_64 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q6, q9, q11 - vsub.s32 q13, q10, q12 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q6, #14 ; >> 14 - vqrshrn.s32 d11, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d16}, [r1], r2 - vst1.64 {d17}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |vp9_idct16x16_256_add_neon_pass1| - -;void vp9_idct16x16_256_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vp9_idct16x16_256_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate cospi_30_64 = 1606 - mov r3, #0x0600 - add r3, #0x46 - - ; generate cospi_2_64 = 16305 - mov r12, #0x3f00 - add r12, #0xb1 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d12, r3 ; duplicate cospi_30_64 - vdup.16 d13, r12 ; duplicate cospi_2_64 - - ; preloading to avoid stall - ; generate cospi_14_64 = 12665 - mov r3, #0x3100 - add r3, #0x79 - - ; generate cospi_18_64 = 10394 - mov r12, #0x2800 - add r12, #0x9a - - ; step1[8] * cospi_30_64 - vmull.s16 q2, d16, d12 - vmull.s16 q3, d17, d12 - - ; step1[8] * cospi_2_64 - vmull.s16 q1, d16, d13 - vmull.s16 q4, d17, d13 - - ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 - vmlsl.s16 q2, d30, d13 - vmlsl.s16 q3, d31, d13 - - ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 - vmlal.s16 q1, d30, d12 - vmlal.s16 q4, d31, d12 - - vdup.16 d30, r3 ; duplicate cospi_14_64 - vdup.16 d31, r12 ; duplicate cospi_18_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d0, q2, #14 ; >> 14 - vqrshrn.s32 d1, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q1, #14 ; >> 14 - vqrshrn.s32 d15, q4, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_22_64 = 7723 - mov r3, #0x1e00 - add r3, #0x2b - - ; generate cospi_10_64 = 14449 - mov r12, #0x3800 - add r12, #0x71 - - ; step1[9] * cospi_14_64 - vmull.s16 q2, d24, d30 - vmull.s16 q3, d25, d30 - - ; step1[9] * cospi_18_64 - vmull.s16 q4, d24, d31 - vmull.s16 q5, d25, d31 - - ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 - vmlsl.s16 q2, d22, d31 - vmlsl.s16 q3, d23, d31 - - ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 - vmlal.s16 q4, d22, d30 - vmlal.s16 q5, d23, d30 - - vdup.16 d30, r3 ; duplicate cospi_22_64 - vdup.16 d31, r12 ; duplicate cospi_10_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q2, #14 ; >> 14 - vqrshrn.s32 d3, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q4, #14 ; >> 14 - vqrshrn.s32 d13, q5, #14 ; >> 14 - - ; step1[10] * cospi_22_64 - vmull.s16 q11, d20, d30 - vmull.s16 q12, d21, d30 - - ; step1[10] * cospi_10_64 - vmull.s16 q4, d20, d31 - vmull.s16 q5, d21, d31 - - ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 - vmlsl.s16 q11, d26, d31 - vmlsl.s16 q12, d27, d31 - - ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 - vmlal.s16 q4, d26, d30 - vmlal.s16 q5, d27, d30 - - ; preloading to avoid stall - ; generate cospi_6_64 = 15679 - mov r3, #0x3d00 - add r3, #0x3f - - ; generate cospi_26_64 = 4756 - mov r12, #0x1200 - add r12, #0x94 - - vdup.16 d30, r3 ; duplicate cospi_6_64 - vdup.16 d31, r12 ; duplicate cospi_26_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d11, q5, #14 ; >> 14 - vqrshrn.s32 d10, q4, #14 ; >> 14 - - ; step1[11] * cospi_6_64 - vmull.s16 q10, d28, d30 - vmull.s16 q11, d29, d30 - - ; step1[11] * cospi_26_64 - vmull.s16 q12, d28, d31 - vmull.s16 q13, d29, d31 - - ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 - vmlsl.s16 q10, d18, d31 - vmlsl.s16 q11, d19, d31 - - ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 - vmlal.s16 q12, d18, d30 - vmlal.s16 q13, d19, d30 - - vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] - vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q11, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q12, #14 ; >> 14 - vqrshrn.s32 d9, q13, #14 ; >> 14 - - ; stage 3 - vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] - vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] - vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] - vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] - vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] - vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - - ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vdup.16 d30, r12 ; duplicate cospi_8_64 - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d18, d31 - vmull.s16 q3, d19, d31 - - ; step1[14] * cospi_24_64 - vmull.s16 q4, d28, d31 - vmull.s16 q5, d29, d31 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d28, d30 - vmlal.s16 q3, d29, d30 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q4, d18, d30 - vmlsl.s16 q5, d19, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q4, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - vmov.s16 q3, q11 - vmov.s16 q4, q12 - - ; - step1[13] * cospi_8_64 - vmull.s16 q11, d26, d30 - vmull.s16 q12, d27, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d20, d30 - vmull.s16 q9, d21, d30 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlsl.s16 q11, d20, d31 - vmlsl.s16 q12, d21, d31 - - ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d26, d31 - vmlal.s16 q9, d27, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q10, q3, q0 - vadd.s32 q4, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q10, #14 ; >> 14 - vqrshrn.s32 d11, q4, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - cmp r3, #0 ; check if need adding dest data - beq skip_adding_dest - - ldr r7, [sp, #28] ; dest used to save element 0-7 - mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load dest_stride - - ; stage 7 - ; load the data in pass1 - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - - ; store the data output 8,9,10,11,12,13,14,15 - vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q8 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q9 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q2 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q3 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q4 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q5 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q14 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q15 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - b end_idct16x16_pass2 - -skip_adding_dest - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vp9_idct16x16_256_add_neon_pass2| - -;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vp9_idct16x16_10_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64*2 = 6392 - mov r3, #0x1800 - add r3, #0xf8 - - ; generate cospi_4_64*2 = 32138 - mov r12, #0x7d00 - add r12, #0x8a - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q0, r3 ; duplicate cospi_28_64*2 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, - ; double, and return the high 16 bits, effectively giving >> 15. Doubling - ; the constant will change this to >> 14. - ; dct_const_round_shift(step2[4] * cospi_28_64); - vqrdmulh.s16 q4, q9, q0 - - ; preloading to avoid stall - ; generate cospi_16_64*2 = 23170 - mov r3, #0x5a00 - add r3, #0x82 - - ; dct_const_round_shift(step2[4] * cospi_4_64); - vqrdmulh.s16 q7, q9, q1 - - ; stage 4 - vdup.16 q1, r3 ; cospi_16_64*2 - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - vdup.16 d4, r3; ; duplicate cospi_16_64 - - ; dct_const_round_shift(step1[0] * cospi_16_64) - vqrdmulh.s16 q8, q8, q1 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d14, d4 - vmull.s16 q10, d15, d4 - - ; step2[5] * cospi_16_64 - vmull.s16 q12, d9, d4 - vmull.s16 q11, d8, d4 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q15, q10, q12 - vsub.s32 q6, q9, q11 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d11, q15, #14 ; >> 14 - vqrshrn.s32 d10, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; - vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; - vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d4}, [r1], r2 - vst1.64 {d5}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |vp9_idct16x16_10_add_neon_pass1| - -;void vp9_idct16x16_10_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vp9_idct16x16_10_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate 2*cospi_30_64 = 3212 - mov r3, #0xc00 - add r3, #0x8c - - ; generate 2*cospi_2_64 = 32610 - mov r12, #0x7f00 - add r12, #0x62 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q6, r3 ; duplicate 2*cospi_30_64 - - ; dct_const_round_shift(step1[8] * cospi_30_64) - vqrdmulh.s16 q0, q8, q6 - - vdup.16 q6, r12 ; duplicate 2*cospi_2_64 - - ; dct_const_round_shift(step1[8] * cospi_2_64) - vqrdmulh.s16 q7, q8, q6 - - ; preloading to avoid stall - ; generate 2*cospi_26_64 = 9512 - mov r12, #0x2500 - add r12, #0x28 - rsb r12, #0 - vdup.16 q15, r12 ; duplicate -2*cospi_26_64 - - ; generate 2*cospi_6_64 = 31358 - mov r3, #0x7a00 - add r3, #0x7e - vdup.16 q14, r3 ; duplicate 2*cospi_6_64 - - ; dct_const_round_shift(- step1[12] * cospi_26_64) - vqrdmulh.s16 q3, q9, q15 - - ; dct_const_round_shift(step1[12] * cospi_6_64) - vqrdmulh.s16 q4, q9, q14 - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - vdup.16 d30, r12 ; duplicate cospi_8_64 - - ; step1[14] * cospi_24_64 - vmull.s16 q12, d14, d31 - vmull.s16 q5, d15, d31 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d0, d31 - vmull.s16 q11, d1, d31 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q12, d0, d30 - vmlsl.s16 q5, d1, d30 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d14, d30 - vmlal.s16 q11, d15, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q12, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q11, #14 ; >> 14 - - ; - step1[13] * cospi_8_64 - vmull.s16 q10, d8, d30 - vmull.s16 q13, d9, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d6, d30 - vmull.s16 q9, d7, d30 - - ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 - vmlsl.s16 q10, d6, d31 - vmlsl.s16 q13, d7, d31 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d8, d31 - vmlal.s16 q9, d9, d31 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q10, #14 ; >> 14 - vqrshrn.s32 d5, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q0, q3, q0 - vadd.s32 q1, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q0, #14 ; >> 14 - vqrshrn.s32 d11, q1, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct10_16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vp9_idct16x16_10_add_neon_pass2| - END diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c deleted file mode 100644 index f2c4ec451..000000000 --- a/vp9/common/arm/neon/vp9_idct16x16_neon.c +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" - -void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, - int16_t *output, - int output_stride); -void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); -void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, - int16_t *output, - int output_stride); -void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); - -#if HAVE_NEON_ASM -/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ -extern void vp9_push_neon(int64_t *store); -extern void vp9_pop_neon(int64_t *store); -#endif // HAVE_NEON_ASM - -void vp9_idct16x16_256_add_neon(const int16_t *input, - uint8_t *dest, int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16*16] = {0}; - int16_t row_idct_output[16*16] = {0}; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vp9_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vp9_idct16x16_256_add_neon_pass2(input+1, - row_idct_output, - pass1_output, - 0, - dest, - dest_stride); - - /* Parallel idct on the lower 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vp9_idct16x16_256_add_neon_pass2(input+8*16+1, - row_idct_output+8, - pass1_output, - 0, - dest, - dest_stride); - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, - row_idct_output, - pass1_output, - 1, - dest, - dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, - row_idct_output+8, - pass1_output, - 1, - dest+8, - dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - vp9_pop_neon(store_reg); -#endif - - return; -} - -void vp9_idct16x16_10_add_neon(const int16_t *input, - uint8_t *dest, int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16*16] = {0}; - int16_t row_idct_output[16*16] = {0}; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vp9_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vp9_idct16x16_10_add_neon_pass2(input+1, - row_idct_output, - pass1_output, - 0, - dest, - dest_stride); - - /* Skip Parallel idct on the lower 8 rows as they are all 0s */ - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, - row_idct_output, - pass1_output, - 1, - dest, - dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, - row_idct_output+8, - pass1_output, - 1, - dest+8, - dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - vp9_pop_neon(store_reg); -#endif - - return; -} diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c deleted file mode 100644 index 702efa747..000000000 --- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "./vpx_config.h" - -#include "vp9/common/vp9_idct.h" -#include "vpx_ports/mem.h" - -static INLINE void LD_16x8( - uint8_t *d, - int d_stride, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); - return; -} - -static INLINE void ADD_DIFF_16x8( - uint8x16_t qdiffu8, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); - return; -} - -static INLINE void SUB_DIFF_16x8( - uint8x16_t qdiffu8, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); - return; -} - -static INLINE void ST_16x8( - uint8_t *d, - int d_stride, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); - return; -} - -void vp9_idct32x32_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - d += dest_stride8; - } - } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - d += dest_stride8; - } - } - } - return; -} diff --git a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm deleted file mode 100644 index d290d0753..000000000 --- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm +++ /dev/null @@ -1,144 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - EXPORT |vp9_idct32x32_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ;TODO(hkuang): put the following macros in a seperate - ;file so other idct function could also use them. - MACRO - LD_16x8 $src, $stride - vld1.8 {q8}, [$src], $stride - vld1.8 {q9}, [$src], $stride - vld1.8 {q10}, [$src], $stride - vld1.8 {q11}, [$src], $stride - vld1.8 {q12}, [$src], $stride - vld1.8 {q13}, [$src], $stride - vld1.8 {q14}, [$src], $stride - vld1.8 {q15}, [$src], $stride - MEND - - MACRO - ADD_DIFF_16x8 $diff - vqadd.u8 q8, q8, $diff - vqadd.u8 q9, q9, $diff - vqadd.u8 q10, q10, $diff - vqadd.u8 q11, q11, $diff - vqadd.u8 q12, q12, $diff - vqadd.u8 q13, q13, $diff - vqadd.u8 q14, q14, $diff - vqadd.u8 q15, q15, $diff - MEND - - MACRO - SUB_DIFF_16x8 $diff - vqsub.u8 q8, q8, $diff - vqsub.u8 q9, q9, $diff - vqsub.u8 q10, q10, $diff - vqsub.u8 q11, q11, $diff - vqsub.u8 q12, q12, $diff - vqsub.u8 q13, q13, $diff - vqsub.u8 q14, q14, $diff - vqsub.u8 q15, q15, $diff - MEND - - MACRO - ST_16x8 $dst, $stride - vst1.8 {q8}, [$dst], $stride - vst1.8 {q9}, [$dst], $stride - vst1.8 {q10},[$dst], $stride - vst1.8 {q11},[$dst], $stride - vst1.8 {q12},[$dst], $stride - vst1.8 {q13},[$dst], $stride - vst1.8 {q14},[$dst], $stride - vst1.8 {q15},[$dst], $stride - MEND - -;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride - -|vp9_idct32x32_1_add_neon| PROC - push {lr} - pld [r1] - add r3, r1, #16 ; r3 dest + 16 for second loop - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asrs r0, r0, #6 ; >> 6 - bge diff_positive_32_32 - -diff_negative_32_32 - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_negative_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_negative_32_32_loop - pop {pc} - -diff_positive_32_32 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_positive_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_positive_32_32_loop - pop {pc} - - ENDP ; |vp9_idct32x32_1_add_neon| - END diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c b/vp9/common/arm/neon/vp9_idct32x32_add_neon.c deleted file mode 100644 index 3656a7696..000000000 --- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c +++ /dev/null @@ -1,719 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "./vpx_config.h" -#include "vpx_dsp/txfm_common.h" - -#define LOAD_FROM_TRANSPOSED(prev, first, second) \ - q14s16 = vld1q_s16(trans_buf + first * 8); \ - q13s16 = vld1q_s16(trans_buf + second * 8); - -#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ - qA = vld1q_s16(out + first * 32); \ - qB = vld1q_s16(out + second * 32); - -#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ - vst1q_s16(out + first * 32, qA); \ - vst1q_s16(out + second * 32, qB); - -#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ - __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ - q6s16, q7s16, q8s16, q9s16); -static INLINE void __STORE_COMBINE_CENTER_RESULTS( - uint8_t *p1, - uint8_t *p2, - int stride, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16) { - int16x4_t d8s16, d9s16, d10s16, d11s16; - - d8s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d11s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d9s16 = vld1_s16((int16_t *)p1); - d10s16 = vld1_s16((int16_t *)p2); - - q7s16 = vrshrq_n_s16(q7s16, 6); - q8s16 = vrshrq_n_s16(q8s16, 6); - q9s16 = vrshrq_n_s16(q9s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - - q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), - vreinterpret_u8_s16(d9s16))); - q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_s16(d10s16))); - q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_s16(d11s16))); - q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), - vreinterpret_u8_s16(d8s16))); - - d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); - d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); - d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - - vst1_s16((int16_t *)p1, d9s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d10s16); - p2 += stride; - vst1_s16((int16_t *)p1, d8s16); - vst1_s16((int16_t *)p2, d11s16); - return; -} - -#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ - __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ - q4s16, q5s16, q6s16, q7s16); -static INLINE void __STORE_COMBINE_EXTREME_RESULTS( - uint8_t *p1, - uint8_t *p2, - int stride, - int16x8_t q4s16, - int16x8_t q5s16, - int16x8_t q6s16, - int16x8_t q7s16) { - int16x4_t d4s16, d5s16, d6s16, d7s16; - - d4s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d7s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d5s16 = vld1_s16((int16_t *)p1); - d6s16 = vld1_s16((int16_t *)p2); - - q5s16 = vrshrq_n_s16(q5s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - q7s16 = vrshrq_n_s16(q7s16, 6); - q4s16 = vrshrq_n_s16(q4s16, 6); - - q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), - vreinterpret_u8_s16(d5s16))); - q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), - vreinterpret_u8_s16(d6s16))); - q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), - vreinterpret_u8_s16(d7s16))); - q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), - vreinterpret_u8_s16(d4s16))); - - d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); - d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); - - vst1_s16((int16_t *)p1, d5s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d6s16); - p2 += stride; - vst1_s16((int16_t *)p2, d7s16); - vst1_s16((int16_t *)p1, d4s16); - return; -} - -#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ - DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static INLINE void DO_BUTTERFLY( - int16x8_t q14s16, - int16x8_t q13s16, - int16_t first_const, - int16_t second_const, - int16x8_t *qAs16, - int16x8_t *qBs16) { - int16x4_t d30s16, d31s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; - int16x4_t dCs16, dDs16, dAs16, dBs16; - - dCs16 = vget_low_s16(q14s16); - dDs16 = vget_high_s16(q14s16); - dAs16 = vget_low_s16(q13s16); - dBs16 = vget_high_s16(q13s16); - - d30s16 = vdup_n_s16(first_const); - d31s16 = vdup_n_s16(second_const); - - q8s32 = vmull_s16(dCs16, d30s16); - q10s32 = vmull_s16(dAs16, d31s16); - q9s32 = vmull_s16(dDs16, d30s16); - q11s32 = vmull_s16(dBs16, d31s16); - q12s32 = vmull_s16(dCs16, d31s16); - - q8s32 = vsubq_s32(q8s32, q10s32); - q9s32 = vsubq_s32(q9s32, q11s32); - - q10s32 = vmull_s16(dDs16, d31s16); - q11s32 = vmull_s16(dAs16, d30s16); - q15s32 = vmull_s16(dBs16, d30s16); - - q11s32 = vaddq_s32(q12s32, q11s32); - q10s32 = vaddq_s32(q10s32, q15s32); - - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), - vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), - vqrshrn_n_s32(q10s32, 14)); - return; -} - -static INLINE void idct32_transpose_pair( - int16_t *input, - int16_t *t_buf) { - int16_t *in; - int i; - const int stride = 32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - q12s16 = vcombine_s16(d17s16, d25s16); - q13s16 = vcombine_s16(d19s16, d27s16); - q14s16 = vcombine_s16(d21s16, d29s16); - q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16), - vreinterpretq_s32_s16(q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16), - vreinterpretq_s32_s16(q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16), - vreinterpretq_s32_s16(q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - vst1q_s16(t_buf, q0x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q0x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[1]); - t_buf += 8; - } - return; -} - -static INLINE void idct32_bands_end_1st_pass( - int16_t *out, - int16x8_t q2s16, - int16x8_t q3s16, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16, - int16x8_t q10s16, - int16x8_t q11s16, - int16x8_t q12s16, - int16x8_t q13s16, - int16x8_t q14s16, - int16x8_t q15s16) { - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); - STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); - - LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); - STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); - - LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); - STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); - - LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); - STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); - - LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); - STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); - - LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); - STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); - - LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); - STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); - - LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); - STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); - return; -} - -static INLINE void idct32_bands_end_2nd_pass( - int16_t *out, - uint8_t *dest, - int stride, - int16x8_t q2s16, - int16x8_t q3s16, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16, - int16x8_t q10s16, - int16x8_t q11s16, - int16x8_t q12s16, - int16x8_t q13s16, - int16x8_t q14s16, - int16x8_t q15s16) { - uint8_t *r6 = dest + 31 * stride; - uint8_t *r7 = dest/* + 0 * stride*/; - uint8_t *r9 = dest + 15 * stride; - uint8_t *r10 = dest + 16 * stride; - int str2 = stride << 1; - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - - LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - return; -} - -void vp9_idct32x32_1024_add_neon( - int16_t *input, - uint8_t *dest, - int stride) { - int i, idct32_pass_loop; - int16_t trans_buf[32 * 8]; - int16_t pass1[32 * 32]; - int16_t pass2[32 * 32]; - int16_t *out; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - - for (idct32_pass_loop = 0, out = pass1; - idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; - i < 4; i++, - input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); - - // ----------------------------------------- - // BLOCK A: 16-19,28-31 - // ----------------------------------------- - // generate 16,17,30,31 - // part of stage 1 - LOAD_FROM_TRANSPOSED(0, 1, 31) - DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(31, 17, 15) - DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) - // part of stage 2 - q4s16 = vaddq_s16(q0s16, q1s16); - q13s16 = vsubq_s16(q0s16, q1s16); - q6s16 = vaddq_s16(q2s16, q3s16); - q14s16 = vsubq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) - - // generate 18,19,28,29 - // part of stage 1 - LOAD_FROM_TRANSPOSED(15, 9, 23) - DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(23, 25, 7) - DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q3s16, q2s16); - q3s16 = vaddq_s16(q3s16, q2s16); - q14s16 = vsubq_s16(q1s16, q0s16); - q2s16 = vaddq_s16(q1s16, q0s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) - // part of stage 4 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q10s16 = vaddq_s16(q7s16, q1s16); - q15s16 = vaddq_s16(q6s16, q3s16); - q13s16 = vsubq_s16(q5s16, q0s16); - q14s16 = vsubq_s16(q7s16, q1s16); - STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) - STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) - STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) - // part of stage 4 - q13s16 = vsubq_s16(q4s16, q2s16); - q14s16 = vsubq_s16(q6s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) - STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) - - // ----------------------------------------- - // BLOCK B: 20-23,24-27 - // ----------------------------------------- - // generate 20,21,26,27 - // part of stage 1 - LOAD_FROM_TRANSPOSED(7, 5, 27) - DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(27, 21, 11) - DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - - // generate 22,23,24,25 - // part of stage 1 - LOAD_FROM_TRANSPOSED(11, 13, 19) - DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(19, 29, 3) - DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) - // part of stage 2 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) - // part of stage 4 - q10s16 = vaddq_s16(q7s16, q1s16); - q11s16 = vaddq_s16(q5s16, q0s16); - q12s16 = vaddq_s16(q6s16, q2s16); - q15s16 = vaddq_s16(q4s16, q3s16); - // part of stage 6 - LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q11s16); - q9s16 = vaddq_s16(q13s16, q10s16); - q13s16 = vsubq_s16(q13s16, q10s16); - q11s16 = vsubq_s16(q14s16, q11s16); - STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) - LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) - q8s16 = vsubq_s16(q9s16, q12s16); - q10s16 = vaddq_s16(q14s16, q15s16); - q14s16 = vsubq_s16(q14s16, q15s16); - q12s16 = vaddq_s16(q9s16, q12s16); - STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) - q13s16 = q11s16; - q14s16 = q8s16; - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) - // part of stage 4 - q14s16 = vsubq_s16(q5s16, q0s16); - q13s16 = vsubq_s16(q6s16, q2s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); - q14s16 = vsubq_s16(q7s16, q1s16); - q13s16 = vsubq_s16(q4s16, q3s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); - // part of stage 6 - LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q1s16); - q9s16 = vaddq_s16(q13s16, q6s16); - q13s16 = vsubq_s16(q13s16, q6s16); - q1s16 = vsubq_s16(q14s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) - LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) - q14s16 = vsubq_s16(q8s16, q5s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q9s16, q0s16); - q0s16 = vsubq_s16(q9s16, q0s16); - STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) - DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, - &q1s16, &q0s16); - STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) - - // ----------------------------------------- - // BLOCK C: 8-10,11-15 - // ----------------------------------------- - // generate 8,9,14,15 - // part of stage 2 - LOAD_FROM_TRANSPOSED(3, 2, 30) - DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(30, 18, 14) - DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) - // part of stage 3 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 4 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) - - // generate 10,11,12,13 - // part of stage 2 - LOAD_FROM_TRANSPOSED(14, 10, 22) - DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(22, 26, 6) - DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) - // part of stage 3 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 4 - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) - // part of stage 5 - q8s16 = vaddq_s16(q0s16, q5s16); - q9s16 = vaddq_s16(q1s16, q7s16); - q13s16 = vsubq_s16(q1s16, q7s16); - q14s16 = vsubq_s16(q3s16, q4s16); - q10s16 = vaddq_s16(q3s16, q4s16); - q15s16 = vaddq_s16(q2s16, q6s16); - STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) - STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) - // part of stage 6 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) - q13s16 = vsubq_s16(q0s16, q5s16); - q14s16 = vsubq_s16(q2s16, q6s16); - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) - - // ----------------------------------------- - // BLOCK D: 0-3,4-7 - // ----------------------------------------- - // generate 4,5,6,7 - // part of stage 3 - LOAD_FROM_TRANSPOSED(6, 4, 28) - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(28, 20, 12) - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - // part of stage 4 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - - // generate 0,1,2,3 - // part of stage 4 - LOAD_FROM_TRANSPOSED(12, 0, 16) - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(16, 8, 24) - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) - // part of stage 5 - q4s16 = vaddq_s16(q7s16, q6s16); - q7s16 = vsubq_s16(q7s16, q6s16); - q6s16 = vsubq_s16(q5s16, q14s16); - q5s16 = vaddq_s16(q5s16, q14s16); - // part of stage 6 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q3s16); - q10s16 = vaddq_s16(q6s16, q1s16); - q11s16 = vaddq_s16(q7s16, q0s16); - q12s16 = vsubq_s16(q7s16, q0s16); - q13s16 = vsubq_s16(q6s16, q1s16); - q14s16 = vsubq_s16(q5s16, q3s16); - q15s16 = vsubq_s16(q4s16, q2s16); - // part of stage 7 - LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) - q2s16 = vaddq_s16(q8s16, q1s16); - q3s16 = vaddq_s16(q9s16, q0s16); - q4s16 = vsubq_s16(q9s16, q0s16); - q5s16 = vsubq_s16(q8s16, q1s16); - LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - - if (idct32_pass_loop == 0) { - idct32_bands_end_1st_pass(out, - q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); - } else { - idct32_bands_end_2nd_pass(out, dest, stride, - q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); - dest += 8; - } - } - } - return; -} diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm b/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm deleted file mode 100644 index 72e933eee..000000000 --- a/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm +++ /dev/null @@ -1,1299 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -;TODO(cd): adjust these constant to be able to use vqdmulh for faster -; dct_const_round_shift(a * b) within butterfly calculations. -cospi_1_64 EQU 16364 -cospi_2_64 EQU 16305 -cospi_3_64 EQU 16207 -cospi_4_64 EQU 16069 -cospi_5_64 EQU 15893 -cospi_6_64 EQU 15679 -cospi_7_64 EQU 15426 -cospi_8_64 EQU 15137 -cospi_9_64 EQU 14811 -cospi_10_64 EQU 14449 -cospi_11_64 EQU 14053 -cospi_12_64 EQU 13623 -cospi_13_64 EQU 13160 -cospi_14_64 EQU 12665 -cospi_15_64 EQU 12140 -cospi_16_64 EQU 11585 -cospi_17_64 EQU 11003 -cospi_18_64 EQU 10394 -cospi_19_64 EQU 9760 -cospi_20_64 EQU 9102 -cospi_21_64 EQU 8423 -cospi_22_64 EQU 7723 -cospi_23_64 EQU 7005 -cospi_24_64 EQU 6270 -cospi_25_64 EQU 5520 -cospi_26_64 EQU 4756 -cospi_27_64 EQU 3981 -cospi_28_64 EQU 3196 -cospi_29_64 EQU 2404 -cospi_30_64 EQU 1606 -cospi_31_64 EQU 804 - - - EXPORT |vp9_idct32x32_1024_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY - - ; -------------------------------------------------------------------------- - ; Load from transposed_buffer - ; q13 = transposed_buffer[first_offset] - ; q14 = transposed_buffer[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; transposed_buffer must be passed in. use 0 for first use. - MACRO - LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset - ; address calculation with proper stride and loading - add r0, #($first_offset - $prev_offset )*8*2 - vld1.s16 {q14}, [r0] - add r0, #($second_offset - $first_offset)*8*2 - vld1.s16 {q13}, [r0] - ; (used) two registers (q14, q13) - MEND - ; -------------------------------------------------------------------------- - ; Load from output (used as temporary storage) - ; reg1 = output[first_offset] - ; reg2 = output[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and loading - add r1, #($first_offset - $prev_offset )*32*2 - vld1.s16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vld1.s16 {$reg2}, [r1] - ; (used) two registers ($reg1, $reg2) - MEND - ; -------------------------------------------------------------------------- - ; Store into output (sometimes as as temporary storage) - ; output[first_offset] = reg1 - ; output[second_offset] = reg2 - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and storing - add r1, #($first_offset - $prev_offset )*32*2 - vst1.16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vst1.16 {$reg2}, [r1] - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10] - vst1.16 {d11}, [r9] - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10]! - vst1.16 {d11}, [r9]! - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6] - vst1.16 {d4}, [r7] - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6]! - vst1.16 {d4}, [r7]! - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - ; TODO(cd): have special case to re-use constants when they are similar for - ; consecutive butterflies - ; TODO(cd): have special case when both constants are the same, do the - ; additions/subtractions before the multiplies. - ; generate the constants - ; generate scalar constants - mov r8, #$first_constant & 0xFF00 - mov r12, #$second_constant & 0xFF00 - add r8, #$first_constant & 0x00FF - add r12, #$second_constant & 0x00FF - ; generate vector constants - vdup.16 d30, r8 - vdup.16 d31, r12 - ; (used) two for inputs (regA-regD), one for constants (q15) - ; do some multiplications (ordered for maximum latency hiding) - vmull.s16 q8, $regC, d30 - vmull.s16 q10, $regA, d31 - vmull.s16 q9, $regD, d30 - vmull.s16 q11, $regB, d31 - vmull.s16 q12, $regC, d31 - ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/subtractions (to get back two register) - vsub.s32 q8, q8, q10 - vsub.s32 q9, q9, q11 - ; do more multiplications (ordered for maximum latency hiding) - vmull.s16 q10, $regD, d31 - vmull.s16 q11, $regA, d30 - vmull.s16 q15, $regB, d30 - ; (used) six for intermediate (q8-q12, q15) - ; do more addition/subtractions - vadd.s32 q11, q12, q11 - vadd.s32 q10, q10, q15 - ; (used) four for intermediate (q8-q11) - ; dct_const_round_shift - vqrshrn.s32 $reg1, q8, #14 - vqrshrn.s32 $reg2, q9, #14 - vqrshrn.s32 $reg3, q11, #14 - vqrshrn.s32 $reg4, q10, #14 - ; (used) two for results, well four d registers - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - MEND - ; -------------------------------------------------------------------------- - -;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); -; -; r0 int16_t *input, -; r1 uint8_t *dest, -; r2 int dest_stride) -; loop counters -; r4 bands loop counter -; r5 pass loop counter -; r8 transpose loop counter -; combine-add pointers -; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) -; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) -; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) -; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) - -|vp9_idct32x32_1024_add_neon| PROC - ; This function does one pass of idct32x32 transform. - ; - ; This is done by transposing the input and then doing a 1d transform on - ; columns. In the first pass, the transposed columns are the original - ; rows. In the second pass, after the transposition, the colums are the - ; original columns. - ; The 1d transform is done by looping over bands of eight columns (the - ; idct32_bands loop). For each band, the transform input transposition - ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are transposed by pairs (the idct32_transpose_pair loop). - push {r4-r11} - vpush {d8-d15} - ; stack operation - ; internal buffer used to transpose 8 lines into before transforming them - ; int16_t transpose_buffer[32 * 8]; - ; at sp + [4096, 4607] - ; results of the first pass (transpose and transform rows) - ; int16_t pass1[32 * 32]; - ; at sp + [0, 2047] - ; results of the second pass (transpose and transform columns) - ; int16_t pass2[32 * 32]; - ; at sp + [2048, 4095] - sub sp, sp, #512+2048+2048 - - ; r6 = dest + 31 * dest_stride - ; r7 = dest + 0 * dest_stride - ; r9 = dest + 15 * dest_stride - ; r10 = dest + 16 * dest_stride - rsb r6, r2, r2, lsl #5 - rsb r9, r2, r2, lsl #4 - add r10, r1, r2, lsl #4 - mov r7, r1 - add r6, r6, r1 - add r9, r9, r1 - ; r11 = -dest_stride - neg r11, r2 - ; r3 = input - mov r3, r0 - ; parameters for first pass - ; r0 = transpose_buffer[32 * 8] - add r0, sp, #4096 - ; r1 = pass1[32 * 32] - mov r1, sp - - mov r5, #0 ; initialize pass loop counter -idct32_pass_loop - mov r4, #4 ; initialize bands loop counter -idct32_bands_loop - mov r8, #2 ; initialize transpose loop counter -idct32_transpose_pair_loop - ; Load two horizontally consecutive 8x8 16bit data matrices. The first one - ; into q0-q7 and the second one into q8-q15. There is a stride of 64, - ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r3]! - vld1.s16 {q0}, [r3]! - add r3, #32 - vld1.s16 {q9}, [r3]! - vld1.s16 {q1}, [r3]! - add r3, #32 - vld1.s16 {q10}, [r3]! - vld1.s16 {q2}, [r3]! - add r3, #32 - vld1.s16 {q11}, [r3]! - vld1.s16 {q3}, [r3]! - add r3, #32 - vld1.s16 {q12}, [r3]! - vld1.s16 {q4}, [r3]! - add r3, #32 - vld1.s16 {q13}, [r3]! - vld1.s16 {q5}, [r3]! - add r3, #32 - vld1.s16 {q14}, [r3]! - vld1.s16 {q6}, [r3]! - add r3, #32 - vld1.s16 {q15}, [r3]! - vld1.s16 {q7}, [r3]! - - ; Transpose the two 8x8 16bit data matrices. - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vswp d1, d8 - vswp d7, d14 - vswp d5, d12 - vswp d3, d10 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; Store both matrices after each other. There is a stride of 32, which - ; adjusts to nothing because of the post-increments. - vst1.16 {q8}, [r0]! - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! - vst1.16 {q0}, [r0]! - vst1.16 {q1}, [r0]! - vst1.16 {q2}, [r0]! - vst1.16 {q3}, [r0]! - vst1.16 {q4}, [r0]! - vst1.16 {q5}, [r0]! - vst1.16 {q6}, [r0]! - vst1.16 {q7}, [r0]! - - ; increment pointers by adjusted stride (not necessary for r0/out) - ; go back by 7*32 for the seven lines moved fully by read and add - ; go back by 32 for the eigth line only read - ; advance by 16*2 to go the next pair - sub r3, r3, #7*32*2 + 32 - 16*2 - ; transpose pair loop processing - subs r8, r8, #1 - bne idct32_transpose_pair_loop - - ; restore r0/input to its original value - sub r0, r0, #32*8*2 - - ; Instead of doing the transforms stage by stage, it is done by loading - ; some input values and doing as many stages as possible to minimize the - ; storing/loading of intermediate results. To fit within registers, the - ; final coefficients are cut into four blocks: - ; BLOCK A: 16-19,28-31 - ; BLOCK B: 20-23,24-27 - ; BLOCK C: 8-10,11-15 - ; BLOCK D: 0-3,4-7 - ; Blocks A and C are straight calculation through the various stages. In - ; block B, further calculations are performed using the results from - ; block A. In block D, further calculations are performed using the results - ; from block C and then the final calculations are done using results from - ; block A and B which have been combined at the end of block B. - - ; -------------------------------------------------------------------------- - ; BLOCK A: 16-19,28-31 - ; -------------------------------------------------------------------------- - ; generate 16,17,30,31 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; - ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; - ;step1b[16][i] = dct_const_round_shift(temp1); - ;step1b[31][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 0, 1, 31 - DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; - ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; - ;step1b[17][i] = dct_const_round_shift(temp1); - ;step1b[30][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 31, 17, 15 - DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[16] = step1b[16][i] + step1b[17][i]; - ;step2[17] = step1b[16][i] - step1b[17][i]; - ;step2[30] = -step1b[30][i] + step1b[31][i]; - ;step2[31] = step1b[30][i] + step1b[31][i]; - vadd.s16 q4, q0, q1 - vsub.s16 q13, q0, q1 - vadd.s16 q6, q2, q3 - vsub.s16 q14, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; - ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; - ;step3[17] = dct_const_round_shift(temp1); - ;step3[30] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; generate 18,19,28,29 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; - ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; - ;step1b[18][i] = dct_const_round_shift(temp1); - ;step1b[29][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 15, 9, 23 - DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; - ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; - ;step1b[19][i] = dct_const_round_shift(temp1); - ;step1b[28][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 23, 25, 7 - DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[18] = -step1b[18][i] + step1b[19][i]; - ;step2[19] = step1b[18][i] + step1b[19][i]; - ;step2[28] = step1b[28][i] + step1b[29][i]; - ;step2[29] = step1b[28][i] - step1b[29][i]; - vsub.s16 q13, q3, q2 - vadd.s16 q3, q3, q2 - vsub.s16 q14, q1, q0 - vadd.s16 q2, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); - ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); - ;step3[29] = dct_const_round_shift(temp1); - ;step3[18] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 - ; -------------------------------------------------------------------------- - ; combine 16-19,28-31 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[16] = step1b[16][i] + step1b[19][i]; - ;step1[17] = step1b[17][i] + step1b[18][i]; - ;step1[18] = step1b[17][i] - step1b[18][i]; - ;step1[29] = step1b[30][i] - step1b[29][i]; - ;step1[30] = step1b[30][i] + step1b[29][i]; - ;step1[31] = step1b[31][i] + step1b[28][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q0 - vadd.s16 q10, q7, q1 - vadd.s16 q15, q6, q3 - vsub.s16 q13, q5, q0 - vsub.s16 q14, q7, q1 - STORE_IN_OUTPUT 0, 16, 31, q8, q15 - STORE_IN_OUTPUT 31, 17, 30, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; - ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; - ;step2[18] = dct_const_round_shift(temp1); - ;step2[29] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 - STORE_IN_OUTPUT 30, 29, 18, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[19] = step1b[16][i] - step1b[19][i]; - ;step1[28] = step1b[31][i] - step1b[28][i]; - vsub.s16 q13, q4, q2 - vsub.s16 q14, q6, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; - ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; - ;step2[19] = dct_const_round_shift(temp1); - ;step2[28] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 - STORE_IN_OUTPUT 18, 19, 28, q4, q6 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK B: 20-23,24-27 - ; -------------------------------------------------------------------------- - ; generate 20,21,26,27 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; - ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; - ;step1b[20][i] = dct_const_round_shift(temp1); - ;step1b[27][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 7, 5, 27 - DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; - ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; - ;step1b[21][i] = dct_const_round_shift(temp1); - ;step1b[26][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 27, 21, 11 - DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[20] = step1b[20][i] + step1b[21][i]; - ;step2[21] = step1b[20][i] - step1b[21][i]; - ;step2[26] = -step1b[26][i] + step1b[27][i]; - ;step2[27] = step1b[26][i] + step1b[27][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; - ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; - ;step3[21] = dct_const_round_shift(temp1); - ;step3[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 22,23,24,25 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; - ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; - ;step1b[22][i] = dct_const_round_shift(temp1); - ;step1b[25][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 11, 13, 19 - DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; - ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; - ;step1b[23][i] = dct_const_round_shift(temp1); - ;step1b[24][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 19, 29, 3 - DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[22] = -step1b[22][i] + step1b[23][i]; - ;step2[23] = step1b[22][i] + step1b[23][i]; - ;step2[24] = step1b[24][i] + step1b[25][i]; - ;step2[25] = step1b[24][i] - step1b[25][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); - ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); - ;step3[25] = dct_const_round_shift(temp1); - ;step3[22] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 20-23,24-27 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[22] = step1b[22][i] + step1b[21][i]; - ;step1[23] = step1b[23][i] + step1b[20][i]; - vadd.s16 q10, q7, q1 - vadd.s16 q11, q5, q0 - ;step1[24] = step1b[24][i] + step1b[27][i]; - ;step1[25] = step1b[25][i] + step1b[26][i]; - vadd.s16 q12, q6, q2 - vadd.s16 q15, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[16] = step1b[16][i] + step1b[23][i]; - ;step3[17] = step1b[17][i] + step1b[22][i]; - ;step3[22] = step1b[17][i] - step1b[22][i]; - ;step3[23] = step1b[16][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 - vadd.s16 q8, q14, q11 - vadd.s16 q9, q13, q10 - vsub.s16 q13, q13, q10 - vsub.s16 q11, q14, q11 - STORE_IN_OUTPUT 17, 17, 16, q9, q8 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[24] = step1b[31][i] - step1b[24][i]; - ;step3[25] = step1b[30][i] - step1b[25][i]; - ;step3[30] = step1b[30][i] + step1b[25][i]; - ;step3[31] = step1b[31][i] + step1b[24][i]; - LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 - vsub.s16 q8, q9, q12 - vadd.s16 q10, q14, q15 - vsub.s16 q14, q14, q15 - vadd.s16 q12, q9, q12 - STORE_IN_OUTPUT 31, 30, 31, q10, q12 - ; -------------------------------------------------------------------------- - ; TODO(cd) do some register allocation change to remove these push/pop - vpush {q8} ; [24] - vpush {q11} ; [23] - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; - ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; - ;step1[22] = dct_const_round_shift(temp1); - ;step1[25] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 31, 25, 22, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; - ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; - ;step1[23] = dct_const_round_shift(temp1); - ;step1[24] = dct_const_round_shift(temp2); - ; TODO(cd) do some register allocation change to remove these push/pop - vpop {q13} ; [23] - vpop {q14} ; [24] - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 22, 24, 23, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[20] = step1b[23][i] - step1b[20][i]; - ;step1[27] = step1b[24][i] - step1b[27][i]; - vsub.s16 q14, q5, q0 - vsub.s16 q13, q6, q2 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); - ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); - ;step2[27] = dct_const_round_shift(temp1); - ;step2[20] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[21] = step1b[22][i] - step1b[21][i]; - ;step1[26] = step1b[25][i] - step1b[26][i]; - vsub.s16 q14, q7, q1 - vsub.s16 q13, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); - ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); - ;step2[26] = dct_const_round_shift(temp1); - ;step2[21] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[18] = step1b[18][i] + step1b[21][i]; - ;step3[19] = step1b[19][i] + step1b[20][i]; - ;step3[20] = step1b[19][i] - step1b[20][i]; - ;step3[21] = step1b[18][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 - vadd.s16 q8, q14, q1 - vadd.s16 q9, q13, q6 - vsub.s16 q13, q13, q6 - vsub.s16 q1, q14, q1 - STORE_IN_OUTPUT 19, 18, 19, q8, q9 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[27] = step1b[28][i] - step1b[27][i]; - ;step3[28] = step1b[28][i] + step1b[27][i]; - ;step3[29] = step1b[29][i] + step1b[26][i]; - ;step3[26] = step1b[29][i] - step1b[26][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 - vsub.s16 q14, q8, q5 - vadd.s16 q10, q8, q5 - vadd.s16 q11, q9, q0 - vsub.s16 q0, q9, q0 - STORE_IN_OUTPUT 29, 28, 29, q10, q11 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; - ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; - ;step1[20] = dct_const_round_shift(temp1); - ;step1[27] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 29, 20, 27, q13, q14 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; - ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; - ;step1[21] = dct_const_round_shift(temp1); - ;step1[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 - STORE_IN_OUTPUT 27, 21, 26, q1, q0 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK C: 8-10,11-15 - ; -------------------------------------------------------------------------- - ; generate 8,9,14,15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; - ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; - ;step2[8] = dct_const_round_shift(temp1); - ;step2[15] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 3, 2, 30 - DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; - ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; - ;step2[9] = dct_const_round_shift(temp1); - ;step2[14] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 30, 18, 14 - DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[8] = step1b[8][i] + step1b[9][i]; - ;step3[9] = step1b[8][i] - step1b[9][i]; - ;step3[14] = step1b[15][i] - step1b[14][i]; - ;step3[15] = step1b[15][i] + step1b[14][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; - ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; - ;step1[9] = dct_const_round_shift(temp1); - ;step1[14] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 10,11,12,13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; - ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; - ;step2[10] = dct_const_round_shift(temp1); - ;step2[13] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 14, 10, 22 - DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; - ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; - ;step2[11] = dct_const_round_shift(temp1); - ;step2[12] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 22, 26, 6 - DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[10] = step1b[11][i] - step1b[10][i]; - ;step3[11] = step1b[11][i] + step1b[10][i]; - ;step3[12] = step1b[12][i] + step1b[13][i]; - ;step3[13] = step1b[12][i] - step1b[13][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); - ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); - ;step1[13] = dct_const_round_shift(temp1); - ;step1[10] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 8-10,11-15 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[8] = step1b[8][i] + step1b[11][i]; - ;step2[9] = step1b[9][i] + step1b[10][i]; - ;step2[10] = step1b[9][i] - step1b[10][i]; - vadd.s16 q8, q0, q5 - vadd.s16 q9, q1, q7 - vsub.s16 q13, q1, q7 - ;step2[13] = step1b[14][i] - step1b[13][i]; - ;step2[14] = step1b[14][i] + step1b[13][i]; - ;step2[15] = step1b[15][i] + step1b[12][i]; - vsub.s16 q14, q3, q4 - vadd.s16 q10, q3, q4 - vadd.s16 q15, q2, q6 - STORE_IN_OUTPUT 26, 8, 15, q8, q15 - STORE_IN_OUTPUT 15, 9, 14, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; - ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; - ;step3[10] = dct_const_round_shift(temp1); - ;step3[13] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 14, 13, 10, q3, q1 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[11] = step1b[8][i] - step1b[11][i]; - ;step2[12] = step1b[15][i] - step1b[12][i]; - vsub.s16 q13, q0, q5 - vsub.s16 q14, q2, q6 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; - ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; - ;step3[11] = dct_const_round_shift(temp1); - ;step3[12] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 10, 11, 12, q1, q3 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK D: 0-3,4-7 - ; -------------------------------------------------------------------------- - ; generate 4,5,6,7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; - ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; - ;step3[4] = dct_const_round_shift(temp1); - ;step3[7] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 6, 4, 28 - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; - ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; - ;step3[5] = dct_const_round_shift(temp1); - ;step3[6] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 28, 20, 12 - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[4] = step1b[4][i] + step1b[5][i]; - ;step1[5] = step1b[4][i] - step1b[5][i]; - ;step1[6] = step1b[7][i] - step1b[6][i]; - ;step1[7] = step1b[7][i] + step1b[6][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; - ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; - ;step2[5] = dct_const_round_shift(temp1); - ;step2[6] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 0,1,2,3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; - ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; - ;step1[1] = dct_const_round_shift(temp1); - ;step1[0] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 12, 0, 16 - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; - ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; - ;step1[2] = dct_const_round_shift(temp1); - ;step1[3] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 16, 8, 24 - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[0] = step1b[0][i] + step1b[3][i]; - ;step2[1] = step1b[1][i] + step1b[2][i]; - ;step2[2] = step1b[1][i] - step1b[2][i]; - ;step2[3] = step1b[0][i] - step1b[3][i]; - vadd.s16 q4, q7, q6 - vsub.s16 q7, q7, q6 - vsub.s16 q6, q5, q14 - vadd.s16 q5, q5, q14 - ; -------------------------------------------------------------------------- - ; combine 0-3,4-7 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[0] = step1b[0][i] + step1b[7][i]; - ;step3[1] = step1b[1][i] + step1b[6][i]; - ;step3[2] = step1b[2][i] + step1b[5][i]; - ;step3[3] = step1b[3][i] + step1b[4][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q3 - vadd.s16 q10, q6, q1 - vadd.s16 q11, q7, q0 - ;step3[4] = step1b[3][i] - step1b[4][i]; - ;step3[5] = step1b[2][i] - step1b[5][i]; - ;step3[6] = step1b[1][i] - step1b[6][i]; - ;step3[7] = step1b[0][i] - step1b[7][i]; - vsub.s16 q12, q7, q0 - vsub.s16 q13, q6, q1 - vsub.s16 q14, q5, q3 - vsub.s16 q15, q4, q2 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[0] = step1b[0][i] + step1b[15][i]; - ;step1[1] = step1b[1][i] + step1b[14][i]; - ;step1[14] = step1b[1][i] - step1b[14][i]; - ;step1[15] = step1b[0][i] - step1b[15][i]; - LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 - vadd.s16 q2, q8, q1 - vadd.s16 q3, q9, q0 - vsub.s16 q4, q9, q0 - vsub.s16 q5, q8, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[14 * 32] = step1b[14][i] + step1b[17][i]; - ;output[15 * 32] = step1b[15][i] + step1b[16][i]; - ;output[16 * 32] = step1b[15][i] - step1b[16][i]; - ;output[17 * 32] = step1b[14][i] - step1b[17][i]; - LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - - cmp r5, #0 - bgt idct32_bands_end_2nd_pass - -idct32_bands_end_1st_pass - STORE_IN_OUTPUT 17, 16, 17, q6, q7 - STORE_IN_OUTPUT 17, 14, 15, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 30, 31, q6, q7 - STORE_IN_OUTPUT 31, 0, 1, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 19, 18, 19, q6, q7 - STORE_IN_OUTPUT 19, 12, 13, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 28, 29, q6, q7 - STORE_IN_OUTPUT 29, 2, 3, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 21, 20, 21, q6, q7 - STORE_IN_OUTPUT 21, 10, 11, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 26, 27, q6, q7 - STORE_IN_OUTPUT 27, 4, 5, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 23, 22, 23, q6, q7 - STORE_IN_OUTPUT 23, 8, 9, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 24, 25, q6, q7 - STORE_IN_OUTPUT 25, 6, 7, q4, q5 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #7*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; parameters for second pass - ; the input of pass2 is the result of pass1. we have to remove the offset - ; of 32 columns induced by the above idct32_bands_loop - sub r3, r1, #32*2 - ; r1 = pass2[32 * 32] - add r1, sp, #2048 - - ; pass loop processing - add r5, r5, #1 - b idct32_pass_loop - -idct32_bands_end_2nd_pass - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; restore pointers to their initial indices for next band pass by - ; removing/adding dest_stride * 8. The actual increment by eight - ; is taken care of within the _LAST macros. - add r6, r6, r2, lsl #3 - add r9, r9, r2, lsl #3 - sub r7, r7, r2, lsl #3 - sub r10, r10, r2, lsl #3 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #25*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; stack operation - add sp, sp, #512+2048+2048 - vpop {d8-d15} - pop {r4-r11} - bx lr - ENDP ; |vp9_idct32x32_1024_add_neon| - END diff --git a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c deleted file mode 100644 index 0ff22d5d5..000000000 --- a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "vp9/common/vp9_idct.h" -#include "vpx_ports/mem.h" - -void vp9_idct4x4_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d6u8; - uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - q0s16 = vdupq_n_s16(a1); - - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), - vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } - return; -} diff --git a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm deleted file mode 100644 index 0d4a721c4..000000000 --- a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm +++ /dev/null @@ -1,68 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp9_idct4x4_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vp9_idct4x4_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 4) - add r0, r0, #8 ; + (1 <<((4) - 1)) - asr r0, r0, #4 ; >> 4 - - vdup.s16 q0, r0 ; duplicate a1 - - vld1.32 {d2[0]}, [r1], r2 - vld1.32 {d2[1]}, [r1], r2 - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q8, q0, d2 ; dest[x] + a1 - vaddw.u8 q9, q0, d4 - - vqmovun.s16 d6, q8 ; clip_pixel - vqmovun.s16 d7, q9 - - vst1.32 {d6[0]}, [r12], r2 - vst1.32 {d6[1]}, [r12], r2 - vst1.32 {d7[0]}, [r12], r2 - vst1.32 {d7[1]}, [r12] - - bx lr - ENDP ; |vp9_idct4x4_1_add_neon| - - END diff --git a/vp9/common/arm/neon/vp9_idct4x4_add_neon.c b/vp9/common/arm/neon/vp9_idct4x4_add_neon.c deleted file mode 100644 index dc91e0f30..000000000 --- a/vp9/common/arm/neon/vp9_idct4x4_add_neon.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -void vp9_idct4x4_16_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - int16_t cospi_8_64 = 15137; - int16_t cospi_16_64 = 11585; - int16_t cospi_24_64 = 6270; - - d26u32 = d27u32 = vdup_n_u32(0); - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - d20s16 = vdup_n_s16(cospi_8_64); - d21s16 = vdup_n_s16(cospi_16_64); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - d22s16 = vdup_n_s16(cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); - return; -} diff --git a/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm b/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm deleted file mode 100644 index 00283fc8d..000000000 --- a/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm +++ /dev/null @@ -1,190 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp9_idct4x4_16_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY ; name this block of code -;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vp9_idct4x4_16_add_neon| PROC - - ; The 2D transform is done with two passes which are actually pretty - ; similar. We first transform the rows. This is done by transposing - ; the inputs, doing an SIMD column transform (the columns are the - ; transposed rows) and then transpose the results (so that it goes back - ; in normal/row positions). Then, we transform the columns by doing - ; another SIMD column transform. - ; So, two passes of a transpose followed by a column transform. - - ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! - - ; generate scalar constants - ; cospi_8_64 = 15137 = 0x3b21 - mov r0, #0x3b00 - add r0, #0x21 - ; cospi_16_64 = 11585 = 0x2d41 - mov r3, #0x2d00 - add r3, #0x41 - ; cospi_24_64 = 6270 = 0x 187e - mov r12, #0x1800 - add r12, #0x7e - - ; transpose the input data - ; 00 01 02 03 d16 - ; 10 11 12 13 d17 - ; 20 21 22 23 d18 - ; 30 31 32 33 d19 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - - ; generate constant vectors - vdup.16 d20, r0 ; replicate cospi_8_64 - vdup.16 d21, r3 ; replicate cospi_16_64 - - ; 00 10 02 12 d16 - ; 01 11 03 13 d17 - ; 20 30 22 32 d18 - ; 21 31 23 33 d19 - vtrn.32 q8, q9 - ; 00 10 20 30 d16 - ; 01 11 21 31 d17 - ; 02 12 22 32 d18 - ; 03 13 23 33 d19 - - vdup.16 d22, r12 ; replicate cospi_24_64 - - ; do the transform on transposed rows - - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 - vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 - - ; (input[0] + input[2]) * cospi_16_64; - ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64; - ; input[1] * cospi_8_64 + input[3] * cospi_24_64; - vmlsl.s16 q15, d19, d20 - vmlal.s16 q1, d19, d22 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - vswp d18, d19 - - ; transpose the results - ; 00 01 02 03 d16 - ; 10 11 12 13 d17 - ; 20 21 22 23 d18 - ; 30 31 32 33 d19 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - ; 00 10 02 12 d16 - ; 01 11 03 13 d17 - ; 20 30 22 32 d18 - ; 21 31 23 33 d19 - vtrn.32 q8, q9 - ; 00 10 20 30 d16 - ; 01 11 21 31 d17 - ; 02 12 22 32 d18 - ; 03 13 23 33 d19 - - ; do the transform on columns - - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 - vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 - - ; (input[0] + input[2]) * cospi_16_64; - ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64; - ; input[1] * cospi_8_64 + input[3] * cospi_24_64; - vmlsl.s16 q15, d19, d20 - vmlal.s16 q1, d19, d22 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - - ; The results are in two registers, one of them being swapped. This will - ; be taken care of by loading the 'dest' value in a swapped fashion and - ; also storing them in the same swapped fashion. - ; temp_out[0, 1] = d16, d17 = q8 - ; temp_out[2, 3] = d19, d18 = q9 swapped - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) - vrshr.s16 q8, q8, #4 - vrshr.s16 q9, q9, #4 - - vld1.32 {d26[0]}, [r1], r2 - vld1.32 {d26[1]}, [r1], r2 - vld1.32 {d27[1]}, [r1], r2 - vld1.32 {d27[0]}, [r1] ; no post-increment - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d26 - vaddw.u8 q9, q9, d27 - - ; clip_pixel - vqmovun.s16 d26, q8 - vqmovun.s16 d27, q9 - - ; do the stores in reverse order with negative post-increment, by changing - ; the sign of the stride - rsb r2, r2, #0 - vst1.32 {d27[0]}, [r1], r2 - vst1.32 {d27[1]}, [r1], r2 - vst1.32 {d26[1]}, [r1], r2 - vst1.32 {d26[0]}, [r1] ; no post-increment - bx lr - ENDP ; |vp9_idct4x4_16_add_neon| - - END diff --git a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c deleted file mode 100644 index 4178d349d..000000000 --- a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "vp9/common/vp9_idct.h" -#include "vpx_ports/mem.h" - -void vp9_idct8x8_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - return; -} diff --git a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm deleted file mode 100644 index 421d202d4..000000000 --- a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm +++ /dev/null @@ -1,88 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp9_idct8x8_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vp9_idct8x8_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 5) - add r0, r0, #16 ; + (1 <<((5) - 1)) - asr r0, r0, #5 ; >> 5 - - vdup.s16 q0, r0 ; duplicate a1 - - ; load destination data - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r2 - vld1.64 {d17}, [r1] - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vp9_idct8x8_1_add_neon| - - END diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c b/vp9/common/arm/neon/vp9_idct8x8_add_neon.c deleted file mode 100644 index ea3ce4943..000000000 --- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c +++ /dev/null @@ -1,540 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "./vpx_config.h" -#include "vpx_dsp/txfm_common.h" - -static INLINE void TRANSPOSE8X8( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), - vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), - vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), - vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), - vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - -static INLINE void IDCT8x8_1D( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); - return; -} - -void vp9_idct8x8_64_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; -} - -void vp9_idct8x8_12_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16(cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16(cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_8_64 * 2); - - q13s16 = vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; -} diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm b/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm deleted file mode 100644 index ab5bb6920..000000000 --- a/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm +++ /dev/null @@ -1,519 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp9_idct8x8_64_add_neon| - EXPORT |vp9_idct8x8_12_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are - ; loaded in q8-q15. The output will be stored back into q8-q15 registers. - ; This macro will touch q0-q7 registers and use them as buffer during - ; calculation. - MACRO - IDCT8x8_1D - ; stage 1 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r4 ; duplicate cospi_4_64 - vdup.16 d2, r5 ; duplicate cospi_12_64 - vdup.16 d3, r6 ; duplicate cospi_20_64 - - ; input[1] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; input[5] * cospi_12_64 - vmull.s16 q5, d26, d2 - vmull.s16 q6, d27, d2 - - ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q5, d22, d3 - vmlsl.s16 q6, d23, d3 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 - - ; input[1] * cospi_4_64 - vmull.s16 q2, d18, d1 - vmull.s16 q3, d19, d1 - - ; input[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q13, d27, d3 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vmlal.s16 q2, d30, d0 - vmlal.s16 q3, d31, d0 - - ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q13, d23, d2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 - - ; stage 2 & stage 3 - even half - vdup.16 d0, r7 ; duplicate cospi_16_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q13, #14 ; >> 14 - - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 - - ; input[0] * cospi_16_64 - vmull.s16 q13, d16, d0 - vmull.s16 q15, d17, d0 - - ; (input[0] + input[2]) * cospi_16_64 - vmlal.s16 q2, d24, d0 - vmlal.s16 q3, d25, d0 - - ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q13, d24, d0 - vmlsl.s16 q15, d25, d0 - - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q2, #14 ; >> 14 - vqrshrn.s32 d19, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q13, #14 ; >> 14 - vqrshrn.s32 d23, q15, #14 ; >> 14 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - ; input[1] * cospi_24_64 - vmull.s16 q2, d20, d0 - vmull.s16 q3, d21, d0 - - ; input[1] * cospi_8_64 - vmull.s16 q8, d20, d1 - vmull.s16 q12, d21, d1 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q2, d28, d1 - vmlsl.s16 q3, d29, d1 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q8, d28, d0 - vmlal.s16 q12, d29, d0 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d26, q2, #14 ; >> 14 - vqrshrn.s32 d27, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q8, #14 ; >> 14 - vqrshrn.s32 d31, q12, #14 ; >> 14 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - MEND - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vp9_idct8x8_64_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - IDCT8x8_1D - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vp9_idct8x8_64_add_neon| - -;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vp9_idct8x8_12_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - ; stage 1 - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling - ; multiply and shift the result by 16 bits instead of 14 bits. So we need - ; to double the constants before multiplying to compensate this. - mov r12, r3, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_28_64*2 - mov r12, r4, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; dct_const_round_shift(input[1] * cospi_28_64) - vqrdmulh.s16 q4, q9, q0 - - mov r12, r6, lsl #1 - rsb r12, #0 - vdup.16 q0, r12 ; duplicate -cospi_20_64*2 - - ; dct_const_round_shift(input[1] * cospi_4_64) - vqrdmulh.s16 q7, q9, q1 - - mov r12, r5, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_12_64*2 - - ; dct_const_round_shift(- input[3] * cospi_20_64) - vqrdmulh.s16 q5, q11, q0 - - mov r12, r7, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_16_64*2 - - ; dct_const_round_shift(input[3] * cospi_12_64) - vqrdmulh.s16 q6, q11, q1 - - ; stage 2 & stage 3 - even half - mov r12, r8, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_24_64*2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrdmulh.s16 q9, q8, q0 - - mov r12, r9, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_8_64*2 - - ; dct_const_round_shift(input[1] * cospi_24_64) - vqrdmulh.s16 q13, q10, q1 - - ; dct_const_round_shift(input[1] * cospi_8_64) - vqrdmulh.s16 q15, q10, q0 - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vp9_idct8x8_12_add_neon| - - END diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 174b96e21..b15f7f370 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -11,283 +11,20 @@ #include #include "./vp9_rtcd.h" -#include "vpx_ports/mem.h" +#include "./vpx_dsp_rtcd.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_systemdependent.h" - -static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { - trans = WRAPLOW(trans, 8); - return clip_pixel(WRAPLOW(dest + trans, 8)); -} - -void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { -/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. */ - int i; - tran_low_t output[16]; - tran_high_t a1, b1, c1, d1, e1; - const tran_low_t *ip = input; - tran_low_t *op = output; - - for (i = 0; i < 4; i++) { - a1 = ip[0] >> UNIT_QUANT_SHIFT; - c1 = ip[1] >> UNIT_QUANT_SHIFT; - d1 = ip[2] >> UNIT_QUANT_SHIFT; - b1 = ip[3] >> UNIT_QUANT_SHIFT; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - op[0] = WRAPLOW(a1, 8); - op[1] = WRAPLOW(b1, 8); - op[2] = WRAPLOW(c1, 8); - op[3] = WRAPLOW(d1, 8); - ip += 4; - op += 4; - } - - ip = output; - for (i = 0; i < 4; i++) { - a1 = ip[4 * 0]; - c1 = ip[4 * 1]; - d1 = ip[4 * 2]; - b1 = ip[4 * 3]; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1); - dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1); - dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1); - dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1); - - ip++; - dest++; - } -} - -void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { - int i; - tran_high_t a1, e1; - tran_low_t tmp[4]; - const tran_low_t *ip = in; - tran_low_t *op = tmp; - - a1 = ip[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - op[0] = WRAPLOW(a1, 8); - op[1] = op[2] = op[3] = WRAPLOW(e1, 8); - - ip = tmp; - for (i = 0; i < 4; i++) { - e1 = ip[0] >> 1; - a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); - dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); - dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); - dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); - ip++; - dest++; - } -} - -static void idct4(const tran_low_t *input, tran_low_t *output) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step[1] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step[3] = WRAPLOW(dct_const_round_shift(temp2), 8); - - // stage 2 - output[0] = WRAPLOW(step[0] + step[3], 8); - output[1] = WRAPLOW(step[1] + step[2], 8); - output[2] = WRAPLOW(step[1] - step[2], 8); - output[3] = WRAPLOW(step[0] - step[3], 8); -} - -void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - - // Rows - for (i = 0; i < 4; ++i) { - idct4(input, outptr); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; - idct4(temp_in, temp_out); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 4)); - } - } -} - -void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - int i; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = clip_pixel_add(dest[0], a1); - dest[1] = clip_pixel_add(dest[1], a1); - dest[2] = clip_pixel_add(dest[2], a1); - dest[3] = clip_pixel_add(dest[3], a1); - dest += dest_stride; - } -} - -static void idct8(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - - // stage 2 & stage 3 - even half - idct4(step1, step1); - - // stage 2 - odd half - step2[4] = WRAPLOW(step1[4] + step1[5], 8); - step2[5] = WRAPLOW(step1[4] - step1[5], 8); - step2[6] = WRAPLOW(-step1[6] + step1[7], 8); - step2[7] = WRAPLOW(step1[6] + step1[7], 8); - - // stage 3 -odd half - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - step1[7] = step2[7]; - - // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7], 8); - output[1] = WRAPLOW(step1[1] + step1[6], 8); - output[2] = WRAPLOW(step1[2] + step1[5], 8); - output[3] = WRAPLOW(step1[3] + step1[4], 8); - output[4] = WRAPLOW(step1[3] - step1[4], 8); - output[5] = WRAPLOW(step1[2] - step1[5], 8); - output[6] = WRAPLOW(step1[1] - step1[6], 8); - output[7] = WRAPLOW(step1[0] - step1[7], 8); -} - -void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - for (i = 0; i < 8; ++i) { - idct8(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - idct8(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } -} - -void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) - dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -static void iadst4(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - - if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = x0 - x2 + x3; - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8); - output[2] = WRAPLOW(dct_const_round_shift(s2), 8); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8); -} +#include "vpx_dsp/inv_txfm.h" +#include "vpx_ports/mem.h" void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { const transform_2d IHT_4[] = { - { idct4, idct4 }, // DCT_DCT = 0 - { iadst4, idct4 }, // ADST_DCT = 1 - { idct4, iadst4 }, // DCT_ADST = 2 - { iadst4, iadst4 } // ADST_ADST = 3 + { idct4_c, idct4_c }, // DCT_DCT = 0 + { iadst4_c, idct4_c }, // ADST_DCT = 1 + { idct4_c, iadst4_c }, // DCT_ADST = 2 + { iadst4_c, iadst4_c } // ADST_ADST = 3 }; int i, j; @@ -314,88 +51,11 @@ void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, } } -static void iadst8(const tran_low_t *input, tran_low_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - - tran_high_t x0 = input[7]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[5]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[3]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[1]; - tran_high_t x7 = input[6]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = 0; - return; - } - - // stage 1 - s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); - s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); - s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); - s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); - s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); - s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); - s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); - s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); - - x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8); - - // stage 2 - s0 = (int)x0; - s1 = (int)x1; - s2 = (int)x2; - s3 = (int)x3; - s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); - s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); - s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); - s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); - - x0 = WRAPLOW(s0 + s2, 8); - x1 = WRAPLOW(s1 + s3, 8); - x2 = WRAPLOW(s0 - s2, 8); - x3 = WRAPLOW(s1 - s3, 8); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); - - // stage 3 - s2 = (int)(cospi_16_64 * (x2 + x3)); - s3 = (int)(cospi_16_64 * (x2 - x3)); - s6 = (int)(cospi_16_64 * (x6 + x7)); - s7 = (int)(cospi_16_64 * (x6 - x7)); - - x2 = WRAPLOW(dct_const_round_shift(s2), 8); - x3 = WRAPLOW(dct_const_round_shift(s3), 8); - x6 = WRAPLOW(dct_const_round_shift(s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s7), 8); - - output[0] = WRAPLOW(x0, 8); - output[1] = WRAPLOW(-x4, 8); - output[2] = WRAPLOW(x6, 8); - output[3] = WRAPLOW(-x2, 8); - output[4] = WRAPLOW(x3, 8); - output[5] = WRAPLOW(-x7, 8); - output[6] = WRAPLOW(x5, 8); - output[7] = WRAPLOW(-x1, 8); -} - static const transform_2d IHT_8[] = { - { idct8, idct8 }, // DCT_DCT = 0 - { iadst8, idct8 }, // ADST_DCT = 1 - { idct8, iadst8 }, // DCT_ADST = 2 - { iadst8, iadst8 } // ADST_ADST = 3 + { idct8_c, idct8_c }, // DCT_DCT = 0 + { iadst8_c, idct8_c }, // ADST_DCT = 1 + { idct8_c, iadst8_c }, // DCT_ADST = 2 + { iadst8_c, iadst8_c } // ADST_ADST = 3 }; void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, @@ -425,400 +85,11 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, } } -void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - // only first 4 row has non-zero coefs - for (i = 0; i < 4; ++i) { - idct8(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - idct8(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } -} - -static void idct16(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[16], step2[16]; - tran_high_t temp1, temp2; - - // stage 1 - step1[0] = input[0/2]; - step1[1] = input[16/2]; - step1[2] = input[8/2]; - step1[3] = input[24/2]; - step1[4] = input[4/2]; - step1[5] = input[20/2]; - step1[6] = input[12/2]; - step1[7] = input[28/2]; - step1[8] = input[2/2]; - step1[9] = input[18/2]; - step1[10] = input[10/2]; - step1[11] = input[26/2]; - step1[12] = input[6/2]; - step1[13] = input[22/2]; - step1[14] = input[14/2]; - step1[15] = input[30/2]; - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - - step1[8] = WRAPLOW(step2[8] + step2[9], 8); - step1[9] = WRAPLOW(step2[8] - step2[9], 8); - step1[10] = WRAPLOW(-step2[10] + step2[11], 8); - step1[11] = WRAPLOW(step2[10] + step2[11], 8); - step1[12] = WRAPLOW(step2[12] + step2[13], 8); - step1[13] = WRAPLOW(step2[12] - step2[13], 8); - step1[14] = WRAPLOW(-step2[14] + step2[15], 8); - step1[15] = WRAPLOW(step2[14] + step2[15], 8); - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[4] = WRAPLOW(step1[4] + step1[5], 8); - step2[5] = WRAPLOW(step1[4] - step1[5], 8); - step2[6] = WRAPLOW(-step1[6] + step1[7], 8); - step2[7] = WRAPLOW(step1[6] + step1[7], 8); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[11] = step1[11]; - step2[12] = step1[12]; - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], 8); - step1[1] = WRAPLOW(step2[1] + step2[2], 8); - step1[2] = WRAPLOW(step2[1] - step2[2], 8); - step1[3] = WRAPLOW(step2[0] - step2[3], 8); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11], 8); - step1[9] = WRAPLOW(step2[9] + step2[10], 8); - step1[10] = WRAPLOW(step2[9] - step2[10], 8); - step1[11] = WRAPLOW(step2[8] - step2[11], 8); - step1[12] = WRAPLOW(-step2[12] + step2[15], 8); - step1[13] = WRAPLOW(-step2[13] + step2[14], 8); - step1[14] = WRAPLOW(step2[13] + step2[14], 8); - step1[15] = WRAPLOW(step2[12] + step2[15], 8); - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], 8); - step2[1] = WRAPLOW(step1[1] + step1[6], 8); - step2[2] = WRAPLOW(step1[2] + step1[5], 8); - step2[3] = WRAPLOW(step1[3] + step1[4], 8); - step2[4] = WRAPLOW(step1[3] - step1[4], 8); - step2[5] = WRAPLOW(step1[2] - step1[5], 8); - step2[6] = WRAPLOW(step1[1] - step1[6], 8); - step2[7] = WRAPLOW(step1[0] - step1[7], 8); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[14] = step1[14]; - step2[15] = step1[15]; - - // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15], 8); - output[1] = WRAPLOW(step2[1] + step2[14], 8); - output[2] = WRAPLOW(step2[2] + step2[13], 8); - output[3] = WRAPLOW(step2[3] + step2[12], 8); - output[4] = WRAPLOW(step2[4] + step2[11], 8); - output[5] = WRAPLOW(step2[5] + step2[10], 8); - output[6] = WRAPLOW(step2[6] + step2[9], 8); - output[7] = WRAPLOW(step2[7] + step2[8], 8); - output[8] = WRAPLOW(step2[7] - step2[8], 8); - output[9] = WRAPLOW(step2[6] - step2[9], 8); - output[10] = WRAPLOW(step2[5] - step2[10], 8); - output[11] = WRAPLOW(step2[4] - step2[11], 8); - output[12] = WRAPLOW(step2[3] - step2[12], 8); - output[13] = WRAPLOW(step2[2] - step2[13], 8); - output[14] = WRAPLOW(step2[1] - step2[14], 8); - output[15] = WRAPLOW(step2[0] - step2[15], 8); -} - -void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - - // First transform rows - for (i = 0; i < 16; ++i) { - idct16(input, outptr); - input += 16; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - idct16(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -static void iadst16(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_high_t x0 = input[15]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[13]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[11]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[9]; - tran_high_t x7 = input[6]; - tran_high_t x8 = input[7]; - tran_high_t x9 = input[8]; - tran_high_t x10 = input[5]; - tran_high_t x11 = input[10]; - tran_high_t x12 = input[3]; - tran_high_t x13 = input[12]; - tran_high_t x14 = input[1]; - tran_high_t x15 = input[14]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = output[8] - = output[9] = output[10] = output[11] = output[12] - = output[13] = output[14] = output[15] = 0; - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = WRAPLOW(s0 + s4, 8); - x1 = WRAPLOW(s1 + s5, 8); - x2 = WRAPLOW(s2 + s6, 8); - x3 = WRAPLOW(s3 + s7, 8); - x4 = WRAPLOW(s0 - s4, 8); - x5 = WRAPLOW(s1 - s5, 8); - x6 = WRAPLOW(s2 - s6, 8); - x7 = WRAPLOW(s3 - s7, 8); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = WRAPLOW(check_range(s0 + s2), 8); - x1 = WRAPLOW(check_range(s1 + s3), 8); - x2 = WRAPLOW(check_range(s0 - s2), 8); - x3 = WRAPLOW(check_range(s1 - s3), 8); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); - x8 = WRAPLOW(check_range(s8 + s10), 8); - x9 = WRAPLOW(check_range(s9 + s11), 8); - x10 = WRAPLOW(check_range(s8 - s10), 8); - x11 = WRAPLOW(check_range(s9 - s11), 8); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8); - - // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (- x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (- x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = WRAPLOW(dct_const_round_shift(s2), 8); - x3 = WRAPLOW(dct_const_round_shift(s3), 8); - x6 = WRAPLOW(dct_const_round_shift(s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s7), 8); - x10 = WRAPLOW(dct_const_round_shift(s10), 8); - x11 = WRAPLOW(dct_const_round_shift(s11), 8); - x14 = WRAPLOW(dct_const_round_shift(s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s15), 8); - - output[0] = WRAPLOW(x0, 8); - output[1] = WRAPLOW(-x8, 8); - output[2] = WRAPLOW(x12, 8); - output[3] = WRAPLOW(-x4, 8); - output[4] = WRAPLOW(x6, 8); - output[5] = WRAPLOW(x14, 8); - output[6] = WRAPLOW(x10, 8); - output[7] = WRAPLOW(x2, 8); - output[8] = WRAPLOW(x3, 8); - output[9] = WRAPLOW(x11, 8); - output[10] = WRAPLOW(x15, 8); - output[11] = WRAPLOW(x7, 8); - output[12] = WRAPLOW(x5, 8); - output[13] = WRAPLOW(-x13, 8); - output[14] = WRAPLOW(x9, 8); - output[15] = WRAPLOW(-x1, 8); -} - static const transform_2d IHT_16[] = { - { idct16, idct16 }, // DCT_DCT = 0 - { iadst16, idct16 }, // ADST_DCT = 1 - { idct16, iadst16 }, // DCT_ADST = 2 - { iadst16, iadst16 } // ADST_ADST = 3 + { idct16_c, idct16_c }, // DCT_DCT = 0 + { iadst16_c, idct16_c }, // ADST_DCT = 1 + { idct16_c, iadst16_c }, // DCT_ADST = 2 + { iadst16_c, iadst16_c } // ADST_ADST = 3 }; void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, @@ -848,870 +119,101 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, } } -void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; +// idct +void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + if (eob > 1) + vp9_idct4x4_16_add(input, dest, stride); + else + vp9_idct4x4_1_add(input, dest, stride); +} - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. - for (i = 0; i < 4; ++i) { - idct16(input, outptr); - input += 16; - outptr += 16; - } - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - idct16(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } +void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + if (eob > 1) + vp9_iwht4x4_16_add(input, dest, stride); + else + vp9_iwht4x4_1_add(input, dest, stride); } -void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); - a1 = ROUND_POWER_OF_TWO(out, 6); - for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) - dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } +void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + // If dc is 1, then input[0] is the reconstructed value, do not need + // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. + + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do. + // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. + // Combine that with code here. + if (eob == 1) + // DC only DCT coefficient + vp9_idct8x8_1_add(input, dest, stride); + else if (eob <= 12) + vp9_idct8x8_12_add(input, dest, stride); + else + vp9_idct8x8_64_add(input, dest, stride); } -static void idct32(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[32], step2[32]; - tran_high_t temp1, temp2; +void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + /* The calculation can be simplified if there are not many non-zero dct + * coefficients. Use eobs to separate different cases. */ + if (eob == 1) + /* DC only DCT coefficient. */ + vp9_idct16x16_1_add(input, dest, stride); + else if (eob <= 10) + vp9_idct16x16_10_add(input, dest, stride); + else + vp9_idct16x16_256_add(input, dest, stride); +} - // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; +void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + if (eob == 1) + vp9_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vp9_idct32x32_34_add(input, dest, stride); + else + vp9_idct32x32_1024_add(input, dest, stride); +} - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8); +// iht +void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) + vp9_idct4x4_add(input, dest, stride, eob); + else + vp9_iht4x4_16_add(input, dest, stride, tx_type); +} - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); +void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct8x8_add(input, dest, stride, eob); + } else { + vp9_iht8x8_64_add(input, dest, stride, tx_type); + } +} - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); - - step2[16] = WRAPLOW(step1[16] + step1[17], 8); - step2[17] = WRAPLOW(step1[16] - step1[17], 8); - step2[18] = WRAPLOW(-step1[18] + step1[19], 8); - step2[19] = WRAPLOW(step1[18] + step1[19], 8); - step2[20] = WRAPLOW(step1[20] + step1[21], 8); - step2[21] = WRAPLOW(step1[20] - step1[21], 8); - step2[22] = WRAPLOW(-step1[22] + step1[23], 8); - step2[23] = WRAPLOW(step1[22] + step1[23], 8); - step2[24] = WRAPLOW(step1[24] + step1[25], 8); - step2[25] = WRAPLOW(step1[24] - step1[25], 8); - step2[26] = WRAPLOW(-step1[26] + step1[27], 8); - step2[27] = WRAPLOW(step1[26] + step1[27], 8); - step2[28] = WRAPLOW(step1[28] + step1[29], 8); - step2[29] = WRAPLOW(step1[28] - step1[29], 8); - step2[30] = WRAPLOW(-step1[30] + step1[31], 8); - step2[31] = WRAPLOW(step1[30] + step1[31], 8); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - - step1[8] = WRAPLOW(step2[8] + step2[9], 8); - step1[9] = WRAPLOW(step2[8] - step2[9], 8); - step1[10] = WRAPLOW(-step2[10] + step2[11], 8); - step1[11] = WRAPLOW(step2[10] + step2[11], 8); - step1[12] = WRAPLOW(step2[12] + step2[13], 8); - step1[13] = WRAPLOW(step2[12] - step2[13], 8); - step1[14] = WRAPLOW(-step2[14] + step2[15], 8); - step1[15] = WRAPLOW(step2[14] + step2[15], 8); - - step1[16] = step2[16]; - step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); - step1[19] = step2[19]; - step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[27] = step2[27]; - step1[28] = step2[28]; - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[4] = WRAPLOW(step1[4] + step1[5], 8); - step2[5] = WRAPLOW(step1[4] - step1[5], 8); - step2[6] = WRAPLOW(-step1[6] + step1[7], 8); - step2[7] = WRAPLOW(step1[6] + step1[7], 8); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[11] = step1[11]; - step2[12] = step1[12]; - - step2[16] = WRAPLOW(step1[16] + step1[19], 8); - step2[17] = WRAPLOW(step1[17] + step1[18], 8); - step2[18] = WRAPLOW(step1[17] - step1[18], 8); - step2[19] = WRAPLOW(step1[16] - step1[19], 8); - step2[20] = WRAPLOW(-step1[20] + step1[23], 8); - step2[21] = WRAPLOW(-step1[21] + step1[22], 8); - step2[22] = WRAPLOW(step1[21] + step1[22], 8); - step2[23] = WRAPLOW(step1[20] + step1[23], 8); - - step2[24] = WRAPLOW(step1[24] + step1[27], 8); - step2[25] = WRAPLOW(step1[25] + step1[26], 8); - step2[26] = WRAPLOW(step1[25] - step1[26], 8); - step2[27] = WRAPLOW(step1[24] - step1[27], 8); - step2[28] = WRAPLOW(-step1[28] + step1[31], 8); - step2[29] = WRAPLOW(-step1[29] + step1[30], 8); - step2[30] = WRAPLOW(step1[29] + step1[30], 8); - step2[31] = WRAPLOW(step1[28] + step1[31], 8); - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], 8); - step1[1] = WRAPLOW(step2[1] + step2[2], 8); - step1[2] = WRAPLOW(step2[1] - step2[2], 8); - step1[3] = WRAPLOW(step2[0] - step2[3], 8); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11], 8); - step1[9] = WRAPLOW(step2[9] + step2[10], 8); - step1[10] = WRAPLOW(step2[9] - step2[10], 8); - step1[11] = WRAPLOW(step2[8] - step2[11], 8); - step1[12] = WRAPLOW(-step2[12] + step2[15], 8); - step1[13] = WRAPLOW(-step2[13] + step2[14], 8); - step1[14] = WRAPLOW(step2[13] + step2[14], 8); - step1[15] = WRAPLOW(step2[12] + step2[15], 8); - - step1[16] = step2[16]; - step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); - step1[22] = step2[22]; - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[25] = step2[25]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], 8); - step2[1] = WRAPLOW(step1[1] + step1[6], 8); - step2[2] = WRAPLOW(step1[2] + step1[5], 8); - step2[3] = WRAPLOW(step1[3] + step1[4], 8); - step2[4] = WRAPLOW(step1[3] - step1[4], 8); - step2[5] = WRAPLOW(step1[2] - step1[5], 8); - step2[6] = WRAPLOW(step1[1] - step1[6], 8); - step2[7] = WRAPLOW(step1[0] - step1[7], 8); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[14] = step1[14]; - step2[15] = step1[15]; - - step2[16] = WRAPLOW(step1[16] + step1[23], 8); - step2[17] = WRAPLOW(step1[17] + step1[22], 8); - step2[18] = WRAPLOW(step1[18] + step1[21], 8); - step2[19] = WRAPLOW(step1[19] + step1[20], 8); - step2[20] = WRAPLOW(step1[19] - step1[20], 8); - step2[21] = WRAPLOW(step1[18] - step1[21], 8); - step2[22] = WRAPLOW(step1[17] - step1[22], 8); - step2[23] = WRAPLOW(step1[16] - step1[23], 8); - - step2[24] = WRAPLOW(-step1[24] + step1[31], 8); - step2[25] = WRAPLOW(-step1[25] + step1[30], 8); - step2[26] = WRAPLOW(-step1[26] + step1[29], 8); - step2[27] = WRAPLOW(-step1[27] + step1[28], 8); - step2[28] = WRAPLOW(step1[27] + step1[28], 8); - step2[29] = WRAPLOW(step1[26] + step1[29], 8); - step2[30] = WRAPLOW(step1[25] + step1[30], 8); - step2[31] = WRAPLOW(step1[24] + step1[31], 8); - - // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15], 8); - step1[1] = WRAPLOW(step2[1] + step2[14], 8); - step1[2] = WRAPLOW(step2[2] + step2[13], 8); - step1[3] = WRAPLOW(step2[3] + step2[12], 8); - step1[4] = WRAPLOW(step2[4] + step2[11], 8); - step1[5] = WRAPLOW(step2[5] + step2[10], 8); - step1[6] = WRAPLOW(step2[6] + step2[9], 8); - step1[7] = WRAPLOW(step2[7] + step2[8], 8); - step1[8] = WRAPLOW(step2[7] - step2[8], 8); - step1[9] = WRAPLOW(step2[6] - step2[9], 8); - step1[10] = WRAPLOW(step2[5] - step2[10], 8); - step1[11] = WRAPLOW(step2[4] - step2[11], 8); - step1[12] = WRAPLOW(step2[3] - step2[12], 8); - step1[13] = WRAPLOW(step2[2] - step2[13], 8); - step1[14] = WRAPLOW(step2[1] - step2[14], 8); - step1[15] = WRAPLOW(step2[0] - step2[15], 8); - - step1[16] = step2[16]; - step1[17] = step2[17]; - step1[18] = step2[18]; - step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); - step1[28] = step2[28]; - step1[29] = step2[29]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // final stage - output[0] = WRAPLOW(step1[0] + step1[31], 8); - output[1] = WRAPLOW(step1[1] + step1[30], 8); - output[2] = WRAPLOW(step1[2] + step1[29], 8); - output[3] = WRAPLOW(step1[3] + step1[28], 8); - output[4] = WRAPLOW(step1[4] + step1[27], 8); - output[5] = WRAPLOW(step1[5] + step1[26], 8); - output[6] = WRAPLOW(step1[6] + step1[25], 8); - output[7] = WRAPLOW(step1[7] + step1[24], 8); - output[8] = WRAPLOW(step1[8] + step1[23], 8); - output[9] = WRAPLOW(step1[9] + step1[22], 8); - output[10] = WRAPLOW(step1[10] + step1[21], 8); - output[11] = WRAPLOW(step1[11] + step1[20], 8); - output[12] = WRAPLOW(step1[12] + step1[19], 8); - output[13] = WRAPLOW(step1[13] + step1[18], 8); - output[14] = WRAPLOW(step1[14] + step1[17], 8); - output[15] = WRAPLOW(step1[15] + step1[16], 8); - output[16] = WRAPLOW(step1[15] - step1[16], 8); - output[17] = WRAPLOW(step1[14] - step1[17], 8); - output[18] = WRAPLOW(step1[13] - step1[18], 8); - output[19] = WRAPLOW(step1[12] - step1[19], 8); - output[20] = WRAPLOW(step1[11] - step1[20], 8); - output[21] = WRAPLOW(step1[10] - step1[21], 8); - output[22] = WRAPLOW(step1[9] - step1[22], 8); - output[23] = WRAPLOW(step1[8] - step1[23], 8); - output[24] = WRAPLOW(step1[7] - step1[24], 8); - output[25] = WRAPLOW(step1[6] - step1[25], 8); - output[26] = WRAPLOW(step1[5] - step1[26], 8); - output[27] = WRAPLOW(step1[4] - step1[27], 8); - output[28] = WRAPLOW(step1[3] - step1[28], 8); - output[29] = WRAPLOW(step1[2] - step1[29], 8); - output[30] = WRAPLOW(step1[1] - step1[30], 8); - output[31] = WRAPLOW(step1[0] - step1[31], 8); -} - -void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - for (i = 0; i < 32; ++i) { - int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) - zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) - idct32(input, outptr); - else - memset(outptr, 0, sizeof(tran_low_t) * 32); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - idct32(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32] = {0}; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - // only upper-left 8x8 has non-zero coeff - for (i = 0; i < 8; ++i) { - idct32(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - idct32(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); - a1 = ROUND_POWER_OF_TWO(out, 6); - - for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) - dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -// idct -void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - int eob) { - if (eob > 1) - vp9_idct4x4_16_add(input, dest, stride); - else - vp9_idct4x4_1_add(input, dest, stride); -} - - -void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - int eob) { - if (eob > 1) - vp9_iwht4x4_16_add(input, dest, stride); - else - vp9_iwht4x4_1_add(input, dest, stride); -} - -void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, - int eob) { - // If dc is 1, then input[0] is the reconstructed value, do not need - // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. - - // The calculation can be simplified if there are not many non-zero dct - // coefficients. Use eobs to decide what to do. - // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. - // Combine that with code here. - if (eob == 1) - // DC only DCT coefficient - vp9_idct8x8_1_add(input, dest, stride); - else if (eob <= 12) - vp9_idct8x8_12_add(input, dest, stride); - else - vp9_idct8x8_64_add(input, dest, stride); -} - -void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, - int eob) { - /* The calculation can be simplified if there are not many non-zero dct - * coefficients. Use eobs to separate different cases. */ - if (eob == 1) - /* DC only DCT coefficient. */ - vp9_idct16x16_1_add(input, dest, stride); - else if (eob <= 10) - vp9_idct16x16_10_add(input, dest, stride); - else - vp9_idct16x16_256_add(input, dest, stride); -} - -void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, - int eob) { - if (eob == 1) - vp9_idct32x32_1_add(input, dest, stride); - else if (eob <= 34) - // non-zero coeff only in upper-left 8x8 - vp9_idct32x32_34_add(input, dest, stride); - else - vp9_idct32x32_1024_add(input, dest, stride); -} - -// iht -void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, - int stride, int eob) { - if (tx_type == DCT_DCT) - vp9_idct4x4_add(input, dest, stride, eob); - else - vp9_iht4x4_16_add(input, dest, stride, tx_type); -} - -void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, - int stride, int eob) { - if (tx_type == DCT_DCT) { - vp9_idct8x8_add(input, dest, stride, eob); - } else { - vp9_iht8x8_64_add(input, dest, stride, tx_type); - } -} - -void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, - int stride, int eob) { - if (tx_type == DCT_DCT) { - vp9_idct16x16_add(input, dest, stride, eob); - } else { - vp9_iht16x16_256_add(input, dest, stride, tx_type); - } -} +void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct16x16_add(input, dest, stride, eob); + } else { + vp9_iht16x16_256_add(input, dest, stride, tx_type); + } +} #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. */ - int i; - tran_low_t output[16]; - tran_high_t a1, b1, c1, d1, e1; - const tran_low_t *ip = input; - tran_low_t *op = output; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - for (i = 0; i < 4; i++) { - a1 = ip[0] >> UNIT_QUANT_SHIFT; - c1 = ip[1] >> UNIT_QUANT_SHIFT; - d1 = ip[2] >> UNIT_QUANT_SHIFT; - b1 = ip[3] >> UNIT_QUANT_SHIFT; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - op[0] = WRAPLOW(a1, bd); - op[1] = WRAPLOW(b1, bd); - op[2] = WRAPLOW(c1, bd); - op[3] = WRAPLOW(d1, bd); - ip += 4; - op += 4; - } - - ip = output; - for (i = 0; i < 4; i++) { - a1 = ip[4 * 0]; - c1 = ip[4 * 1]; - d1 = ip[4 * 2]; - b1 = ip[4 * 3]; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); - dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); - dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); - dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); - - ip++; - dest++; - } -} - -void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, - int dest_stride, int bd) { - int i; - tran_high_t a1, e1; - tran_low_t tmp[4]; - const tran_low_t *ip = in; - tran_low_t *op = tmp; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - (void) bd; - - a1 = ip[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - op[0] = WRAPLOW(a1, bd); - op[1] = op[2] = op[3] = WRAPLOW(e1, bd); - - ip = tmp; - for (i = 0; i < 4; i++) { - e1 = ip[0] >> 1; - a1 = ip[0] - e1; - dest[dest_stride * 0] = highbd_clip_pixel_add( - dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = highbd_clip_pixel_add( - dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = highbd_clip_pixel_add( - dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = highbd_clip_pixel_add( - dest[dest_stride * 3], e1, bd); - ip++; - dest++; - } -} - -void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - (void) bd; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - // stage 2 - output[0] = WRAPLOW(step[0] + step[3], bd); - output[1] = WRAPLOW(step[1] + step[2], bd); - output[2] = WRAPLOW(step[1] - step[2], bd); - output[3] = WRAPLOW(step[0] - step[3], bd); -} - -void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - for (i = 0; i < 4; ++i) { - vp9_highbd_idct4(input, outptr, bd); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; - vp9_highbd_idct4(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } -} - -void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int dest_stride, int bd) { - int i; - tran_high_t a1; - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); - dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); - dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); - dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += dest_stride; - } -} - -void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - // stage 2 & stage 3 - even half - vp9_highbd_idct4(step1, step1, bd); - - // stage 2 - odd half - step2[4] = WRAPLOW(step1[4] + step1[5], bd); - step2[5] = WRAPLOW(step1[4] - step1[5], bd); - step2[6] = WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = WRAPLOW(step1[6] + step1[7], bd); - - // stage 3 - odd half - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step1[7] = step2[7]; - - // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7], bd); - output[1] = WRAPLOW(step1[1] + step1[6], bd); - output[2] = WRAPLOW(step1[2] + step1[5], bd); - output[3] = WRAPLOW(step1[3] + step1[4], bd); - output[4] = WRAPLOW(step1[3] - step1[4], bd); - output[5] = WRAPLOW(step1[2] - step1[5], bd); - output[6] = WRAPLOW(step1[1] - step1[6], bd); - output[7] = WRAPLOW(step1[0] - step1[7], bd); -} - -void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - for (i = 0; i < 8; ++i) { - vp9_highbd_idct8(input, outptr, bd); - input += 8; - outptr += 8; - } - - // Then transform columns. - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vp9_highbd_idct8(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } -} - -void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} - -static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - (void) bd; - - if (!(x0 | x1 | x2 | x3)) { - memset(output, 0, 4 * sizeof(*output)); - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = (tran_high_t)(x0 - x2 + x3); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd); - output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd); - output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); - output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd); -} - void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int tx_type, int bd) { const highbd_transform_2d IHT_4[] = { - { vp9_highbd_idct4, vp9_highbd_idct4 }, // DCT_DCT = 0 - { highbd_iadst4, vp9_highbd_idct4 }, // ADST_DCT = 1 - { vp9_highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2 - { highbd_iadst4, highbd_iadst4 } // ADST_ADST = 3 + { vp9_highbd_idct4_c, vp9_highbd_idct4_c }, // DCT_DCT = 0 + { highbd_iadst4_c, vp9_highbd_idct4_c }, // ADST_DCT = 1 + { vp9_highbd_idct4_c, highbd_iadst4_c }, // DCT_ADST = 2 + { highbd_iadst4_c, highbd_iadst4_c } // ADST_ADST = 3 }; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); @@ -1739,88 +241,11 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[7]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[5]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[3]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[1]; - tran_low_t x7 = input[6]; - (void) bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - memset(output, 0, 8 * sizeof(*output)); - return; - } - - // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - - x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd); - x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - - x0 = WRAPLOW(s0 + s2, bd); - x1 = WRAPLOW(s1 + s3, bd); - x2 = WRAPLOW(s0 - s2, bd); - x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); - - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); - - x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); - - output[0] = WRAPLOW(x0, bd); - output[1] = WRAPLOW(-x4, bd); - output[2] = WRAPLOW(x6, bd); - output[3] = WRAPLOW(-x2, bd); - output[4] = WRAPLOW(x3, bd); - output[5] = WRAPLOW(-x7, bd); - output[6] = WRAPLOW(x5, bd); - output[7] = WRAPLOW(-x1, bd); -} - static const highbd_transform_2d HIGH_IHT_8[] = { - { vp9_highbd_idct8, vp9_highbd_idct8 }, // DCT_DCT = 0 - { highbd_iadst8, vp9_highbd_idct8 }, // ADST_DCT = 1 - { vp9_highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2 - { highbd_iadst8, highbd_iadst8 } // ADST_ADST = 3 + { vp9_highbd_idct8_c, vp9_highbd_idct8_c }, // DCT_DCT = 0 + { highbd_iadst8_c, vp9_highbd_idct8_c }, // ADST_DCT = 1 + { vp9_highbd_idct8_c, highbd_iadst8_c }, // DCT_ADST = 2 + { highbd_iadst8_c, highbd_iadst8_c } // ADST_ADST = 3 }; void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1851,402 +276,11 @@ void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - // Only first 4 row has non-zero coefs. - for (i = 0; i < 4; ++i) { - vp9_highbd_idct8(input, outptr, bd); - input += 8; - outptr += 8; - } - // Then transform columns. - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vp9_highbd_idct8(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } -} - -void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[16], step2[16]; - tran_high_t temp1, temp2; - (void) bd; - - // stage 1 - step1[0] = input[0/2]; - step1[1] = input[16/2]; - step1[2] = input[8/2]; - step1[3] = input[24/2]; - step1[4] = input[4/2]; - step1[5] = input[20/2]; - step1[6] = input[12/2]; - step1[7] = input[28/2]; - step1[8] = input[2/2]; - step1[9] = input[18/2]; - step1[10] = input[10/2]; - step1[11] = input[26/2]; - step1[12] = input[6/2]; - step1[13] = input[22/2]; - step1[14] = input[14/2]; - step1[15] = input[30/2]; - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - step1[8] = WRAPLOW(step2[8] + step2[9], bd); - step1[9] = WRAPLOW(step2[8] - step2[9], bd); - step1[10] = WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = WRAPLOW(step2[10] + step2[11], bd); - step1[12] = WRAPLOW(step2[12] + step2[13], bd); - step1[13] = WRAPLOW(step2[12] - step2[13], bd); - step1[14] = WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = WRAPLOW(step2[14] + step2[15], bd); - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[4] = WRAPLOW(step1[4] + step1[5], bd); - step2[5] = WRAPLOW(step1[4] - step1[5], bd); - step2[6] = WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = WRAPLOW(step1[6] + step1[7], bd); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[11] = step1[11]; - step2[12] = step1[12]; - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], bd); - step1[1] = WRAPLOW(step2[1] + step2[2], bd); - step1[2] = WRAPLOW(step2[1] - step2[2], bd); - step1[3] = WRAPLOW(step2[0] - step2[3], bd); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11], bd); - step1[9] = WRAPLOW(step2[9] + step2[10], bd); - step1[10] = WRAPLOW(step2[9] - step2[10], bd); - step1[11] = WRAPLOW(step2[8] - step2[11], bd); - step1[12] = WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = WRAPLOW(step2[13] + step2[14], bd); - step1[15] = WRAPLOW(step2[12] + step2[15], bd); - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], bd); - step2[1] = WRAPLOW(step1[1] + step1[6], bd); - step2[2] = WRAPLOW(step1[2] + step1[5], bd); - step2[3] = WRAPLOW(step1[3] + step1[4], bd); - step2[4] = WRAPLOW(step1[3] - step1[4], bd); - step2[5] = WRAPLOW(step1[2] - step1[5], bd); - step2[6] = WRAPLOW(step1[1] - step1[6], bd); - step2[7] = WRAPLOW(step1[0] - step1[7], bd); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[14] = step1[14]; - step2[15] = step1[15]; - - // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15], bd); - output[1] = WRAPLOW(step2[1] + step2[14], bd); - output[2] = WRAPLOW(step2[2] + step2[13], bd); - output[3] = WRAPLOW(step2[3] + step2[12], bd); - output[4] = WRAPLOW(step2[4] + step2[11], bd); - output[5] = WRAPLOW(step2[5] + step2[10], bd); - output[6] = WRAPLOW(step2[6] + step2[9], bd); - output[7] = WRAPLOW(step2[7] + step2[8], bd); - output[8] = WRAPLOW(step2[7] - step2[8], bd); - output[9] = WRAPLOW(step2[6] - step2[9], bd); - output[10] = WRAPLOW(step2[5] - step2[10], bd); - output[11] = WRAPLOW(step2[4] - step2[11], bd); - output[12] = WRAPLOW(step2[3] - step2[12], bd); - output[13] = WRAPLOW(step2[2] - step2[13], bd); - output[14] = WRAPLOW(step2[1] - step2[14], bd); - output[15] = WRAPLOW(step2[0] - step2[15], bd); -} - -void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - for (i = 0; i < 16; ++i) { - vp9_highbd_idct16(input, outptr, bd); - input += 16; - outptr += 16; - } - - // Then transform columns. - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vp9_highbd_idct16(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, - int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_low_t x0 = input[15]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[13]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[11]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[9]; - tran_low_t x7 = input[6]; - tran_low_t x8 = input[7]; - tran_low_t x9 = input[8]; - tran_low_t x10 = input[5]; - tran_low_t x11 = input[10]; - tran_low_t x12 = input[3]; - tran_low_t x13 = input[12]; - tran_low_t x14 = input[1]; - tran_low_t x15 = input[14]; - (void) bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - memset(output, 0, 16 * sizeof(*output)); - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd); - x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd); - x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd); - x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd); - x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd); - x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd); - x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd); - x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = WRAPLOW(s0 + s4, bd); - x1 = WRAPLOW(s1 + s5, bd); - x2 = WRAPLOW(s2 + s6, bd); - x3 = WRAPLOW(s3 + s7, bd); - x4 = WRAPLOW(s0 - s4, bd); - x5 = WRAPLOW(s1 - s5, bd); - x6 = WRAPLOW(s2 - s6, bd); - x7 = WRAPLOW(s3 - s7, bd); - x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd); - x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd); - x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd); - x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd); - x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd); - x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = WRAPLOW(s0 + s2, bd); - x1 = WRAPLOW(s1 + s3, bd); - x2 = WRAPLOW(s0 - s2, bd); - x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); - x8 = WRAPLOW(s8 + s10, bd); - x9 = WRAPLOW(s9 + s11, bd); - x10 = WRAPLOW(s8 - s10, bd); - x11 = WRAPLOW(s9 - s11, bd); - x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd); - x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd); - - // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); - x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd); - x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd); - - output[0] = WRAPLOW(x0, bd); - output[1] = WRAPLOW(-x8, bd); - output[2] = WRAPLOW(x12, bd); - output[3] = WRAPLOW(-x4, bd); - output[4] = WRAPLOW(x6, bd); - output[5] = WRAPLOW(x14, bd); - output[6] = WRAPLOW(x10, bd); - output[7] = WRAPLOW(x2, bd); - output[8] = WRAPLOW(x3, bd); - output[9] = WRAPLOW(x11, bd); - output[10] = WRAPLOW(x15, bd); - output[11] = WRAPLOW(x7, bd); - output[12] = WRAPLOW(x5, bd); - output[13] = WRAPLOW(-x13, bd); - output[14] = WRAPLOW(x9, bd); - output[15] = WRAPLOW(-x1, bd); -} - static const highbd_transform_2d HIGH_IHT_16[] = { - { vp9_highbd_idct16, vp9_highbd_idct16 }, // DCT_DCT = 0 - { highbd_iadst16, vp9_highbd_idct16 }, // ADST_DCT = 1 - { vp9_highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2 - { highbd_iadst16, highbd_iadst16 } // ADST_ADST = 3 + { vp9_highbd_idct16_c, vp9_highbd_idct16_c }, // DCT_DCT = 0 + { highbd_iadst16_c, vp9_highbd_idct16_c }, // ADST_DCT = 1 + { vp9_highbd_idct16_c, highbd_iadst16_c }, // DCT_ADST = 2 + { highbd_iadst16_c, highbd_iadst16_c } // ADST_ADST = 3 }; void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, @@ -2277,504 +311,6 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. - for (i = 0; i < 4; ++i) { - vp9_highbd_idct16(input, outptr, bd); - input += 16; - outptr += 16; - } - - // Then transform columns. - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - vp9_highbd_idct16(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); - a1 = ROUND_POWER_OF_TWO(out, 6); - for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} - -static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[32], step2[32]; - tran_high_t temp1, temp2; - (void) bd; - - // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; - - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - step2[16] = WRAPLOW(step1[16] + step1[17], bd); - step2[17] = WRAPLOW(step1[16] - step1[17], bd); - step2[18] = WRAPLOW(-step1[18] + step1[19], bd); - step2[19] = WRAPLOW(step1[18] + step1[19], bd); - step2[20] = WRAPLOW(step1[20] + step1[21], bd); - step2[21] = WRAPLOW(step1[20] - step1[21], bd); - step2[22] = WRAPLOW(-step1[22] + step1[23], bd); - step2[23] = WRAPLOW(step1[22] + step1[23], bd); - step2[24] = WRAPLOW(step1[24] + step1[25], bd); - step2[25] = WRAPLOW(step1[24] - step1[25], bd); - step2[26] = WRAPLOW(-step1[26] + step1[27], bd); - step2[27] = WRAPLOW(step1[26] + step1[27], bd); - step2[28] = WRAPLOW(step1[28] + step1[29], bd); - step2[29] = WRAPLOW(step1[28] - step1[29], bd); - step2[30] = WRAPLOW(-step1[30] + step1[31], bd); - step2[31] = WRAPLOW(step1[30] + step1[31], bd); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - step1[8] = WRAPLOW(step2[8] + step2[9], bd); - step1[9] = WRAPLOW(step2[8] - step2[9], bd); - step1[10] = WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = WRAPLOW(step2[10] + step2[11], bd); - step1[12] = WRAPLOW(step2[12] + step2[13], bd); - step1[13] = WRAPLOW(step2[12] - step2[13], bd); - step1[14] = WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = WRAPLOW(step2[14] + step2[15], bd); - - step1[16] = step2[16]; - step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step1[19] = step2[19]; - step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[27] = step2[27]; - step1[28] = step2[28]; - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[4] = WRAPLOW(step1[4] + step1[5], bd); - step2[5] = WRAPLOW(step1[4] - step1[5], bd); - step2[6] = WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = WRAPLOW(step1[6] + step1[7], bd); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[11] = step1[11]; - step2[12] = step1[12]; - - step2[16] = WRAPLOW(step1[16] + step1[19], bd); - step2[17] = WRAPLOW(step1[17] + step1[18], bd); - step2[18] = WRAPLOW(step1[17] - step1[18], bd); - step2[19] = WRAPLOW(step1[16] - step1[19], bd); - step2[20] = WRAPLOW(-step1[20] + step1[23], bd); - step2[21] = WRAPLOW(-step1[21] + step1[22], bd); - step2[22] = WRAPLOW(step1[21] + step1[22], bd); - step2[23] = WRAPLOW(step1[20] + step1[23], bd); - - step2[24] = WRAPLOW(step1[24] + step1[27], bd); - step2[25] = WRAPLOW(step1[25] + step1[26], bd); - step2[26] = WRAPLOW(step1[25] - step1[26], bd); - step2[27] = WRAPLOW(step1[24] - step1[27], bd); - step2[28] = WRAPLOW(-step1[28] + step1[31], bd); - step2[29] = WRAPLOW(-step1[29] + step1[30], bd); - step2[30] = WRAPLOW(step1[29] + step1[30], bd); - step2[31] = WRAPLOW(step1[28] + step1[31], bd); - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], bd); - step1[1] = WRAPLOW(step2[1] + step2[2], bd); - step1[2] = WRAPLOW(step2[1] - step2[2], bd); - step1[3] = WRAPLOW(step2[0] - step2[3], bd); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11], bd); - step1[9] = WRAPLOW(step2[9] + step2[10], bd); - step1[10] = WRAPLOW(step2[9] - step2[10], bd); - step1[11] = WRAPLOW(step2[8] - step2[11], bd); - step1[12] = WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = WRAPLOW(step2[13] + step2[14], bd); - step1[15] = WRAPLOW(step2[12] + step2[15], bd); - - step1[16] = step2[16]; - step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step1[22] = step2[22]; - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[25] = step2[25]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], bd); - step2[1] = WRAPLOW(step1[1] + step1[6], bd); - step2[2] = WRAPLOW(step1[2] + step1[5], bd); - step2[3] = WRAPLOW(step1[3] + step1[4], bd); - step2[4] = WRAPLOW(step1[3] - step1[4], bd); - step2[5] = WRAPLOW(step1[2] - step1[5], bd); - step2[6] = WRAPLOW(step1[1] - step1[6], bd); - step2[7] = WRAPLOW(step1[0] - step1[7], bd); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[14] = step1[14]; - step2[15] = step1[15]; - - step2[16] = WRAPLOW(step1[16] + step1[23], bd); - step2[17] = WRAPLOW(step1[17] + step1[22], bd); - step2[18] = WRAPLOW(step1[18] + step1[21], bd); - step2[19] = WRAPLOW(step1[19] + step1[20], bd); - step2[20] = WRAPLOW(step1[19] - step1[20], bd); - step2[21] = WRAPLOW(step1[18] - step1[21], bd); - step2[22] = WRAPLOW(step1[17] - step1[22], bd); - step2[23] = WRAPLOW(step1[16] - step1[23], bd); - - step2[24] = WRAPLOW(-step1[24] + step1[31], bd); - step2[25] = WRAPLOW(-step1[25] + step1[30], bd); - step2[26] = WRAPLOW(-step1[26] + step1[29], bd); - step2[27] = WRAPLOW(-step1[27] + step1[28], bd); - step2[28] = WRAPLOW(step1[27] + step1[28], bd); - step2[29] = WRAPLOW(step1[26] + step1[29], bd); - step2[30] = WRAPLOW(step1[25] + step1[30], bd); - step2[31] = WRAPLOW(step1[24] + step1[31], bd); - - // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15], bd); - step1[1] = WRAPLOW(step2[1] + step2[14], bd); - step1[2] = WRAPLOW(step2[2] + step2[13], bd); - step1[3] = WRAPLOW(step2[3] + step2[12], bd); - step1[4] = WRAPLOW(step2[4] + step2[11], bd); - step1[5] = WRAPLOW(step2[5] + step2[10], bd); - step1[6] = WRAPLOW(step2[6] + step2[9], bd); - step1[7] = WRAPLOW(step2[7] + step2[8], bd); - step1[8] = WRAPLOW(step2[7] - step2[8], bd); - step1[9] = WRAPLOW(step2[6] - step2[9], bd); - step1[10] = WRAPLOW(step2[5] - step2[10], bd); - step1[11] = WRAPLOW(step2[4] - step2[11], bd); - step1[12] = WRAPLOW(step2[3] - step2[12], bd); - step1[13] = WRAPLOW(step2[2] - step2[13], bd); - step1[14] = WRAPLOW(step2[1] - step2[14], bd); - step1[15] = WRAPLOW(step2[0] - step2[15], bd); - - step1[16] = step2[16]; - step1[17] = step2[17]; - step1[18] = step2[18]; - step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step1[28] = step2[28]; - step1[29] = step2[29]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // final stage - output[0] = WRAPLOW(step1[0] + step1[31], bd); - output[1] = WRAPLOW(step1[1] + step1[30], bd); - output[2] = WRAPLOW(step1[2] + step1[29], bd); - output[3] = WRAPLOW(step1[3] + step1[28], bd); - output[4] = WRAPLOW(step1[4] + step1[27], bd); - output[5] = WRAPLOW(step1[5] + step1[26], bd); - output[6] = WRAPLOW(step1[6] + step1[25], bd); - output[7] = WRAPLOW(step1[7] + step1[24], bd); - output[8] = WRAPLOW(step1[8] + step1[23], bd); - output[9] = WRAPLOW(step1[9] + step1[22], bd); - output[10] = WRAPLOW(step1[10] + step1[21], bd); - output[11] = WRAPLOW(step1[11] + step1[20], bd); - output[12] = WRAPLOW(step1[12] + step1[19], bd); - output[13] = WRAPLOW(step1[13] + step1[18], bd); - output[14] = WRAPLOW(step1[14] + step1[17], bd); - output[15] = WRAPLOW(step1[15] + step1[16], bd); - output[16] = WRAPLOW(step1[15] - step1[16], bd); - output[17] = WRAPLOW(step1[14] - step1[17], bd); - output[18] = WRAPLOW(step1[13] - step1[18], bd); - output[19] = WRAPLOW(step1[12] - step1[19], bd); - output[20] = WRAPLOW(step1[11] - step1[20], bd); - output[21] = WRAPLOW(step1[10] - step1[21], bd); - output[22] = WRAPLOW(step1[9] - step1[22], bd); - output[23] = WRAPLOW(step1[8] - step1[23], bd); - output[24] = WRAPLOW(step1[7] - step1[24], bd); - output[25] = WRAPLOW(step1[6] - step1[25], bd); - output[26] = WRAPLOW(step1[5] - step1[26], bd); - output[27] = WRAPLOW(step1[4] - step1[27], bd); - output[28] = WRAPLOW(step1[3] - step1[28], bd); - output[29] = WRAPLOW(step1[2] - step1[29], bd); - output[30] = WRAPLOW(step1[1] - step1[30], bd); - output[31] = WRAPLOW(step1[0] - step1[31], bd); -} - -void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[32 * 32]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - for (i = 0; i < 32; ++i) { - tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) - zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) - highbd_idct32(input, outptr, bd); - else - memset(outptr, 0, sizeof(tran_low_t) * 32); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - highbd_idct32(temp_in, temp_out, bd); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void vp9_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[32 * 32] = {0}; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - // Only upper-left 8x8 has non-zero coeff. - for (i = 0; i < 8; ++i) { - highbd_idct32(input, outptr, bd); - input += 32; - outptr += 32; - } - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - highbd_idct32(temp_in, temp_out, bd); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - int a1; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); - a1 = ROUND_POWER_OF_TWO(out, 6); - - for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} - // idct void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd) { diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 1a4ea2f04..7a7dc1d64 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -14,63 +14,16 @@ #include #include "./vpx_config.h" -#include "vpx_dsp/txfm_common.h" -#if CONFIG_VP9_HIGHBITDEPTH -#include "vpx_dsp/vpx_dsp_common.h" -#endif // CONFIG_VP9_HIGHBITDEPTH -#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_ports/mem.h" #ifdef __cplusplus extern "C" { #endif -static INLINE tran_low_t check_range(tran_high_t input) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - // For valid VP9 input streams, intermediate stage coefficients should always - // stay within the range of a signed 16 bit integer. Coefficients can go out - // of this range for invalid/corrupt VP9 streams. However, strictly checking - // this range for every intermediate coefficient can burdensome for a decoder, - // therefore the following assertion is only enabled when configured with - // --enable-coefficient-range-checking. - assert(INT16_MIN <= input); - assert(input <= INT16_MAX); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return (tran_low_t)input; -} - -static INLINE tran_low_t dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return check_range(rv); -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE tran_low_t highbd_check_range(tran_high_t input, - int bd) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - // For valid highbitdepth VP9 streams, intermediate stage coefficients will - // stay within the ranges: - // - 8 bit: signed 16 bit integer - // - 10 bit: signed 18 bit integer - // - 12 bit: signed 20 bit integer - const int32_t int_max = (1 << (7 + bd)) - 1; - const int32_t int_min = -int_max - 1; - assert(int_min <= input); - assert(input <= int_max); - (void) int_min; -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - (void) bd; - return (tran_low_t)input; -} - -static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input, - int bd) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return highbd_check_range(rv, bd); -} -#endif // CONFIG_VP9_HIGHBITDEPTH - typedef void (*transform_1d)(const tran_low_t*, tran_low_t*); typedef struct { @@ -85,28 +38,6 @@ typedef struct { } highbd_transform_2d; #endif // CONFIG_VP9_HIGHBITDEPTH -#if CONFIG_EMULATE_HARDWARE -// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a -// non-normative method to handle overflows. A stream that causes -// overflows in the inverse transform is considered invalid in VP9, -// and a hardware implementer is free to choose any reasonable -// method to handle overflows. However to aid in hardware -// verification they can use a specific implementation of the -// WRAPLOW() macro below that is identical to their intended -// hardware implementation (and also use configure options to trigger -// the C-implementation of the transform). -// -// The particular WRAPLOW implementation below performs strict -// overflow wrapping to match common hardware implementations. -// bd of 8 uses trans_low with 16bits, need to remove 16bits -// bd of 10 uses trans_low with 18bits, need to remove 14bits -// bd of 12 uses trans_low with 20bits, need to remove 12bits -// bd of x uses trans_low with 8+x bits, need to remove 24-x bits -#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd)) -#else -#define WRAPLOW(x, bd) ((int32_t)(x)) -#endif // CONFIG_EMULATE_HARDWARE - void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, int eob); void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, @@ -126,9 +57,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd); -void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd); -void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd); void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd); void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, @@ -145,11 +73,6 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd); void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd); -static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, - int bd) { - trans = WRAPLOW(trans, bd); - return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd); -} #endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 883733f55..c1731012f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -87,39 +87,6 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. - add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_1_add/; - - add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_16_add/; - - add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_1_add/; - - add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_64_add/; - - add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_12_add/; - - add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_1_add/; - - add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_256_add/; - - add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_10_add/; - - add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1024_add/; - - add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_34_add/; - - add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1_add/; - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp9_iht4x4_16_add/; @@ -128,51 +95,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; specialize qw/vp9_iht16x16_256_add/; - - # dct and add - - add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_1_add/; - - add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_16_add/; - } else { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_1_add/; - - add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_16_add/; - - add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_1_add/; - - add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_64_add/; - - add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_12_add/; - - add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_1_add/; - - add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_256_add/; - - add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_10_add/; - - add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1024_add/; - - add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_34_add/; - - add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1_add/; - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp9_iht4x4_16_add/; @@ -181,50 +106,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; specialize qw/vp9_iht16x16_256_add/; - - # dct and add - - add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_1_add/; - - add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_16_add/; } else { - add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_1_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_16_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_1_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc"; - - add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc"; - - add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_256_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_10_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2 msa/; - #is this a typo? - $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon; - - add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/; - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/; @@ -233,14 +115,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/; - - # dct and add - - add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_1_add msa/; - - add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc"; } } @@ -295,24 +169,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. - add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct4x4_1_add/; - - add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct8x8_1_add/; - - add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct16x16_1_add/; - - add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct32x32_1024_add/; - - add_proto qw/void vp9_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct32x32_34_add/; - - add_proto qw/void vp9_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct32x32_1_add/; - add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; specialize qw/vp9_highbd_iht4x4_16_add/; @@ -321,50 +177,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd"; specialize qw/vp9_highbd_iht16x16_256_add/; - - # dct and add - - add_proto qw/void vp9_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_iwht4x4_1_add/; - - add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_iwht4x4_16_add/; - - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - - add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct4x4_16_add/; - - add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct8x8_64_add/; - - add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct8x8_10_add/; - - add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct16x16_256_add/; - - add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct16x16_10_add/; - - } else { - - add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct4x4_16_add sse2/; - - add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct8x8_64_add sse2/; - - add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct8x8_10_add sse2/; - - add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct16x16_256_add sse2/; - - add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vp9_highbd_idct16x16_10_add sse2/; - } } # diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 086cadf4e..4a1634577 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -8,260 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "./vp9_rtcd.h" -#include "vp9/common/x86/vp9_idct_intrin_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" -#define RECON_AND_STORE4X4(dest, in_x) \ -{ \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ -} - -void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = _mm_load_si128((const __m128i *)input); - input2 = _mm_load_si128((const __m128i *)(input + 8)); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = dct_const_round_shift(input[0] * cospi_16_64); - a = dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); -} - -static void idct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - - transpose_4x4(in); - // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); - - // stage 2 - in[0] = _mm_add_epi16(u[0], u[1]); - in[1] = _mm_sub_epi16(u[0], u[1]); - in[1] = _mm_shuffle_epi32(in[1], 0x4E); -} - -static void iadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; - - transpose_4x4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); -} - void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[2]; @@ -327,537 +77,6 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, } } -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ - out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -// Define Macro for multiplying elements by constants and adding them together. -#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ - cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_4, \ - stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_0, \ - stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ - } - -void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. - in0 = _mm_load_si128((const __m128i *)input); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vp9_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = dct_const_round_shift(input[0] * cospi_16_64); - a = dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); -} - -static void idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // 8x8 Transpose is copied from vp9_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], - in0, in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); -} - -static void iadst8_sse2(__m128i *in) { - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // transpose - array_transpose_8x8(in, in); - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); -} - void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; @@ -925,1366 +144,6 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest + 7 * stride, in[7]); } -void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. - in0 = _mm_load_si128((const __m128i *)input); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, - in0, in1, in2, in3, in4, in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ - stg2_0, stg2_1, stg2_2, stg2_3, \ - stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ - stg2_4, stg2_5, stg2_6, stg2_7, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ - stg3_0, stg3_1, stg3_2, stg3_3, \ - stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ - stg4_0, stg4_1, stg4_2, stg4_3, \ - stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ - stg2_0, stg2_1, stg2_6, stg2_7, \ - stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ - stg3_0, stg3_1, \ - stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ - stg4_0, stg4_1, \ - stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } - -void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - curr1 = l; - for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = _mm_load_si128((const __m128i *)input); - in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); - in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); - in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); - in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); - in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); - in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); - in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); - in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); - in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; - input += 128; - } - for (i = 0; i < 2; i++) { - int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = dct_const_round_shift(input[0] * cospi_16_64); - a = dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (i = 0; i < 2; ++i) { - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); - RECON_AND_STORE(dest + 8 * stride, dc_value); - RECON_AND_STORE(dest + 9 * stride, dc_value); - RECON_AND_STORE(dest + 10 * stride, dc_value); - RECON_AND_STORE(dest + 11 * stride, dc_value); - RECON_AND_STORE(dest + 12 * stride, dc_value); - RECON_AND_STORE(dest + 13 * stride, dc_value); - RECON_AND_STORE(dest + 14 * stride, dc_value); - RECON_AND_STORE(dest + 15 * stride, dc_value); - dest += 8; - } -} - -static void iadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = _mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; - - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; - - // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); - - // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); -} - -static void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); -} - -static void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); -} - void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in0[16], in1[16]; @@ -2319,1905 +178,3 @@ void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, dest += 8; write_buffer_8x16(dest, in1, stride); } - -void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. - in[0] = _mm_load_si128((const __m128i *)input); - in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. - l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = _mm_load_si128((const __m128i *) input); \ - input += 8; \ - } \ - -#define IDCT32_34 \ -/* Stage1 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ - stg1_1, stp1_16, stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ - stg1_7, stp1_19, stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ - stg1_9, stp1_20, stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ - stg1_15, stp1_23, stp1_24); \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ - stg2_1, stp2_8, stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ - stg2_7, stp2_11, stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ - stg3_1, stp1_4, stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ - stg4_1, stp2_0, stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} - - -#define IDCT32 \ -/* Stage1 */ \ -{ \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ - stp1_17, stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ - stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ - stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ - stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ - stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} - -// Only upper-left 8x8 has non-zero coeff -void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1<<5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. Only need to load the top left 8x8 block. - in[0] = _mm_load_si128((const __m128i *)input); - in[1] = _mm_load_si128((const __m128i *)(input + 32)); - in[2] = _mm_load_si128((const __m128i *)(input + 64)); - in[3] = _mm_load_si128((const __m128i *)(input + 96)); - in[4] = _mm_load_si128((const __m128i *)(input + 128)); - in[5] = _mm_load_si128((const __m128i *)(input + 160)); - in[6] = _mm_load_si128((const __m128i *)(input + 192)); - in[7] = _mm_load_si128((const __m128i *)(input + 224)); - - for (i = 8; i < 32; ++i) { - in[i] = _mm_setzero_si128(); - } - - array_transpose_8x8(in, in); - // TODO(hkuang): Following transposes are unnecessary. But remove them will - // lead to performance drop on some devices. - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { - int j; - const __m128i zero = _mm_setzero_si128(); - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; - - for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. - LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - IDCT32 - - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = dct_const_round_shift(input[0] * cospi_16_64); - a = dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (i = 0; i < 4; ++i) { - int j; - for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + j * stride, dc_value); - } - dest += 8; - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vp9_highbd_idct4(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; - vp9_highbd_idct4(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vp9_highbd_idct8(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vp9_highbd_idct8(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vp9_highbd_idct8(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vp9_highbd_idct8(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vp9_highbd_idct16(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vp9_highbd_idct16(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vp9_highbd_idct16(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vp9_highbd_idct16(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.h b/vp9/common/x86/vp9_idct_intrin_sse2.h deleted file mode 100644 index 984363d40..000000000 --- a/vp9/common/x86/vp9_idct_intrin_sse2.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include // SSE2 -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_idct.h" - -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); - - in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); - in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); - in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); - in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); - in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); - in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); - in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); - in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); -} - -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); -} diff --git a/vp9/common/x86/vp9_idct_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm deleted file mode 100644 index 69b68e6d8..000000000 --- a/vp9/common/x86/vp9_idct_sse2.asm +++ /dev/null @@ -1,102 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro REORDER_INPUTS 0 - ; a c d b to a b c d - SWAP 1, 3, 2 -%endmacro - -%macro TRANSFORM_COLS 0 - ; input: - ; m0 a - ; m1 b - ; m2 c - ; m3 d - paddw m0, m2 - psubw m3, m1 - - ; wide subtract - punpcklwd m4, m0 - punpcklwd m5, m3 - psrad m4, 16 - psrad m5, 16 - psubd m4, m5 - psrad m4, 1 - packssdw m4, m4 ; e - - psubw m5, m4, m1 ; b - psubw m4, m2 ; c - psubw m0, m5 - paddw m3, m4 - ; m0 a - SWAP 1, 5 ; m1 b - SWAP 2, 4 ; m2 c - ; m3 d -%endmacro - -%macro TRANSPOSE_4X4 0 - punpcklwd m0, m2 - punpcklwd m1, m3 - mova m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 -%macro TRANSPOSE_4X4_WIDE 0 - mova m3, m0 - punpcklwd m0, m1 - punpckhwd m3, m1 - mova m2, m0 - punpcklwd m0, m3 - punpckhwd m2, m3 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero - movq m%3, [outputq] - movq m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%1, m%3 - paddw m%2, m%4 - packuswb m%1, m%5 - packuswb m%2, m%5 - movd [outputq], m%1 - movd [outputq + strideq], m%2 -%endmacro - -INIT_XMM sse2 -cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride - mova m0, [inputq + 0] - mova m1, [inputq + 16] - - psraw m0, 2 - psraw m1, 2 - - TRANSPOSE_4X4_WIDE - REORDER_INPUTS - TRANSFORM_COLS - TRANSPOSE_4X4 - REORDER_INPUTS - TRANSFORM_COLS - - pxor m4, m4 - ADD_STORE_4P_2X 0, 1, 5, 6, 4 - lea outputq, [outputq + 2 * strideq] - ADD_STORE_4P_2X 2, 3, 5, 6, 4 - - RET diff --git a/vp9/common/x86/vp9_idct_ssse3_x86_64.asm b/vp9/common/x86/vp9_idct_ssse3_x86_64.asm deleted file mode 100644 index 2c1060710..000000000 --- a/vp9/common/x86/vp9_idct_ssse3_x86_64.asm +++ /dev/null @@ -1,300 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the inverse transformation. Part -; of the functions are originally derived from the ffmpeg project. -; Note that the current version applies to x86 64-bit only. - -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 -pd_8192: times 4 dd 8192 -pw_16: times 8 dw 16 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 -%endmacro - -TRANSFORM_COEFFS 6270, 15137 -TRANSFORM_COEFFS 3196, 16069 -TRANSFORM_COEFFS 13623, 9102 - -%macro PAIR_PP_COEFFS 2 -dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MP_COEFFS 2 -dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MM_COEFFS 2 -dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 -%endmacro - -PAIR_PP_COEFFS 30274, 12540 -PAIR_PP_COEFFS 6392, 32138 -PAIR_MP_COEFFS 18204, 27246 - -PAIR_PP_COEFFS 12540, 12540 -PAIR_PP_COEFFS 30274, 30274 -PAIR_PP_COEFFS 6392, 6392 -PAIR_PP_COEFFS 32138, 32138 -PAIR_MM_COEFFS 18204, 18204 -PAIR_PP_COEFFS 27246, 27246 - -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro IDCT8_1D 0 - SUM_SUB 0, 4, 9 - BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 - pmulhrsw m0, m12 - pmulhrsw m4, m12 - BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 - - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - SUM_SUB 0, 6, 9 - SUM_SUB 4, 2, 9 - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 4, 3, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 4 -%endmacro - -; This macro handles 8 pixels per line -%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero - paddw m%1, m11 - paddw m%2, m11 - psraw m%1, 5 - psraw m%2, 5 - - movh m%3, [outputq] - movh m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%3, m%1 - paddw m%4, m%2 - packuswb m%3, m%5 - packuswb m%4, m%5 - movh [outputq], m%3 - movh [outputq + strideq], m%4 -%endmacro - -INIT_XMM ssse3 -; full inverse 8x8 2D-DCT transform -cglobal idct8x8_64_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] - - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] - mova m4, [inputq + 64] - mova m5, [inputq + 80] - mova m6, [inputq + 96] - mova m7, [inputq + 112] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero -cglobal idct8x8_12_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] - - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] - - punpcklwd m0, m1 - punpcklwd m2, m3 - punpckhdq m9, m0, m2 - punpckldq m0, m2 - SWAP 2, 9 - - ; m0 -> [0], [0] - ; m1 -> [1], [1] - ; m2 -> [2], [2] - ; m3 -> [3], [3] - punpckhqdq m10, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m9, m2, m2 - punpcklqdq m2, m2 - SWAP 1, 10 - SWAP 3, 9 - - pmulhrsw m0, m12 - pmulhrsw m2, [dpw_30274_12540] - pmulhrsw m1, [dpw_6392_32138] - pmulhrsw m3, [dpw_m18204_27246] - - SUM_SUB 0, 2, 9 - SUM_SUB 1, 3, 9 - - punpcklqdq m9, m3, m3 - punpckhqdq m5, m3, m9 - - SUM_SUB 3, 5, 9 - punpckhqdq m5, m3 - pmulhrsw m5, m12 - - punpckhqdq m9, m1, m5 - punpcklqdq m1, m5 - SWAP 5, 9 - - SUM_SUB 0, 5, 9 - SUM_SUB 2, 1, 9 - - punpckhqdq m3, m0, m0 - punpckhqdq m4, m1, m1 - punpckhqdq m6, m5, m5 - punpckhqdq m7, m2, m2 - - punpcklwd m0, m3 - punpcklwd m7, m2 - punpcklwd m1, m4 - punpcklwd m6, m5 - - punpckhdq m4, m0, m7 - punpckldq m0, m7 - punpckhdq m10, m1, m6 - punpckldq m5, m1, m6 - - punpckhqdq m1, m0, m5 - punpcklqdq m0, m5 - punpckhqdq m3, m4, m10 - punpcklqdq m2, m4, m10 - - - pmulhrsw m0, m12 - pmulhrsw m6, m2, [dpw_30274_30274] - pmulhrsw m4, m2, [dpw_12540_12540] - - pmulhrsw m7, m1, [dpw_32138_32138] - pmulhrsw m1, [dpw_6392_6392] - pmulhrsw m5, m3, [dpw_m18204_m18204] - pmulhrsw m3, [dpw_27246_27246] - - mova m2, m0 - SUM_SUB 0, 6, 9 - SUM_SUB 2, 4, 9 - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 2, 3, 9 - SUM_SUB 4, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 2 - SWAP 2, 4 - - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -%endif diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c index 0e7d2a140..b09eac0d1 100644 --- a/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/vp9/encoder/x86/vp9_dct_ssse3.c @@ -17,7 +17,7 @@ #include // SSSE3 #include "./vp9_rtcd.h" -#include "vp9/common/x86/vp9_idct_intrin_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index cc201bccd..159eea5f2 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -66,7 +66,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm @@ -96,13 +95,6 @@ VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c endif VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h - -ifeq ($(ARCH_X86_64), yes) -ifeq ($(CONFIG_USE_X86INC),yes) -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm -endif -endif VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM) @@ -111,30 +103,4 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c endif -# neon with assembly and intrinsics implementations. If both are available -# prefer assembly. -ifeq ($(HAVE_NEON_ASM), yes) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM) -else -ifeq ($(HAVE_NEON), yes) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c -endif # HAVE_NEON -endif # HAVE_NEON_ASM - $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/vpx_dsp/arm/idct16x16_1_add_neon.asm b/vpx_dsp/arm/idct16x16_1_add_neon.asm new file mode 100644 index 000000000..b1fd21bb6 --- /dev/null +++ b/vpx_dsp/arm/idct16x16_1_add_neon.asm @@ -0,0 +1,198 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp9_idct16x16_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_idct16x16_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asr r0, r0, #6 ; >> 6 + + vdup.s16 q0, r0 ; duplicate a1 + mov r0, #8 + sub r2, #8 + + ; load destination data row0 - row3 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row4 - row7 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row8 - row11 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row12 - row15 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |vp9_idct16x16_1_add_neon| + + END diff --git a/vpx_dsp/arm/idct16x16_1_add_neon.c b/vpx_dsp/arm/idct16x16_1_add_neon.c new file mode 100644 index 000000000..aa035e770 --- /dev/null +++ b/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx_dsp/inv_txfm.h" +#include "vpx_ports/mem.h" + +void vp9_idct16x16_1_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, j, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + for (d1 = d2 = dest, i = 0; i < 4; i++) { + for (j = 0; j < 2; j++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; + } + } + return; +} diff --git a/vpx_dsp/arm/idct16x16_add_neon.asm b/vpx_dsp/arm/idct16x16_add_neon.asm new file mode 100644 index 000000000..a13c0d04b --- /dev/null +++ b/vpx_dsp/arm/idct16x16_add_neon.asm @@ -0,0 +1,1179 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_idct16x16_256_add_neon_pass1| + EXPORT |vp9_idct16x16_256_add_neon_pass2| + EXPORT |vp9_idct16x16_10_add_neon_pass1| + EXPORT |vp9_idct16x16_10_add_neon_pass2| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input, +; int16_t *output, int output_stride) +; +; r0 int16_t input +; r1 int16_t *output +; r2 int output_stride) + +; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_idct16x16_256_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]! + vmov.s16 q15, q1 + + ; generate cospi_28_64 = 3196 + mov r3, #0xc00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r12, #0x3e00 + add r12, #0xc5 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r12 ; duplicate cospi_4_64 + + ; preloading to avoid stall + ; generate cospi_12_64 = 13623 + mov r3, #0x3500 + add r3, #0x37 + + ; generate cospi_20_64 = 9102 + mov r12, #0x2300 + add r12, #0x8e + + ; step2[4] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; step2[4] * cospi_4_64 + vmull.s16 q5, d18, d1 + vmull.s16 q6, d19, d1 + + ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 + vmlal.s16 q5, d30, d0 + vmlal.s16 q6, d31, d0 + + vdup.16 d2, r3 ; duplicate cospi_12_64 + vdup.16 d3, r12 ; duplicate cospi_20_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q5, #14 ; >> 14 + vqrshrn.s32 d15, q6, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; generate cospi_24_64 = 6270 + mov r12, #0x1800 + add r12, #0x7e + + ; step2[5] * cospi_12_64 + vmull.s16 q2, d26, d2 + vmull.s16 q3, d27, d2 + + ; step2[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q15, d27, d3 + + ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q2, d22, d3 + vmlsl.s16 q3, d23, d3 + + ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q15, d23, d2 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q2, #14 ; >> 14 + vqrshrn.s32 d11, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q15, #14 ; >> 14 + + ; stage 4 + vdup.16 d30, r3 ; cospi_16_64 + + ; step1[0] * cospi_16_64 + vmull.s16 q2, d16, d30 + vmull.s16 q11, d17, d30 + + ; step1[1] * cospi_16_64 + vmull.s16 q0, d24, d30 + vmull.s16 q1, d25, d30 + + ; generate cospi_8_64 = 15137 + mov r3, #0x3b00 + add r3, #0x21 + + vdup.16 d30, r12 ; duplicate cospi_24_64 + vdup.16 d31, r3 ; duplicate cospi_8_64 + + ; temp1 = (step1[0] + step1[1]) * cospi_16_64 + vadd.s32 q3, q2, q0 + vadd.s32 q12, q11, q1 + + ; temp2 = (step1[0] - step1[1]) * cospi_16_64 + vsub.s32 q13, q2, q0 + vsub.s32 q1, q11, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d16, q3, #14 ; >> 14 + vqrshrn.s32 d17, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d18, q13, #14 ; >> 14 + vqrshrn.s32 d19, q1, #14 ; >> 14 + + ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + ; step1[2] * cospi_8_64 + vmull.s16 q0, d20, d31 + vmull.s16 q1, d21, d31 + + ; step1[2] * cospi_24_64 + vmull.s16 q12, d20, d30 + vmull.s16 q13, d21, d30 + + ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q0, d28, d30 + vmlal.s16 q1, d29, d30 + + ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q12, d28, d31 + vmlsl.s16 q13, d29, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d22, q0, #14 ; >> 14 + vqrshrn.s32 d23, q1, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d20, q12, #14 ; >> 14 + vqrshrn.s32 d21, q13, #14 ; >> 14 + + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; + vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; + + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; stage 5 + vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; + vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; + vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; + vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; + + vdup.16 d16, r3; ; duplicate cospi_16_64 + + ; step2[5] * cospi_16_64 + vmull.s16 q11, d26, d16 + vmull.s16 q12, d27, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q6, q9, q11 + vsub.s32 q13, q10, q12 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q6, #14 ; >> 14 + vqrshrn.s32 d11, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d16}, [r1], r2 + vst1.64 {d17}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |vp9_idct16x16_256_add_neon_pass1| + +;void vp9_idct16x16_256_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_idct16x16_256_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! + vmov.s16 q15, q0; + + ; generate cospi_30_64 = 1606 + mov r3, #0x0600 + add r3, #0x46 + + ; generate cospi_2_64 = 16305 + mov r12, #0x3f00 + add r12, #0xb1 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d12, r3 ; duplicate cospi_30_64 + vdup.16 d13, r12 ; duplicate cospi_2_64 + + ; preloading to avoid stall + ; generate cospi_14_64 = 12665 + mov r3, #0x3100 + add r3, #0x79 + + ; generate cospi_18_64 = 10394 + mov r12, #0x2800 + add r12, #0x9a + + ; step1[8] * cospi_30_64 + vmull.s16 q2, d16, d12 + vmull.s16 q3, d17, d12 + + ; step1[8] * cospi_2_64 + vmull.s16 q1, d16, d13 + vmull.s16 q4, d17, d13 + + ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 + vmlsl.s16 q2, d30, d13 + vmlsl.s16 q3, d31, d13 + + ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 + vmlal.s16 q1, d30, d12 + vmlal.s16 q4, d31, d12 + + vdup.16 d30, r3 ; duplicate cospi_14_64 + vdup.16 d31, r12 ; duplicate cospi_18_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d0, q2, #14 ; >> 14 + vqrshrn.s32 d1, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q1, #14 ; >> 14 + vqrshrn.s32 d15, q4, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_22_64 = 7723 + mov r3, #0x1e00 + add r3, #0x2b + + ; generate cospi_10_64 = 14449 + mov r12, #0x3800 + add r12, #0x71 + + ; step1[9] * cospi_14_64 + vmull.s16 q2, d24, d30 + vmull.s16 q3, d25, d30 + + ; step1[9] * cospi_18_64 + vmull.s16 q4, d24, d31 + vmull.s16 q5, d25, d31 + + ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 + vmlsl.s16 q2, d22, d31 + vmlsl.s16 q3, d23, d31 + + ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 + vmlal.s16 q4, d22, d30 + vmlal.s16 q5, d23, d30 + + vdup.16 d30, r3 ; duplicate cospi_22_64 + vdup.16 d31, r12 ; duplicate cospi_10_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q2, #14 ; >> 14 + vqrshrn.s32 d3, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q4, #14 ; >> 14 + vqrshrn.s32 d13, q5, #14 ; >> 14 + + ; step1[10] * cospi_22_64 + vmull.s16 q11, d20, d30 + vmull.s16 q12, d21, d30 + + ; step1[10] * cospi_10_64 + vmull.s16 q4, d20, d31 + vmull.s16 q5, d21, d31 + + ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 + vmlsl.s16 q11, d26, d31 + vmlsl.s16 q12, d27, d31 + + ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 + vmlal.s16 q4, d26, d30 + vmlal.s16 q5, d27, d30 + + ; preloading to avoid stall + ; generate cospi_6_64 = 15679 + mov r3, #0x3d00 + add r3, #0x3f + + ; generate cospi_26_64 = 4756 + mov r12, #0x1200 + add r12, #0x94 + + vdup.16 d30, r3 ; duplicate cospi_6_64 + vdup.16 d31, r12 ; duplicate cospi_26_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d11, q5, #14 ; >> 14 + vqrshrn.s32 d10, q4, #14 ; >> 14 + + ; step1[11] * cospi_6_64 + vmull.s16 q10, d28, d30 + vmull.s16 q11, d29, d30 + + ; step1[11] * cospi_26_64 + vmull.s16 q12, d28, d31 + vmull.s16 q13, d29, d31 + + ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 + vmlsl.s16 q10, d18, d31 + vmlsl.s16 q11, d19, d31 + + ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 + vmlal.s16 q12, d18, d30 + vmlal.s16 q13, d19, d30 + + vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] + vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q11, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q12, #14 ; >> 14 + vqrshrn.s32 d9, q13, #14 ; >> 14 + + ; stage 3 + vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] + vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] + vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] + vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] + vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] + vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + + ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vdup.16 d30, r12 ; duplicate cospi_8_64 + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d18, d31 + vmull.s16 q3, d19, d31 + + ; step1[14] * cospi_24_64 + vmull.s16 q4, d28, d31 + vmull.s16 q5, d29, d31 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d28, d30 + vmlal.s16 q3, d29, d30 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q4, d18, d30 + vmlsl.s16 q5, d19, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q4, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + vmov.s16 q3, q11 + vmov.s16 q4, q12 + + ; - step1[13] * cospi_8_64 + vmull.s16 q11, d26, d30 + vmull.s16 q12, d27, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d20, d30 + vmull.s16 q9, d21, d30 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlsl.s16 q11, d20, d31 + vmlsl.s16 q12, d21, d31 + + ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d26, d31 + vmlal.s16 q9, d27, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q10, q3, q0 + vadd.s32 q4, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q10, #14 ; >> 14 + vqrshrn.s32 d11, q4, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + cmp r3, #0 ; check if need adding dest data + beq skip_adding_dest + + ldr r7, [sp, #28] ; dest used to save element 0-7 + mov r9, r7 ; save dest pointer for later use + ldr r8, [sp, #32] ; load dest_stride + + ; stage 7 + ; load the data in pass1 + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + + ; store the data output 8,9,10,11,12,13,14,15 + vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q8 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q9, q9, #6 + vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q9 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vrshr.s16 q2, q2, #6 + vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q2 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q3, q3, #6 + vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q3 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vrshr.s16 q4, q4, #6 + vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q4 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q5, q5, #6 + vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q5 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vrshr.s16 q14, q14, #6 + vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q14 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q15, q15, #6 + vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q15 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + b end_idct16x16_pass2 + +skip_adding_dest + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |vp9_idct16x16_256_add_neon_pass2| + +;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input, +; int16_t *output, int output_stride) +; +; r0 int16_t input +; r1 int16_t *output +; r2 int output_stride) + +; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_idct16x16_10_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]! + vmov.s16 q15, q1 + + ; generate cospi_28_64*2 = 6392 + mov r3, #0x1800 + add r3, #0xf8 + + ; generate cospi_4_64*2 = 32138 + mov r12, #0x7d00 + add r12, #0x8a + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q0, r3 ; duplicate cospi_28_64*2 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, + ; double, and return the high 16 bits, effectively giving >> 15. Doubling + ; the constant will change this to >> 14. + ; dct_const_round_shift(step2[4] * cospi_28_64); + vqrdmulh.s16 q4, q9, q0 + + ; preloading to avoid stall + ; generate cospi_16_64*2 = 23170 + mov r3, #0x5a00 + add r3, #0x82 + + ; dct_const_round_shift(step2[4] * cospi_4_64); + vqrdmulh.s16 q7, q9, q1 + + ; stage 4 + vdup.16 q1, r3 ; cospi_16_64*2 + + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + vdup.16 d4, r3; ; duplicate cospi_16_64 + + ; dct_const_round_shift(step1[0] * cospi_16_64) + vqrdmulh.s16 q8, q8, q1 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d14, d4 + vmull.s16 q10, d15, d4 + + ; step2[5] * cospi_16_64 + vmull.s16 q12, d9, d4 + vmull.s16 q11, d8, d4 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q15, q10, q12 + vsub.s32 q6, q9, q11 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d11, q15, #14 ; >> 14 + vqrshrn.s32 d10, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; + vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; + vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d4}, [r1], r2 + vst1.64 {d5}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |vp9_idct16x16_10_add_neon_pass1| + +;void vp9_idct16x16_10_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_idct16x16_10_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! + vmov.s16 q15, q0; + + ; generate 2*cospi_30_64 = 3212 + mov r3, #0xc00 + add r3, #0x8c + + ; generate 2*cospi_2_64 = 32610 + mov r12, #0x7f00 + add r12, #0x62 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q6, r3 ; duplicate 2*cospi_30_64 + + ; dct_const_round_shift(step1[8] * cospi_30_64) + vqrdmulh.s16 q0, q8, q6 + + vdup.16 q6, r12 ; duplicate 2*cospi_2_64 + + ; dct_const_round_shift(step1[8] * cospi_2_64) + vqrdmulh.s16 q7, q8, q6 + + ; preloading to avoid stall + ; generate 2*cospi_26_64 = 9512 + mov r12, #0x2500 + add r12, #0x28 + rsb r12, #0 + vdup.16 q15, r12 ; duplicate -2*cospi_26_64 + + ; generate 2*cospi_6_64 = 31358 + mov r3, #0x7a00 + add r3, #0x7e + vdup.16 q14, r3 ; duplicate 2*cospi_6_64 + + ; dct_const_round_shift(- step1[12] * cospi_26_64) + vqrdmulh.s16 q3, q9, q15 + + ; dct_const_round_shift(step1[12] * cospi_6_64) + vqrdmulh.s16 q4, q9, q14 + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + vdup.16 d30, r12 ; duplicate cospi_8_64 + + ; step1[14] * cospi_24_64 + vmull.s16 q12, d14, d31 + vmull.s16 q5, d15, d31 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d0, d31 + vmull.s16 q11, d1, d31 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q12, d0, d30 + vmlsl.s16 q5, d1, d30 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d14, d30 + vmlal.s16 q11, d15, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q12, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q11, #14 ; >> 14 + + ; - step1[13] * cospi_8_64 + vmull.s16 q10, d8, d30 + vmull.s16 q13, d9, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d6, d30 + vmull.s16 q9, d7, d30 + + ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 + vmlsl.s16 q10, d6, d31 + vmlsl.s16 q13, d7, d31 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d8, d31 + vmlal.s16 q9, d9, d31 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q10, #14 ; >> 14 + vqrshrn.s32 d5, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q0, q3, q0 + vadd.s32 q1, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q0, #14 ; >> 14 + vqrshrn.s32 d11, q1, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct10_16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |vp9_idct16x16_10_add_neon_pass2| + END diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c new file mode 100644 index 000000000..545388ae6 --- /dev/null +++ b/vpx_dsp/arm/idct16x16_add_neon.c @@ -0,0 +1,1317 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void TRANSPOSE8X8( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), + vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), + vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), + vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), + vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; +} + +void vp9_idct16x16_256_add_neon_pass1( + int16_t *in, + int16_t *out, + int output_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d18s16, d1s16); + q6s32 = vmull_s16(d19s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlal_s16(q5s32, d30s16, d0s16); + q6s32 = vmlal_s16(q6s32, d31s16, d0s16); + + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q5s32, 14); + d15s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + q2s32 = vmull_s16(d26s16, d2s16); + q3s32 = vmull_s16(d27s16, d2s16); + q9s32 = vmull_s16(d26s16, d3s16); + q15s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q15s32 = vmlal_s16(q15s32, d23s16, d2s16); + + d10s16 = vqrshrn_n_s32(q2s32, 14); + d11s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q15s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + d30s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d30s16); + q11s32 = vmull_s16(d17s16, d30s16); + q0s32 = vmull_s16(d24s16, d30s16); + q1s32 = vmull_s16(d25s16, d30s16); + + d30s16 = vdup_n_s16(cospi_24_64); + d31s16 = vdup_n_s16(cospi_8_64); + + q3s32 = vaddq_s32(q2s32, q0s32); + q12s32 = vaddq_s32(q11s32, q1s32); + q13s32 = vsubq_s32(q2s32, q0s32); + q1s32 = vsubq_s32(q11s32, q1s32); + + d16s16 = vqrshrn_n_s32(q3s32, 14); + d17s16 = vqrshrn_n_s32(q12s32, 14); + d18s16 = vqrshrn_n_s32(q13s32, 14); + d19s16 = vqrshrn_n_s32(q1s32, 14); + q8s16 = vcombine_s16(d16s16, d17s16); + q9s16 = vcombine_s16(d18s16, d19s16); + + q0s32 = vmull_s16(d20s16, d31s16); + q1s32 = vmull_s16(d21s16, d31s16); + q12s32 = vmull_s16(d20s16, d30s16); + q13s32 = vmull_s16(d21s16, d30s16); + + q0s32 = vmlal_s16(q0s32, d28s16, d30s16); + q1s32 = vmlal_s16(q1s32, d29s16, d30s16); + q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); + + d22s16 = vqrshrn_n_s32(q0s32, 14); + d23s16 = vqrshrn_n_s32(q1s32, 14); + d20s16 = vqrshrn_n_s32(q12s32, 14); + d21s16 = vqrshrn_n_s32(q13s32, 14); + q10s16 = vcombine_s16(d20s16, d21s16); + q11s16 = vcombine_s16(d22s16, d23s16); + + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q15s16 = vaddq_s16(q6s16, q7s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + // stage 5 + q0s16 = vaddq_s16(q8s16, q11s16); + q1s16 = vaddq_s16(q9s16, q10s16); + q2s16 = vsubq_s16(q9s16, q10s16); + q3s16 = vsubq_s16(q8s16, q11s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q11s32 = vmull_s16(d26s16, d16s16); + q12s32 = vmull_s16(d27s16, d16s16); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + + q6s32 = vsubq_s32(q9s32, q11s32); + q13s32 = vsubq_s32(q10s32, q12s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d10s16 = vqrshrn_n_s32(q6s32, 14); + d11s16 = vqrshrn_n_s32(q13s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q8s16 = vaddq_s16(q0s16, q15s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d16u64); + out += output_stride; + vst1_u64((uint64_t *)out, d17u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vp9_idct16x16_256_add_neon_pass2( + int16_t *src, + int16_t *out, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride) { + uint8_t *d; + uint8x8_t d12u8, d13u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d24u64, d25u64, d26u64, d27u64; + int64x1_t d12s64, d13s64; + uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; + uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d12s16 = vdup_n_s16(cospi_30_64); + d13s16 = vdup_n_s16(cospi_2_64); + + q2s32 = vmull_s16(d16s16, d12s16); + q3s32 = vmull_s16(d17s16, d12s16); + q1s32 = vmull_s16(d16s16, d13s16); + q4s32 = vmull_s16(d17s16, d13s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); + q1s32 = vmlal_s16(q1s32, d30s16, d12s16); + q4s32 = vmlal_s16(q4s32, d31s16, d12s16); + + d0s16 = vqrshrn_n_s32(q2s32, 14); + d1s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q1s32, 14); + d15s16 = vqrshrn_n_s32(q4s32, 14); + q0s16 = vcombine_s16(d0s16, d1s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d30s16 = vdup_n_s16(cospi_14_64); + d31s16 = vdup_n_s16(cospi_18_64); + + q2s32 = vmull_s16(d24s16, d30s16); + q3s32 = vmull_s16(d25s16, d30s16); + q4s32 = vmull_s16(d24s16, d31s16); + q5s32 = vmull_s16(d25s16, d31s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); + q4s32 = vmlal_s16(q4s32, d22s16, d30s16); + q5s32 = vmlal_s16(q5s32, d23s16, d30s16); + + d2s16 = vqrshrn_n_s32(q2s32, 14); + d3s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q4s32, 14); + d13s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(cospi_22_64); + d31s16 = vdup_n_s16(cospi_10_64); + + q11s32 = vmull_s16(d20s16, d30s16); + q12s32 = vmull_s16(d21s16, d30s16); + q4s32 = vmull_s16(d20s16, d31s16); + q5s32 = vmull_s16(d21s16, d31s16); + + q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); + q4s32 = vmlal_s16(q4s32, d26s16, d30s16); + q5s32 = vmlal_s16(q5s32, d27s16, d30s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d11s16 = vqrshrn_n_s32(q5s32, 14); + d10s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + d30s16 = vdup_n_s16(cospi_6_64); + d31s16 = vdup_n_s16(cospi_26_64); + + q10s32 = vmull_s16(d28s16, d30s16); + q11s32 = vmull_s16(d29s16, d30s16); + q12s32 = vmull_s16(d28s16, d31s16); + q13s32 = vmull_s16(d29s16, d31s16); + + q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); + q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); + q12s32 = vmlal_s16(q12s32, d18s16, d30s16); + q13s32 = vmlal_s16(q13s32, d19s16, d30s16); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q11s32, 14); + d8s16 = vqrshrn_n_s32(q12s32, 14); + d9s16 = vqrshrn_n_s32(q13s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 3 + q9s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q10s16 = vsubq_s16(q3s16, q2s16); + q11s16 = vaddq_s16(q2s16, q3s16); + q12s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q6s16, q7s16); + + // stage 4 + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q2s32 = vmull_s16(d18s16, d31s16); + q3s32 = vmull_s16(d19s16, d31s16); + q4s32 = vmull_s16(d28s16, d31s16); + q5s32 = vmull_s16(d29s16, d31s16); + + q2s32 = vmlal_s16(q2s32, d28s16, d30s16); + q3s32 = vmlal_s16(q3s32, d29s16, d30s16); + q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); + + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q3s32, 14); + d2s16 = vqrshrn_n_s32(q4s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + q3s16 = q11s16; + q4s16 = q12s16; + + d30s16 = vdup_n_s16(-cospi_8_64); + q11s32 = vmull_s16(d26s16, d30s16); + q12s32 = vmull_s16(d27s16, d30s16); + q8s32 = vmull_s16(d20s16, d30s16); + q9s32 = vmull_s16(d21s16, d30s16); + + q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); + q8s32 = vmlal_s16(q8s32, d26s16, d31s16); + q9s32 = vmlal_s16(q9s32, d27s16, d31s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q10s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q10s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + if (skip_adding != 0) { + d = dest; + // load the data in pass1 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + // store the data out 8,9,10,11,12,13,14,15 + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q8s16 = vrshrq_n_s16(q8s16, 6); + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q9s16 = vrshrq_n_s16(q9s16, 6); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q2s16 = vrshrq_n_s16(q2s16, 6); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q3s16 = vrshrq_n_s16(q3s16, 6); + q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q4s16 = vrshrq_n_s16(q4s16, 6); + q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q5s16 = vrshrq_n_s16(q5s16, 6); + q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q14s16 = vrshrq_n_s16(q14s16, 6); + q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + q15s16 = vrshrq_n_s16(q15s16, 6); + q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + } else { // skip_adding_dest + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + } + return; +} + +void vp9_idct16x16_10_add_neon_pass1( + int16_t *in, + int16_t *out, + int output_stride) { + int16x4_t d4s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // stage 3 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + // stage 4 + q1s16 = vdupq_n_s16(cospi_16_64 * 2); + d4s16 = vdup_n_s16(cospi_16_64); + + q8s16 = vqrdmulhq_s16(q8s16, q1s16); + + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + q9s32 = vmull_s16(d14s16, d4s16); + q10s32 = vmull_s16(d15s16, d4s16); + q12s32 = vmull_s16(d9s16, d4s16); + q11s32 = vmull_s16(d8s16, d4s16); + + q15s32 = vsubq_s32(q10s32, q12s32); + q6s32 = vsubq_s32(q9s32, q11s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d11s16 = vqrshrn_n_s32(q15s32, 14); + d10s16 = vqrshrn_n_s32(q6s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q2s16 = vaddq_s16(q8s16, q7s16); + q9s16 = vaddq_s16(q8s16, q6s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q8s16, q4s16); + q12s16 = vsubq_s16(q8s16, q4s16); + q13s16 = vsubq_s16(q8s16, q5s16); + q14s16 = vsubq_s16(q8s16, q6s16); + q15s16 = vsubq_s16(q8s16, q7s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d4u64); + out += output_stride; + vst1_u64((uint64_t *)out, d5u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vp9_idct16x16_10_add_neon_pass2( + int16_t *src, + int16_t *out, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; + uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; + uint64x1_t d16u64, d17u64, d18u64, d19u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + (void)skip_adding; + (void)dest; + (void)dest_stride; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // stage 3 + q6s16 = vdupq_n_s16(cospi_30_64 * 2); + q0s16 = vqrdmulhq_s16(q8s16, q6s16); + q6s16 = vdupq_n_s16(cospi_2_64 * 2); + q7s16 = vqrdmulhq_s16(q8s16, q6s16); + + q15s16 = vdupq_n_s16(-cospi_26_64 * 2); + q14s16 = vdupq_n_s16(cospi_6_64 * 2); + q3s16 = vqrdmulhq_s16(q9s16, q15s16); + q4s16 = vqrdmulhq_s16(q9s16, q14s16); + + // stage 4 + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d6s16 = vget_low_s16(q3s16); + d7s16 = vget_high_s16(q3s16); + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q12s32 = vmull_s16(d14s16, d31s16); + q5s32 = vmull_s16(d15s16, d31s16); + q2s32 = vmull_s16(d0s16, d31s16); + q11s32 = vmull_s16(d1s16, d31s16); + + q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); + q2s32 = vmlal_s16(q2s32, d14s16, d30s16); + q11s32 = vmlal_s16(q11s32, d15s16, d30s16); + + d2s16 = vqrshrn_n_s32(q12s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q11s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(-cospi_8_64); + q10s32 = vmull_s16(d8s16, d30s16); + q13s32 = vmull_s16(d9s16, d30s16); + q8s32 = vmull_s16(d6s16, d30s16); + q9s32 = vmull_s16(d7s16, d30s16); + + q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); + q8s32 = vmlal_s16(q8s32, d8s16, d31s16); + q9s32 = vmlal_s16(q9s32, d9s16, d31s16); + + d4s16 = vqrshrn_n_s32(q10s32, 14); + d5s16 = vqrshrn_n_s32(q13s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q0s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q0s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); + d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); + d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); + d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); + d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); + d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + vst1_u64((uint64_t *)out, d16u64); + out += 4; + vst1_u64((uint64_t *)out, d17u64); + out += 12; + vst1_u64((uint64_t *)out, d18u64); + out += 4; + vst1_u64((uint64_t *)out, d19u64); + out += 12; + vst1_u64((uint64_t *)out, d4u64); + out += 4; + vst1_u64((uint64_t *)out, d5u64); + out += 12; + vst1_u64((uint64_t *)out, d6u64); + out += 4; + vst1_u64((uint64_t *)out, d7u64); + out += 12; + vst1_u64((uint64_t *)out, d8u64); + out += 4; + vst1_u64((uint64_t *)out, d9u64); + out += 12; + vst1_u64((uint64_t *)out, d10u64); + out += 4; + vst1_u64((uint64_t *)out, d11u64); + out += 12; + vst1_u64((uint64_t *)out, d28u64); + out += 4; + vst1_u64((uint64_t *)out, d29u64); + out += 12; + vst1_u64((uint64_t *)out, d30u64); + out += 4; + vst1_u64((uint64_t *)out, d31u64); + return; +} diff --git a/vpx_dsp/arm/idct16x16_neon.c b/vpx_dsp/arm/idct16x16_neon.c new file mode 100644 index 000000000..ac9f348bc --- /dev/null +++ b/vpx_dsp/arm/idct16x16_neon.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common.h" + +void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); +void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); + +#if HAVE_NEON_ASM +/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ +extern void vp9_push_neon(int64_t *store); +extern void vp9_pop_neon(int64_t *store); +#endif // HAVE_NEON_ASM + +void vp9_idct16x16_256_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { +#if HAVE_NEON_ASM + int64_t store_reg[8]; +#endif + int16_t pass1_output[16*16] = {0}; + int16_t row_idct_output[16*16] = {0}; + +#if HAVE_NEON_ASM + // save d8-d15 register values. + vp9_push_neon(store_reg); +#endif + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vp9_idct16x16_256_add_neon_pass2(input+1, + row_idct_output, + pass1_output, + 0, + dest, + dest_stride); + + /* Parallel idct on the lower 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vp9_idct16x16_256_add_neon_pass2(input+8*16+1, + row_idct_output+8, + pass1_output, + 0, + dest, + dest_stride); + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, + row_idct_output, + pass1_output, + 1, + dest, + dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, + row_idct_output+8, + pass1_output, + 1, + dest+8, + dest_stride); + +#if HAVE_NEON_ASM + // restore d8-d15 register values. + vp9_pop_neon(store_reg); +#endif + + return; +} + +void vp9_idct16x16_10_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { +#if HAVE_NEON_ASM + int64_t store_reg[8]; +#endif + int16_t pass1_output[16*16] = {0}; + int16_t row_idct_output[16*16] = {0}; + +#if HAVE_NEON_ASM + // save d8-d15 register values. + vp9_push_neon(store_reg); +#endif + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vp9_idct16x16_10_add_neon_pass2(input+1, + row_idct_output, + pass1_output, + 0, + dest, + dest_stride); + + /* Skip Parallel idct on the lower 8 rows as they are all 0s */ + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, + row_idct_output, + pass1_output, + 1, + dest, + dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, + row_idct_output+8, + pass1_output, + 1, + dest+8, + dest_stride); + +#if HAVE_NEON_ASM + // restore d8-d15 register values. + vp9_pop_neon(store_reg); +#endif + + return; +} diff --git a/vpx_dsp/arm/idct32x32_1_add_neon.asm b/vpx_dsp/arm/idct32x32_1_add_neon.asm new file mode 100644 index 000000000..d290d0753 --- /dev/null +++ b/vpx_dsp/arm/idct32x32_1_add_neon.asm @@ -0,0 +1,144 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp9_idct32x32_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ;TODO(hkuang): put the following macros in a seperate + ;file so other idct function could also use them. + MACRO + LD_16x8 $src, $stride + vld1.8 {q8}, [$src], $stride + vld1.8 {q9}, [$src], $stride + vld1.8 {q10}, [$src], $stride + vld1.8 {q11}, [$src], $stride + vld1.8 {q12}, [$src], $stride + vld1.8 {q13}, [$src], $stride + vld1.8 {q14}, [$src], $stride + vld1.8 {q15}, [$src], $stride + MEND + + MACRO + ADD_DIFF_16x8 $diff + vqadd.u8 q8, q8, $diff + vqadd.u8 q9, q9, $diff + vqadd.u8 q10, q10, $diff + vqadd.u8 q11, q11, $diff + vqadd.u8 q12, q12, $diff + vqadd.u8 q13, q13, $diff + vqadd.u8 q14, q14, $diff + vqadd.u8 q15, q15, $diff + MEND + + MACRO + SUB_DIFF_16x8 $diff + vqsub.u8 q8, q8, $diff + vqsub.u8 q9, q9, $diff + vqsub.u8 q10, q10, $diff + vqsub.u8 q11, q11, $diff + vqsub.u8 q12, q12, $diff + vqsub.u8 q13, q13, $diff + vqsub.u8 q14, q14, $diff + vqsub.u8 q15, q15, $diff + MEND + + MACRO + ST_16x8 $dst, $stride + vst1.8 {q8}, [$dst], $stride + vst1.8 {q9}, [$dst], $stride + vst1.8 {q10},[$dst], $stride + vst1.8 {q11},[$dst], $stride + vst1.8 {q12},[$dst], $stride + vst1.8 {q13},[$dst], $stride + vst1.8 {q14},[$dst], $stride + vst1.8 {q15},[$dst], $stride + MEND + +;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride + +|vp9_idct32x32_1_add_neon| PROC + push {lr} + pld [r1] + add r3, r1, #16 ; r3 dest + 16 for second loop + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asrs r0, r0, #6 ; >> 6 + bge diff_positive_32_32 + +diff_negative_32_32 + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_negative_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_negative_32_32_loop + pop {pc} + +diff_positive_32_32 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_positive_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_positive_32_32_loop + pop {pc} + + ENDP ; |vp9_idct32x32_1_add_neon| + END diff --git a/vpx_dsp/arm/idct32x32_1_add_neon.c b/vpx_dsp/arm/idct32x32_1_add_neon.c new file mode 100644 index 000000000..e9c9c30c2 --- /dev/null +++ b/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" + +#include "vpx_dsp/inv_txfm.h" +#include "vpx_ports/mem.h" + +static INLINE void LD_16x8( + uint8_t *d, + int d_stride, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vld1q_u8(d); + d += d_stride; + *q9u8 = vld1q_u8(d); + d += d_stride; + *q10u8 = vld1q_u8(d); + d += d_stride; + *q11u8 = vld1q_u8(d); + d += d_stride; + *q12u8 = vld1q_u8(d); + d += d_stride; + *q13u8 = vld1q_u8(d); + d += d_stride; + *q14u8 = vld1q_u8(d); + d += d_stride; + *q15u8 = vld1q_u8(d); + return; +} + +static INLINE void ADD_DIFF_16x8( + uint8x16_t qdiffu8, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqaddq_u8(*q8u8, qdiffu8); + *q9u8 = vqaddq_u8(*q9u8, qdiffu8); + *q10u8 = vqaddq_u8(*q10u8, qdiffu8); + *q11u8 = vqaddq_u8(*q11u8, qdiffu8); + *q12u8 = vqaddq_u8(*q12u8, qdiffu8); + *q13u8 = vqaddq_u8(*q13u8, qdiffu8); + *q14u8 = vqaddq_u8(*q14u8, qdiffu8); + *q15u8 = vqaddq_u8(*q15u8, qdiffu8); + return; +} + +static INLINE void SUB_DIFF_16x8( + uint8x16_t qdiffu8, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqsubq_u8(*q8u8, qdiffu8); + *q9u8 = vqsubq_u8(*q9u8, qdiffu8); + *q10u8 = vqsubq_u8(*q10u8, qdiffu8); + *q11u8 = vqsubq_u8(*q11u8, qdiffu8); + *q12u8 = vqsubq_u8(*q12u8, qdiffu8); + *q13u8 = vqsubq_u8(*q13u8, qdiffu8); + *q14u8 = vqsubq_u8(*q14u8, qdiffu8); + *q15u8 = vqsubq_u8(*q15u8, qdiffu8); + return; +} + +static INLINE void ST_16x8( + uint8_t *d, + int d_stride, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + vst1q_u8(d, *q8u8); + d += d_stride; + vst1q_u8(d, *q9u8); + d += d_stride; + vst1q_u8(d, *q10u8); + d += d_stride; + vst1q_u8(d, *q11u8); + d += d_stride; + vst1q_u8(d, *q12u8); + d += d_stride; + vst1q_u8(d, *q13u8); + d += d_stride; + vst1q_u8(d, *q14u8); + d += d_stride; + vst1q_u8(d, *q15u8); + return; +} + +void vp9_idct32x32_1_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int i, j, dest_stride8; + uint8_t *d; + int16_t a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + dest_stride8 = dest_stride * 8; + if (a1 >= 0) { // diff_positive_32_32 + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + d += dest_stride8; + } + } + } else { // diff_negative_32_32 + a1 = -a1; + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + d += dest_stride8; + } + } + } + return; +} diff --git a/vpx_dsp/arm/idct32x32_add_neon.asm b/vpx_dsp/arm/idct32x32_add_neon.asm new file mode 100644 index 000000000..72e933eee --- /dev/null +++ b/vpx_dsp/arm/idct32x32_add_neon.asm @@ -0,0 +1,1299 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +;TODO(cd): adjust these constant to be able to use vqdmulh for faster +; dct_const_round_shift(a * b) within butterfly calculations. +cospi_1_64 EQU 16364 +cospi_2_64 EQU 16305 +cospi_3_64 EQU 16207 +cospi_4_64 EQU 16069 +cospi_5_64 EQU 15893 +cospi_6_64 EQU 15679 +cospi_7_64 EQU 15426 +cospi_8_64 EQU 15137 +cospi_9_64 EQU 14811 +cospi_10_64 EQU 14449 +cospi_11_64 EQU 14053 +cospi_12_64 EQU 13623 +cospi_13_64 EQU 13160 +cospi_14_64 EQU 12665 +cospi_15_64 EQU 12140 +cospi_16_64 EQU 11585 +cospi_17_64 EQU 11003 +cospi_18_64 EQU 10394 +cospi_19_64 EQU 9760 +cospi_20_64 EQU 9102 +cospi_21_64 EQU 8423 +cospi_22_64 EQU 7723 +cospi_23_64 EQU 7005 +cospi_24_64 EQU 6270 +cospi_25_64 EQU 5520 +cospi_26_64 EQU 4756 +cospi_27_64 EQU 3981 +cospi_28_64 EQU 3196 +cospi_29_64 EQU 2404 +cospi_30_64 EQU 1606 +cospi_31_64 EQU 804 + + + EXPORT |vp9_idct32x32_1024_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY + + ; -------------------------------------------------------------------------- + ; Load from transposed_buffer + ; q13 = transposed_buffer[first_offset] + ; q14 = transposed_buffer[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; transposed_buffer must be passed in. use 0 for first use. + MACRO + LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset + ; address calculation with proper stride and loading + add r0, #($first_offset - $prev_offset )*8*2 + vld1.s16 {q14}, [r0] + add r0, #($second_offset - $first_offset)*8*2 + vld1.s16 {q13}, [r0] + ; (used) two registers (q14, q13) + MEND + ; -------------------------------------------------------------------------- + ; Load from output (used as temporary storage) + ; reg1 = output[first_offset] + ; reg2 = output[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; output, whether reading or storing) must be passed in. use 0 for first + ; use. + MACRO + LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and loading + add r1, #($first_offset - $prev_offset )*32*2 + vld1.s16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vld1.s16 {$reg2}, [r1] + ; (used) two registers ($reg1, $reg2) + MEND + ; -------------------------------------------------------------------------- + ; Store into output (sometimes as as temporary storage) + ; output[first_offset] = reg1 + ; output[second_offset] = reg2 + ; for proper address calculation, the last offset used when manipulating + ; output, whether reading or storing) must be passed in. use 0 for first + ; use. + MACRO + STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and storing + add r1, #($first_offset - $prev_offset )*32*2 + vst1.16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vst1.16 {$reg2}, [r1] + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10] + vst1.16 {d11}, [r9] + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10]! + vst1.16 {d11}, [r9]! + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6] + vst1.16 {d4}, [r7] + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6]! + vst1.16 {d4}, [r7]! + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + ; TODO(cd): have special case to re-use constants when they are similar for + ; consecutive butterflies + ; TODO(cd): have special case when both constants are the same, do the + ; additions/subtractions before the multiplies. + ; generate the constants + ; generate scalar constants + mov r8, #$first_constant & 0xFF00 + mov r12, #$second_constant & 0xFF00 + add r8, #$first_constant & 0x00FF + add r12, #$second_constant & 0x00FF + ; generate vector constants + vdup.16 d30, r8 + vdup.16 d31, r12 + ; (used) two for inputs (regA-regD), one for constants (q15) + ; do some multiplications (ordered for maximum latency hiding) + vmull.s16 q8, $regC, d30 + vmull.s16 q10, $regA, d31 + vmull.s16 q9, $regD, d30 + vmull.s16 q11, $regB, d31 + vmull.s16 q12, $regC, d31 + ; (used) five for intermediate (q8-q12), one for constants (q15) + ; do some addition/subtractions (to get back two register) + vsub.s32 q8, q8, q10 + vsub.s32 q9, q9, q11 + ; do more multiplications (ordered for maximum latency hiding) + vmull.s16 q10, $regD, d31 + vmull.s16 q11, $regA, d30 + vmull.s16 q15, $regB, d30 + ; (used) six for intermediate (q8-q12, q15) + ; do more addition/subtractions + vadd.s32 q11, q12, q11 + vadd.s32 q10, q10, q15 + ; (used) four for intermediate (q8-q11) + ; dct_const_round_shift + vqrshrn.s32 $reg1, q8, #14 + vqrshrn.s32 $reg2, q9, #14 + vqrshrn.s32 $reg3, q11, #14 + vqrshrn.s32 $reg4, q10, #14 + ; (used) two for results, well four d registers + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + MEND + ; -------------------------------------------------------------------------- + +;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +; +; r0 int16_t *input, +; r1 uint8_t *dest, +; r2 int dest_stride) +; loop counters +; r4 bands loop counter +; r5 pass loop counter +; r8 transpose loop counter +; combine-add pointers +; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) +; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) +; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) +; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) + +|vp9_idct32x32_1024_add_neon| PROC + ; This function does one pass of idct32x32 transform. + ; + ; This is done by transposing the input and then doing a 1d transform on + ; columns. In the first pass, the transposed columns are the original + ; rows. In the second pass, after the transposition, the colums are the + ; original columns. + ; The 1d transform is done by looping over bands of eight columns (the + ; idct32_bands loop). For each band, the transform input transposition + ; is done on demand, one band of four 8x8 matrices at a time. The four + ; matrices are transposed by pairs (the idct32_transpose_pair loop). + push {r4-r11} + vpush {d8-d15} + ; stack operation + ; internal buffer used to transpose 8 lines into before transforming them + ; int16_t transpose_buffer[32 * 8]; + ; at sp + [4096, 4607] + ; results of the first pass (transpose and transform rows) + ; int16_t pass1[32 * 32]; + ; at sp + [0, 2047] + ; results of the second pass (transpose and transform columns) + ; int16_t pass2[32 * 32]; + ; at sp + [2048, 4095] + sub sp, sp, #512+2048+2048 + + ; r6 = dest + 31 * dest_stride + ; r7 = dest + 0 * dest_stride + ; r9 = dest + 15 * dest_stride + ; r10 = dest + 16 * dest_stride + rsb r6, r2, r2, lsl #5 + rsb r9, r2, r2, lsl #4 + add r10, r1, r2, lsl #4 + mov r7, r1 + add r6, r6, r1 + add r9, r9, r1 + ; r11 = -dest_stride + neg r11, r2 + ; r3 = input + mov r3, r0 + ; parameters for first pass + ; r0 = transpose_buffer[32 * 8] + add r0, sp, #4096 + ; r1 = pass1[32 * 32] + mov r1, sp + + mov r5, #0 ; initialize pass loop counter +idct32_pass_loop + mov r4, #4 ; initialize bands loop counter +idct32_bands_loop + mov r8, #2 ; initialize transpose loop counter +idct32_transpose_pair_loop + ; Load two horizontally consecutive 8x8 16bit data matrices. The first one + ; into q0-q7 and the second one into q8-q15. There is a stride of 64, + ; adjusted to 32 because of the two post-increments. + vld1.s16 {q8}, [r3]! + vld1.s16 {q0}, [r3]! + add r3, #32 + vld1.s16 {q9}, [r3]! + vld1.s16 {q1}, [r3]! + add r3, #32 + vld1.s16 {q10}, [r3]! + vld1.s16 {q2}, [r3]! + add r3, #32 + vld1.s16 {q11}, [r3]! + vld1.s16 {q3}, [r3]! + add r3, #32 + vld1.s16 {q12}, [r3]! + vld1.s16 {q4}, [r3]! + add r3, #32 + vld1.s16 {q13}, [r3]! + vld1.s16 {q5}, [r3]! + add r3, #32 + vld1.s16 {q14}, [r3]! + vld1.s16 {q6}, [r3]! + add r3, #32 + vld1.s16 {q15}, [r3]! + vld1.s16 {q7}, [r3]! + + ; Transpose the two 8x8 16bit data matrices. + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vswp d1, d8 + vswp d7, d14 + vswp d5, d12 + vswp d3, d10 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + ; Store both matrices after each other. There is a stride of 32, which + ; adjusts to nothing because of the post-increments. + vst1.16 {q8}, [r0]! + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + vst1.16 {q0}, [r0]! + vst1.16 {q1}, [r0]! + vst1.16 {q2}, [r0]! + vst1.16 {q3}, [r0]! + vst1.16 {q4}, [r0]! + vst1.16 {q5}, [r0]! + vst1.16 {q6}, [r0]! + vst1.16 {q7}, [r0]! + + ; increment pointers by adjusted stride (not necessary for r0/out) + ; go back by 7*32 for the seven lines moved fully by read and add + ; go back by 32 for the eigth line only read + ; advance by 16*2 to go the next pair + sub r3, r3, #7*32*2 + 32 - 16*2 + ; transpose pair loop processing + subs r8, r8, #1 + bne idct32_transpose_pair_loop + + ; restore r0/input to its original value + sub r0, r0, #32*8*2 + + ; Instead of doing the transforms stage by stage, it is done by loading + ; some input values and doing as many stages as possible to minimize the + ; storing/loading of intermediate results. To fit within registers, the + ; final coefficients are cut into four blocks: + ; BLOCK A: 16-19,28-31 + ; BLOCK B: 20-23,24-27 + ; BLOCK C: 8-10,11-15 + ; BLOCK D: 0-3,4-7 + ; Blocks A and C are straight calculation through the various stages. In + ; block B, further calculations are performed using the results from + ; block A. In block D, further calculations are performed using the results + ; from block C and then the final calculations are done using results from + ; block A and B which have been combined at the end of block B. + + ; -------------------------------------------------------------------------- + ; BLOCK A: 16-19,28-31 + ; -------------------------------------------------------------------------- + ; generate 16,17,30,31 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; + ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; + ;step1b[16][i] = dct_const_round_shift(temp1); + ;step1b[31][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 0, 1, 31 + DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; + ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; + ;step1b[17][i] = dct_const_round_shift(temp1); + ;step1b[30][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 31, 17, 15 + DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[16] = step1b[16][i] + step1b[17][i]; + ;step2[17] = step1b[16][i] - step1b[17][i]; + ;step2[30] = -step1b[30][i] + step1b[31][i]; + ;step2[31] = step1b[30][i] + step1b[31][i]; + vadd.s16 q4, q0, q1 + vsub.s16 q13, q0, q1 + vadd.s16 q6, q2, q3 + vsub.s16 q14, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; + ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; + ;step3[17] = dct_const_round_shift(temp1); + ;step3[30] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; generate 18,19,28,29 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; + ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; + ;step1b[18][i] = dct_const_round_shift(temp1); + ;step1b[29][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 15, 9, 23 + DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; + ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; + ;step1b[19][i] = dct_const_round_shift(temp1); + ;step1b[28][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 23, 25, 7 + DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[18] = -step1b[18][i] + step1b[19][i]; + ;step2[19] = step1b[18][i] + step1b[19][i]; + ;step2[28] = step1b[28][i] + step1b[29][i]; + ;step2[29] = step1b[28][i] - step1b[29][i]; + vsub.s16 q13, q3, q2 + vadd.s16 q3, q3, q2 + vsub.s16 q14, q1, q0 + vadd.s16 q2, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); + ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); + ;step3[29] = dct_const_round_shift(temp1); + ;step3[18] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 + ; -------------------------------------------------------------------------- + ; combine 16-19,28-31 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[16] = step1b[16][i] + step1b[19][i]; + ;step1[17] = step1b[17][i] + step1b[18][i]; + ;step1[18] = step1b[17][i] - step1b[18][i]; + ;step1[29] = step1b[30][i] - step1b[29][i]; + ;step1[30] = step1b[30][i] + step1b[29][i]; + ;step1[31] = step1b[31][i] + step1b[28][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q0 + vadd.s16 q10, q7, q1 + vadd.s16 q15, q6, q3 + vsub.s16 q13, q5, q0 + vsub.s16 q14, q7, q1 + STORE_IN_OUTPUT 0, 16, 31, q8, q15 + STORE_IN_OUTPUT 31, 17, 30, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; + ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; + ;step2[18] = dct_const_round_shift(temp1); + ;step2[29] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 + STORE_IN_OUTPUT 30, 29, 18, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[19] = step1b[16][i] - step1b[19][i]; + ;step1[28] = step1b[31][i] - step1b[28][i]; + vsub.s16 q13, q4, q2 + vsub.s16 q14, q6, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; + ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; + ;step2[19] = dct_const_round_shift(temp1); + ;step2[28] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 + STORE_IN_OUTPUT 18, 19, 28, q4, q6 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK B: 20-23,24-27 + ; -------------------------------------------------------------------------- + ; generate 20,21,26,27 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; + ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; + ;step1b[20][i] = dct_const_round_shift(temp1); + ;step1b[27][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 7, 5, 27 + DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; + ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; + ;step1b[21][i] = dct_const_round_shift(temp1); + ;step1b[26][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 27, 21, 11 + DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[20] = step1b[20][i] + step1b[21][i]; + ;step2[21] = step1b[20][i] - step1b[21][i]; + ;step2[26] = -step1b[26][i] + step1b[27][i]; + ;step2[27] = step1b[26][i] + step1b[27][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; + ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; + ;step3[21] = dct_const_round_shift(temp1); + ;step3[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 22,23,24,25 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; + ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; + ;step1b[22][i] = dct_const_round_shift(temp1); + ;step1b[25][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 11, 13, 19 + DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; + ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; + ;step1b[23][i] = dct_const_round_shift(temp1); + ;step1b[24][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 19, 29, 3 + DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[22] = -step1b[22][i] + step1b[23][i]; + ;step2[23] = step1b[22][i] + step1b[23][i]; + ;step2[24] = step1b[24][i] + step1b[25][i]; + ;step2[25] = step1b[24][i] - step1b[25][i]; + vsub.s16 q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); + ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); + ;step3[25] = dct_const_round_shift(temp1); + ;step3[22] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 20-23,24-27 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[22] = step1b[22][i] + step1b[21][i]; + ;step1[23] = step1b[23][i] + step1b[20][i]; + vadd.s16 q10, q7, q1 + vadd.s16 q11, q5, q0 + ;step1[24] = step1b[24][i] + step1b[27][i]; + ;step1[25] = step1b[25][i] + step1b[26][i]; + vadd.s16 q12, q6, q2 + vadd.s16 q15, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[16] = step1b[16][i] + step1b[23][i]; + ;step3[17] = step1b[17][i] + step1b[22][i]; + ;step3[22] = step1b[17][i] - step1b[22][i]; + ;step3[23] = step1b[16][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 + vadd.s16 q8, q14, q11 + vadd.s16 q9, q13, q10 + vsub.s16 q13, q13, q10 + vsub.s16 q11, q14, q11 + STORE_IN_OUTPUT 17, 17, 16, q9, q8 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[24] = step1b[31][i] - step1b[24][i]; + ;step3[25] = step1b[30][i] - step1b[25][i]; + ;step3[30] = step1b[30][i] + step1b[25][i]; + ;step3[31] = step1b[31][i] + step1b[24][i]; + LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 + vsub.s16 q8, q9, q12 + vadd.s16 q10, q14, q15 + vsub.s16 q14, q14, q15 + vadd.s16 q12, q9, q12 + STORE_IN_OUTPUT 31, 30, 31, q10, q12 + ; -------------------------------------------------------------------------- + ; TODO(cd) do some register allocation change to remove these push/pop + vpush {q8} ; [24] + vpush {q11} ; [23] + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; + ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; + ;step1[22] = dct_const_round_shift(temp1); + ;step1[25] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 31, 25, 22, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; + ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; + ;step1[23] = dct_const_round_shift(temp1); + ;step1[24] = dct_const_round_shift(temp2); + ; TODO(cd) do some register allocation change to remove these push/pop + vpop {q13} ; [23] + vpop {q14} ; [24] + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 22, 24, 23, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[20] = step1b[23][i] - step1b[20][i]; + ;step1[27] = step1b[24][i] - step1b[27][i]; + vsub.s16 q14, q5, q0 + vsub.s16 q13, q6, q2 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); + ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); + ;step2[27] = dct_const_round_shift(temp1); + ;step2[20] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[21] = step1b[22][i] - step1b[21][i]; + ;step1[26] = step1b[25][i] - step1b[26][i]; + vsub.s16 q14, q7, q1 + vsub.s16 q13, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); + ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); + ;step2[26] = dct_const_round_shift(temp1); + ;step2[21] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[18] = step1b[18][i] + step1b[21][i]; + ;step3[19] = step1b[19][i] + step1b[20][i]; + ;step3[20] = step1b[19][i] - step1b[20][i]; + ;step3[21] = step1b[18][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 + vadd.s16 q8, q14, q1 + vadd.s16 q9, q13, q6 + vsub.s16 q13, q13, q6 + vsub.s16 q1, q14, q1 + STORE_IN_OUTPUT 19, 18, 19, q8, q9 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[27] = step1b[28][i] - step1b[27][i]; + ;step3[28] = step1b[28][i] + step1b[27][i]; + ;step3[29] = step1b[29][i] + step1b[26][i]; + ;step3[26] = step1b[29][i] - step1b[26][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 + vsub.s16 q14, q8, q5 + vadd.s16 q10, q8, q5 + vadd.s16 q11, q9, q0 + vsub.s16 q0, q9, q0 + STORE_IN_OUTPUT 29, 28, 29, q10, q11 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; + ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; + ;step1[20] = dct_const_round_shift(temp1); + ;step1[27] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 29, 20, 27, q13, q14 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; + ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; + ;step1[21] = dct_const_round_shift(temp1); + ;step1[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 + STORE_IN_OUTPUT 27, 21, 26, q1, q0 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK C: 8-10,11-15 + ; -------------------------------------------------------------------------- + ; generate 8,9,14,15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; + ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; + ;step2[8] = dct_const_round_shift(temp1); + ;step2[15] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 3, 2, 30 + DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; + ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; + ;step2[9] = dct_const_round_shift(temp1); + ;step2[14] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 30, 18, 14 + DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[8] = step1b[8][i] + step1b[9][i]; + ;step3[9] = step1b[8][i] - step1b[9][i]; + ;step3[14] = step1b[15][i] - step1b[14][i]; + ;step3[15] = step1b[15][i] + step1b[14][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; + ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; + ;step1[9] = dct_const_round_shift(temp1); + ;step1[14] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 10,11,12,13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; + ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; + ;step2[10] = dct_const_round_shift(temp1); + ;step2[13] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 14, 10, 22 + DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; + ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; + ;step2[11] = dct_const_round_shift(temp1); + ;step2[12] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 22, 26, 6 + DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[10] = step1b[11][i] - step1b[10][i]; + ;step3[11] = step1b[11][i] + step1b[10][i]; + ;step3[12] = step1b[12][i] + step1b[13][i]; + ;step3[13] = step1b[12][i] - step1b[13][i]; + vsub.s16 q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); + ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); + ;step1[13] = dct_const_round_shift(temp1); + ;step1[10] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 8-10,11-15 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[8] = step1b[8][i] + step1b[11][i]; + ;step2[9] = step1b[9][i] + step1b[10][i]; + ;step2[10] = step1b[9][i] - step1b[10][i]; + vadd.s16 q8, q0, q5 + vadd.s16 q9, q1, q7 + vsub.s16 q13, q1, q7 + ;step2[13] = step1b[14][i] - step1b[13][i]; + ;step2[14] = step1b[14][i] + step1b[13][i]; + ;step2[15] = step1b[15][i] + step1b[12][i]; + vsub.s16 q14, q3, q4 + vadd.s16 q10, q3, q4 + vadd.s16 q15, q2, q6 + STORE_IN_OUTPUT 26, 8, 15, q8, q15 + STORE_IN_OUTPUT 15, 9, 14, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; + ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; + ;step3[10] = dct_const_round_shift(temp1); + ;step3[13] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 14, 13, 10, q3, q1 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[11] = step1b[8][i] - step1b[11][i]; + ;step2[12] = step1b[15][i] - step1b[12][i]; + vsub.s16 q13, q0, q5 + vsub.s16 q14, q2, q6 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; + ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; + ;step3[11] = dct_const_round_shift(temp1); + ;step3[12] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 10, 11, 12, q1, q3 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK D: 0-3,4-7 + ; -------------------------------------------------------------------------- + ; generate 4,5,6,7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; + ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; + ;step3[4] = dct_const_round_shift(temp1); + ;step3[7] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 6, 4, 28 + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; + ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; + ;step3[5] = dct_const_round_shift(temp1); + ;step3[6] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 28, 20, 12 + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[4] = step1b[4][i] + step1b[5][i]; + ;step1[5] = step1b[4][i] - step1b[5][i]; + ;step1[6] = step1b[7][i] - step1b[6][i]; + ;step1[7] = step1b[7][i] + step1b[6][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; + ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; + ;step2[5] = dct_const_round_shift(temp1); + ;step2[6] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 0,1,2,3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; + ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; + ;step1[1] = dct_const_round_shift(temp1); + ;step1[0] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 12, 0, 16 + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; + ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; + ;step1[2] = dct_const_round_shift(temp1); + ;step1[3] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 16, 8, 24 + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[0] = step1b[0][i] + step1b[3][i]; + ;step2[1] = step1b[1][i] + step1b[2][i]; + ;step2[2] = step1b[1][i] - step1b[2][i]; + ;step2[3] = step1b[0][i] - step1b[3][i]; + vadd.s16 q4, q7, q6 + vsub.s16 q7, q7, q6 + vsub.s16 q6, q5, q14 + vadd.s16 q5, q5, q14 + ; -------------------------------------------------------------------------- + ; combine 0-3,4-7 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[0] = step1b[0][i] + step1b[7][i]; + ;step3[1] = step1b[1][i] + step1b[6][i]; + ;step3[2] = step1b[2][i] + step1b[5][i]; + ;step3[3] = step1b[3][i] + step1b[4][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q3 + vadd.s16 q10, q6, q1 + vadd.s16 q11, q7, q0 + ;step3[4] = step1b[3][i] - step1b[4][i]; + ;step3[5] = step1b[2][i] - step1b[5][i]; + ;step3[6] = step1b[1][i] - step1b[6][i]; + ;step3[7] = step1b[0][i] - step1b[7][i]; + vsub.s16 q12, q7, q0 + vsub.s16 q13, q6, q1 + vsub.s16 q14, q5, q3 + vsub.s16 q15, q4, q2 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[0] = step1b[0][i] + step1b[15][i]; + ;step1[1] = step1b[1][i] + step1b[14][i]; + ;step1[14] = step1b[1][i] - step1b[14][i]; + ;step1[15] = step1b[0][i] - step1b[15][i]; + LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 + vadd.s16 q2, q8, q1 + vadd.s16 q3, q9, q0 + vsub.s16 q4, q9, q0 + vsub.s16 q5, q8, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[14 * 32] = step1b[14][i] + step1b[17][i]; + ;output[15 * 32] = step1b[15][i] + step1b[16][i]; + ;output[16 * 32] = step1b[15][i] - step1b[16][i]; + ;output[17 * 32] = step1b[14][i] - step1b[17][i]; + LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + + cmp r5, #0 + bgt idct32_bands_end_2nd_pass + +idct32_bands_end_1st_pass + STORE_IN_OUTPUT 17, 16, 17, q6, q7 + STORE_IN_OUTPUT 17, 14, 15, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 31, 30, 31, q6, q7 + STORE_IN_OUTPUT 31, 0, 1, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 19, 18, 19, q6, q7 + STORE_IN_OUTPUT 19, 12, 13, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 29, 28, 29, q6, q7 + STORE_IN_OUTPUT 29, 2, 3, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 21, 20, 21, q6, q7 + STORE_IN_OUTPUT 21, 10, 11, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 27, 26, 27, q6, q7 + STORE_IN_OUTPUT 27, 4, 5, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 23, 22, 23, q6, q7 + STORE_IN_OUTPUT 23, 8, 9, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 25, 24, 25, q6, q7 + STORE_IN_OUTPUT 25, 6, 7, q4, q5 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #7*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; parameters for second pass + ; the input of pass2 is the result of pass1. we have to remove the offset + ; of 32 columns induced by the above idct32_bands_loop + sub r3, r1, #32*2 + ; r1 = pass2[32 * 32] + add r1, sp, #2048 + + ; pass loop processing + add r5, r5, #1 + b idct32_pass_loop + +idct32_bands_end_2nd_pass + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; restore pointers to their initial indices for next band pass by + ; removing/adding dest_stride * 8. The actual increment by eight + ; is taken care of within the _LAST macros. + add r6, r6, r2, lsl #3 + add r9, r9, r2, lsl #3 + sub r7, r7, r2, lsl #3 + sub r10, r10, r2, lsl #3 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #25*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; stack operation + add sp, sp, #512+2048+2048 + vpop {d8-d15} + pop {r4-r11} + bx lr + ENDP ; |vp9_idct32x32_1024_add_neon| + END diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c new file mode 100644 index 000000000..3656a7696 --- /dev/null +++ b/vpx_dsp/arm/idct32x32_add_neon.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/txfm_common.h" + +#define LOAD_FROM_TRANSPOSED(prev, first, second) \ + q14s16 = vld1q_s16(trans_buf + first * 8); \ + q13s16 = vld1q_s16(trans_buf + second * 8); + +#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ + qA = vld1q_s16(out + first * 32); \ + qB = vld1q_s16(out + second * 32); + +#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ + vst1q_s16(out + first * 32, qA); \ + vst1q_s16(out + second * 32, qB); + +#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ + __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ + q6s16, q7s16, q8s16, q9s16); +static INLINE void __STORE_COMBINE_CENTER_RESULTS( + uint8_t *p1, + uint8_t *p2, + int stride, + int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16) { + int16x4_t d8s16, d9s16, d10s16, d11s16; + + d8s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d11s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d9s16 = vld1_s16((int16_t *)p1); + d10s16 = vld1_s16((int16_t *)p2); + + q7s16 = vrshrq_n_s16(q7s16, 6); + q8s16 = vrshrq_n_s16(q8s16, 6); + q9s16 = vrshrq_n_s16(q9s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + + q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), + vreinterpret_u8_s16(d9s16))); + q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_s16(d10s16))); + q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_s16(d11s16))); + q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), + vreinterpret_u8_s16(d8s16))); + + d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); + d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); + d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + + vst1_s16((int16_t *)p1, d9s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d10s16); + p2 += stride; + vst1_s16((int16_t *)p1, d8s16); + vst1_s16((int16_t *)p2, d11s16); + return; +} + +#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ + __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ + q4s16, q5s16, q6s16, q7s16); +static INLINE void __STORE_COMBINE_EXTREME_RESULTS( + uint8_t *p1, + uint8_t *p2, + int stride, + int16x8_t q4s16, + int16x8_t q5s16, + int16x8_t q6s16, + int16x8_t q7s16) { + int16x4_t d4s16, d5s16, d6s16, d7s16; + + d4s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d7s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d5s16 = vld1_s16((int16_t *)p1); + d6s16 = vld1_s16((int16_t *)p2); + + q5s16 = vrshrq_n_s16(q5s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + q7s16 = vrshrq_n_s16(q7s16, 6); + q4s16 = vrshrq_n_s16(q4s16, 6); + + q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), + vreinterpret_u8_s16(d5s16))); + q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), + vreinterpret_u8_s16(d6s16))); + q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), + vreinterpret_u8_s16(d7s16))); + q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), + vreinterpret_u8_s16(d4s16))); + + d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); + d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + + vst1_s16((int16_t *)p1, d5s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d6s16); + p2 += stride; + vst1_s16((int16_t *)p2, d7s16); + vst1_s16((int16_t *)p1, d4s16); + return; +} + +#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ + DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); +static INLINE void DO_BUTTERFLY( + int16x8_t q14s16, + int16x8_t q13s16, + int16_t first_const, + int16_t second_const, + int16x8_t *qAs16, + int16x8_t *qBs16) { + int16x4_t d30s16, d31s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; + int16x4_t dCs16, dDs16, dAs16, dBs16; + + dCs16 = vget_low_s16(q14s16); + dDs16 = vget_high_s16(q14s16); + dAs16 = vget_low_s16(q13s16); + dBs16 = vget_high_s16(q13s16); + + d30s16 = vdup_n_s16(first_const); + d31s16 = vdup_n_s16(second_const); + + q8s32 = vmull_s16(dCs16, d30s16); + q10s32 = vmull_s16(dAs16, d31s16); + q9s32 = vmull_s16(dDs16, d30s16); + q11s32 = vmull_s16(dBs16, d31s16); + q12s32 = vmull_s16(dCs16, d31s16); + + q8s32 = vsubq_s32(q8s32, q10s32); + q9s32 = vsubq_s32(q9s32, q11s32); + + q10s32 = vmull_s16(dDs16, d31s16); + q11s32 = vmull_s16(dAs16, d30s16); + q15s32 = vmull_s16(dBs16, d30s16); + + q11s32 = vaddq_s32(q12s32, q11s32); + q10s32 = vaddq_s32(q10s32, q15s32); + + *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), + vqrshrn_n_s32(q9s32, 14)); + *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), + vqrshrn_n_s32(q10s32, 14)); + return; +} + +static INLINE void idct32_transpose_pair( + int16_t *input, + int16_t *t_buf) { + int16_t *in; + int i; + const int stride = 32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + for (i = 0; i < 4; i++, input += 8) { + in = input; + q8s16 = vld1q_s16(in); + in += stride; + q9s16 = vld1q_s16(in); + in += stride; + q10s16 = vld1q_s16(in); + in += stride; + q11s16 = vld1q_s16(in); + in += stride; + q12s16 = vld1q_s16(in); + in += stride; + q13s16 = vld1q_s16(in); + in += stride; + q14s16 = vld1q_s16(in); + in += stride; + q15s16 = vld1q_s16(in); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + q12s16 = vcombine_s16(d17s16, d25s16); + q13s16 = vcombine_s16(d19s16, d27s16); + q14s16 = vcombine_s16(d21s16, d29s16); + q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), + vreinterpretq_s32_s16(q10s16)); + q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16), + vreinterpretq_s32_s16(q11s16)); + q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16), + vreinterpretq_s32_s16(q14s16)); + q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16), + vreinterpretq_s32_s16(q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + vst1q_s16(t_buf, q0x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q0x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[1]); + t_buf += 8; + } + return; +} + +static INLINE void idct32_bands_end_1st_pass( + int16_t *out, + int16x8_t q2s16, + int16x8_t q3s16, + int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16, + int16x8_t q10s16, + int16x8_t q11s16, + int16x8_t q12s16, + int16x8_t q13s16, + int16x8_t q14s16, + int16x8_t q15s16) { + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); + STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); + + LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); + STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); + + LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); + STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); + + LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); + STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); + + LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); + STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); + + LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); + STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); + + LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); + STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); + + LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); + STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); + return; +} + +static INLINE void idct32_bands_end_2nd_pass( + int16_t *out, + uint8_t *dest, + int stride, + int16x8_t q2s16, + int16x8_t q3s16, + int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16, + int16x8_t q10s16, + int16x8_t q11s16, + int16x8_t q12s16, + int16x8_t q13s16, + int16x8_t q14s16, + int16x8_t q15s16) { + uint8_t *r6 = dest + 31 * stride; + uint8_t *r7 = dest/* + 0 * stride*/; + uint8_t *r9 = dest + 15 * stride; + uint8_t *r10 = dest + 16 * stride; + int str2 = stride << 1; + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; r9 -= str2; + + LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; r6 -= str2; + + LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; r9 -= str2; + + LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; r6 -= str2; + + LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; r9 -= str2; + + LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; r6 -= str2; + + LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + + LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + return; +} + +void vp9_idct32x32_1024_add_neon( + int16_t *input, + uint8_t *dest, + int stride) { + int i, idct32_pass_loop; + int16_t trans_buf[32 * 8]; + int16_t pass1[32 * 32]; + int16_t pass2[32 * 32]; + int16_t *out; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + + for (idct32_pass_loop = 0, out = pass1; + idct32_pass_loop < 2; + idct32_pass_loop++, + input = pass1, // the input of pass2 is the result of pass1 + out = pass2) { + for (i = 0; + i < 4; i++, + input += 32 * 8, out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + LOAD_FROM_TRANSPOSED(0, 1, 31) + DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(31, 17, 15) + DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + // part of stage 2 + q4s16 = vaddq_s16(q0s16, q1s16); + q13s16 = vsubq_s16(q0s16, q1s16); + q6s16 = vaddq_s16(q2s16, q3s16); + q14s16 = vsubq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + + // generate 18,19,28,29 + // part of stage 1 + LOAD_FROM_TRANSPOSED(15, 9, 23) + DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(23, 25, 7) + DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q3s16, q2s16); + q3s16 = vaddq_s16(q3s16, q2s16); + q14s16 = vsubq_s16(q1s16, q0s16); + q2s16 = vaddq_s16(q1s16, q0s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + // part of stage 4 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q10s16 = vaddq_s16(q7s16, q1s16); + q15s16 = vaddq_s16(q6s16, q3s16); + q13s16 = vsubq_s16(q5s16, q0s16); + q14s16 = vsubq_s16(q7s16, q1s16); + STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) + STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) + STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + // part of stage 4 + q13s16 = vsubq_s16(q4s16, q2s16); + q14s16 = vsubq_s16(q6s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) + STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + LOAD_FROM_TRANSPOSED(7, 5, 27) + DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(27, 21, 11) + DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + + // generate 22,23,24,25 + // part of stage 1 + LOAD_FROM_TRANSPOSED(11, 13, 19) + DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(19, 29, 3) + DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + // part of stage 2 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + // part of stage 4 + q10s16 = vaddq_s16(q7s16, q1s16); + q11s16 = vaddq_s16(q5s16, q0s16); + q12s16 = vaddq_s16(q6s16, q2s16); + q15s16 = vaddq_s16(q4s16, q3s16); + // part of stage 6 + LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q11s16); + q9s16 = vaddq_s16(q13s16, q10s16); + q13s16 = vsubq_s16(q13s16, q10s16); + q11s16 = vsubq_s16(q14s16, q11s16); + STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) + LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) + q8s16 = vsubq_s16(q9s16, q12s16); + q10s16 = vaddq_s16(q14s16, q15s16); + q14s16 = vsubq_s16(q14s16, q15s16); + q12s16 = vaddq_s16(q9s16, q12s16); + STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) + q13s16 = q11s16; + q14s16 = q8s16; + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + // part of stage 4 + q14s16 = vsubq_s16(q5s16, q0s16); + q13s16 = vsubq_s16(q6s16, q2s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); + q14s16 = vsubq_s16(q7s16, q1s16); + q13s16 = vsubq_s16(q4s16, q3s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + // part of stage 6 + LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q1s16); + q9s16 = vaddq_s16(q13s16, q6s16); + q13s16 = vsubq_s16(q13s16, q6s16); + q1s16 = vsubq_s16(q14s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) + LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) + q14s16 = vsubq_s16(q8s16, q5s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q9s16, q0s16); + q0s16 = vsubq_s16(q9s16, q0s16); + STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) + DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, + &q1s16, &q0s16); + STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + LOAD_FROM_TRANSPOSED(3, 2, 30) + DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(30, 18, 14) + DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + // part of stage 3 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 4 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + + // generate 10,11,12,13 + // part of stage 2 + LOAD_FROM_TRANSPOSED(14, 10, 22) + DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(22, 26, 6) + DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + // part of stage 3 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 4 + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + // part of stage 5 + q8s16 = vaddq_s16(q0s16, q5s16); + q9s16 = vaddq_s16(q1s16, q7s16); + q13s16 = vsubq_s16(q1s16, q7s16); + q14s16 = vsubq_s16(q3s16, q4s16); + q10s16 = vaddq_s16(q3s16, q4s16); + q15s16 = vaddq_s16(q2s16, q6s16); + STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) + STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + // part of stage 6 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) + q13s16 = vsubq_s16(q0s16, q5s16); + q14s16 = vsubq_s16(q2s16, q6s16); + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + LOAD_FROM_TRANSPOSED(6, 4, 28) + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(28, 20, 12) + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + // part of stage 4 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + + // generate 0,1,2,3 + // part of stage 4 + LOAD_FROM_TRANSPOSED(12, 0, 16) + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(16, 8, 24) + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + // part of stage 5 + q4s16 = vaddq_s16(q7s16, q6s16); + q7s16 = vsubq_s16(q7s16, q6s16); + q6s16 = vsubq_s16(q5s16, q14s16); + q5s16 = vaddq_s16(q5s16, q14s16); + // part of stage 6 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q3s16); + q10s16 = vaddq_s16(q6s16, q1s16); + q11s16 = vaddq_s16(q7s16, q0s16); + q12s16 = vsubq_s16(q7s16, q0s16); + q13s16 = vsubq_s16(q6s16, q1s16); + q14s16 = vsubq_s16(q5s16, q3s16); + q15s16 = vsubq_s16(q4s16, q2s16); + // part of stage 7 + LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) + q2s16 = vaddq_s16(q8s16, q1s16); + q3s16 = vaddq_s16(q9s16, q0s16); + q4s16 = vsubq_s16(q9s16, q0s16); + q5s16 = vsubq_s16(q8s16, q1s16); + LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, + q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); + } else { + idct32_bands_end_2nd_pass(out, dest, stride, + q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); + dest += 8; + } + } + } + return; +} diff --git a/vpx_dsp/arm/idct4x4_1_add_neon.asm b/vpx_dsp/arm/idct4x4_1_add_neon.asm new file mode 100644 index 000000000..0d4a721c4 --- /dev/null +++ b/vpx_dsp/arm/idct4x4_1_add_neon.asm @@ -0,0 +1,68 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp9_idct4x4_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_idct4x4_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 4) + add r0, r0, #8 ; + (1 <<((4) - 1)) + asr r0, r0, #4 ; >> 4 + + vdup.s16 q0, r0 ; duplicate a1 + + vld1.32 {d2[0]}, [r1], r2 + vld1.32 {d2[1]}, [r1], r2 + vld1.32 {d4[0]}, [r1], r2 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q8, q0, d2 ; dest[x] + a1 + vaddw.u8 q9, q0, d4 + + vqmovun.s16 d6, q8 ; clip_pixel + vqmovun.s16 d7, q9 + + vst1.32 {d6[0]}, [r12], r2 + vst1.32 {d6[1]}, [r12], r2 + vst1.32 {d7[0]}, [r12], r2 + vst1.32 {d7[1]}, [r12] + + bx lr + ENDP ; |vp9_idct4x4_1_add_neon| + + END diff --git a/vpx_dsp/arm/idct4x4_1_add_neon.c b/vpx_dsp/arm/idct4x4_1_add_neon.c new file mode 100644 index 000000000..75e14ccde --- /dev/null +++ b/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx_dsp/inv_txfm.h" +#include "vpx_ports/mem.h" + +void vp9_idct4x4_1_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x8_t d6u8; + uint32x2_t d2u32 = vdup_n_u32(0); + uint16x8_t q8u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + q0s16 = vdupq_n_s16(a1); + + // dc_only_idct_add + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); + d1 += dest_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), + vreinterpret_u8_u32(d2u32)); + d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); + d2 += dest_stride; + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); + d2 += dest_stride; + } + return; +} diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm new file mode 100644 index 000000000..00283fc8d --- /dev/null +++ b/vpx_dsp/arm/idct4x4_add_neon.asm @@ -0,0 +1,190 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_idct4x4_16_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_idct4x4_16_add_neon| PROC + + ; The 2D transform is done with two passes which are actually pretty + ; similar. We first transform the rows. This is done by transposing + ; the inputs, doing an SIMD column transform (the columns are the + ; transposed rows) and then transpose the results (so that it goes back + ; in normal/row positions). Then, we transform the columns by doing + ; another SIMD column transform. + ; So, two passes of a transpose followed by a column transform. + + ; load the inputs into q8-q9, d16-d19 + vld1.s16 {q8,q9}, [r0]! + + ; generate scalar constants + ; cospi_8_64 = 15137 = 0x3b21 + mov r0, #0x3b00 + add r0, #0x21 + ; cospi_16_64 = 11585 = 0x2d41 + mov r3, #0x2d00 + add r3, #0x41 + ; cospi_24_64 = 6270 = 0x 187e + mov r12, #0x1800 + add r12, #0x7e + + ; transpose the input data + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + + ; generate constant vectors + vdup.16 d20, r0 ; replicate cospi_8_64 + vdup.16 d21, r3 ; replicate cospi_16_64 + + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + vdup.16 d22, r12 ; replicate cospi_24_64 + + ; do the transform on transposed rows + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + vswp d18, d19 + + ; transpose the results + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + ; do the transform on columns + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + + ; The results are in two registers, one of them being swapped. This will + ; be taken care of by loading the 'dest' value in a swapped fashion and + ; also storing them in the same swapped fashion. + ; temp_out[0, 1] = d16, d17 = q8 + ; temp_out[2, 3] = d19, d18 = q9 swapped + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + + vld1.32 {d26[0]}, [r1], r2 + vld1.32 {d26[1]}, [r1], r2 + vld1.32 {d27[1]}, [r1], r2 + vld1.32 {d27[0]}, [r1] ; no post-increment + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d26 + vaddw.u8 q9, q9, d27 + + ; clip_pixel + vqmovun.s16 d26, q8 + vqmovun.s16 d27, q9 + + ; do the stores in reverse order with negative post-increment, by changing + ; the sign of the stride + rsb r2, r2, #0 + vst1.32 {d27[0]}, [r1], r2 + vst1.32 {d27[1]}, [r1], r2 + vst1.32 {d26[1]}, [r1], r2 + vst1.32 {d26[0]}, [r1] ; no post-increment + bx lr + ENDP ; |vp9_idct4x4_16_add_neon| + + END diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c new file mode 100644 index 000000000..dc91e0f30 --- /dev/null +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +void vp9_idct4x4_16_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x8_t d26u8, d27u8; + uint32x2_t d26u32, d27u32; + uint16x8_t q8u16, q9u16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; + int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; + int16x8_t q8s16, q9s16, q13s16, q14s16; + int32x4_t q1s32, q13s32, q14s32, q15s32; + int16x4x2_t d0x2s16, d1x2s16; + int32x4x2_t q0x2s32; + uint8_t *d; + int16_t cospi_8_64 = 15137; + int16_t cospi_16_64 = 11585; + int16_t cospi_24_64 = 6270; + + d26u32 = d27u32 = vdup_n_u32(0); + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + d20s16 = vdup_n_s16(cospi_8_64); + d21s16 = vdup_n_s16(cospi_16_64); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), + vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + d22s16 = vdup_n_s16(cospi_24_64); + + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_high_s16(q9s16); // vswp d18 d19 + d19s16 = vget_low_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), + vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + // do the transform on columns + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + q8s16 = vrshrq_n_s16(q8s16, 4); + q9s16 = vrshrq_n_s16(q9s16, 4); + + d = dest; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); + d += dest_stride; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u32(d26u32)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u32(d27u32)); + + d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + + d = dest; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); + return; +} diff --git a/vpx_dsp/arm/idct8x8_1_add_neon.asm b/vpx_dsp/arm/idct8x8_1_add_neon.asm new file mode 100644 index 000000000..421d202d4 --- /dev/null +++ b/vpx_dsp/arm/idct8x8_1_add_neon.asm @@ -0,0 +1,88 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp9_idct8x8_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_idct8x8_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 5) + add r0, r0, #16 ; + (1 <<((5) - 1)) + asr r0, r0, #5 ; >> 5 + + vdup.s16 q0, r0 ; duplicate a1 + + ; load destination data + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r2 + vld1.64 {d17}, [r1] + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |vp9_idct8x8_1_add_neon| + + END diff --git a/vpx_dsp/arm/idct8x8_1_add_neon.c b/vpx_dsp/arm/idct8x8_1_add_neon.c new file mode 100644 index 000000000..10e8e931e --- /dev/null +++ b/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx_dsp/inv_txfm.h" +#include "vpx_ports/mem.h" + +void vp9_idct8x8_1_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 5); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d5u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; + } + return; +} diff --git a/vpx_dsp/arm/idct8x8_add_neon.asm b/vpx_dsp/arm/idct8x8_add_neon.asm new file mode 100644 index 000000000..ab5bb6920 --- /dev/null +++ b/vpx_dsp/arm/idct8x8_add_neon.asm @@ -0,0 +1,519 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_idct8x8_64_add_neon| + EXPORT |vp9_idct8x8_12_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are + ; loaded in q8-q15. The output will be stored back into q8-q15 registers. + ; This macro will touch q0-q7 registers and use them as buffer during + ; calculation. + MACRO + IDCT8x8_1D + ; stage 1 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r4 ; duplicate cospi_4_64 + vdup.16 d2, r5 ; duplicate cospi_12_64 + vdup.16 d3, r6 ; duplicate cospi_20_64 + + ; input[1] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; input[5] * cospi_12_64 + vmull.s16 q5, d26, d2 + vmull.s16 q6, d27, d2 + + ; input[1]*cospi_28_64-input[7]*cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q5, d22, d3 + vmlsl.s16 q6, d23, d3 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q5, #14 ; >> 14 + vqrshrn.s32 d11, q6, #14 ; >> 14 + + ; input[1] * cospi_4_64 + vmull.s16 q2, d18, d1 + vmull.s16 q3, d19, d1 + + ; input[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q13, d27, d3 + + ; input[1]*cospi_4_64+input[7]*cospi_28_64 + vmlal.s16 q2, d30, d0 + vmlal.s16 q3, d31, d0 + + ; input[5] * cospi_20_64 + input[3] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q13, d23, d2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d14, q2, #14 ; >> 14 + vqrshrn.s32 d15, q3, #14 ; >> 14 + + ; stage 2 & stage 3 - even half + vdup.16 d0, r7 ; duplicate cospi_16_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q13, #14 ; >> 14 + + ; input[0] * cospi_16_64 + vmull.s16 q2, d16, d0 + vmull.s16 q3, d17, d0 + + ; input[0] * cospi_16_64 + vmull.s16 q13, d16, d0 + vmull.s16 q15, d17, d0 + + ; (input[0] + input[2]) * cospi_16_64 + vmlal.s16 q2, d24, d0 + vmlal.s16 q3, d25, d0 + + ; (input[0] - input[2]) * cospi_16_64 + vmlsl.s16 q13, d24, d0 + vmlsl.s16 q15, d25, d0 + + vdup.16 d0, r8 ; duplicate cospi_24_64 + vdup.16 d1, r9 ; duplicate cospi_8_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d18, q2, #14 ; >> 14 + vqrshrn.s32 d19, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d22, q13, #14 ; >> 14 + vqrshrn.s32 d23, q15, #14 ; >> 14 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + ; input[1] * cospi_24_64 + vmull.s16 q2, d20, d0 + vmull.s16 q3, d21, d0 + + ; input[1] * cospi_8_64 + vmull.s16 q8, d20, d1 + vmull.s16 q12, d21, d1 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q2, d28, d1 + vmlsl.s16 q3, d29, d1 + + ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q8, d28, d0 + vmlal.s16 q12, d29, d0 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d26, q2, #14 ; >> 14 + vqrshrn.s32 d27, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d30, q8, #14 ; >> 14 + vqrshrn.s32 d31, q12, #14 ; >> 14 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; + MEND + + ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_idct8x8_64_add_neon| PROC + push {r4-r9} + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + IDCT8x8_1D + + ; Transpose the matrix + TRANSPOSE8X8 + + ; Then transform columns + IDCT8x8_1D + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + + vpop {d8-d15} + pop {r4-r9} + bx lr + ENDP ; |vp9_idct8x8_64_add_neon| + +;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_idct8x8_12_add_neon| PROC + push {r4-r9} + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + ; stage 1 + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling + ; multiply and shift the result by 16 bits instead of 14 bits. So we need + ; to double the constants before multiplying to compensate this. + mov r12, r3, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_28_64*2 + mov r12, r4, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; dct_const_round_shift(input[1] * cospi_28_64) + vqrdmulh.s16 q4, q9, q0 + + mov r12, r6, lsl #1 + rsb r12, #0 + vdup.16 q0, r12 ; duplicate -cospi_20_64*2 + + ; dct_const_round_shift(input[1] * cospi_4_64) + vqrdmulh.s16 q7, q9, q1 + + mov r12, r5, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_12_64*2 + + ; dct_const_round_shift(- input[3] * cospi_20_64) + vqrdmulh.s16 q5, q11, q0 + + mov r12, r7, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_16_64*2 + + ; dct_const_round_shift(input[3] * cospi_12_64) + vqrdmulh.s16 q6, q11, q1 + + ; stage 2 & stage 3 - even half + mov r12, r8, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_24_64*2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrdmulh.s16 q9, q8, q0 + + mov r12, r9, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_8_64*2 + + ; dct_const_round_shift(input[1] * cospi_24_64) + vqrdmulh.s16 q13, q10, q1 + + ; dct_const_round_shift(input[1] * cospi_8_64) + vqrdmulh.s16 q15, q10, q0 + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; + + ; Transpose the matrix + TRANSPOSE8X8 + + ; Then transform columns + IDCT8x8_1D + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + + vpop {d8-d15} + pop {r4-r9} + bx lr + ENDP ; |vp9_idct8x8_12_add_neon| + + END diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c new file mode 100644 index 000000000..ea3ce4943 --- /dev/null +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void TRANSPOSE8X8( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), + vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), + vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), + vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), + vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; +} + +static INLINE void IDCT8x8_1D( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d26s16, d2s16); + q6s32 = vmull_s16(d27s16, d2s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); + q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q2s32 = vmull_s16(d18s16, d1s16); + q3s32 = vmull_s16(d19s16, d1s16); + q9s32 = vmull_s16(d26s16, d3s16); + q13s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlal_s16(q2s32, d30s16, d0s16); + q3s32 = vmlal_s16(q3s32, d31s16, d0s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q13s32 = vmlal_s16(q13s32, d23s16, d2s16); + + d14s16 = vqrshrn_n_s32(q2s32, 14); + d15s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q13s32, 14); + q6s16 = vcombine_s16(d12s16, d13s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d0s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d0s16); + q3s32 = vmull_s16(d17s16, d0s16); + q13s32 = vmull_s16(d16s16, d0s16); + q15s32 = vmull_s16(d17s16, d0s16); + + q2s32 = vmlal_s16(q2s32, d24s16, d0s16); + q3s32 = vmlal_s16(q3s32, d25s16, d0s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); + q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); + + d0s16 = vdup_n_s16(cospi_24_64); + d1s16 = vdup_n_s16(cospi_8_64); + + d18s16 = vqrshrn_n_s32(q2s32, 14); + d19s16 = vqrshrn_n_s32(q3s32, 14); + d22s16 = vqrshrn_n_s32(q13s32, 14); + d23s16 = vqrshrn_n_s32(q15s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q2s32 = vmull_s16(d20s16, d0s16); + q3s32 = vmull_s16(d21s16, d0s16); + q8s32 = vmull_s16(d20s16, d1s16); + q12s32 = vmull_s16(d21s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); + q8s32 = vmlal_s16(q8s32, d28s16, d0s16); + q12s32 = vmlal_s16(q12s32, d29s16, d0s16); + + d26s16 = vqrshrn_n_s32(q2s32, 14); + d27s16 = vqrshrn_n_s32(q3s32, 14); + d30s16 = vqrshrn_n_s32(q8s32, 14); + d31s16 = vqrshrn_n_s32(q12s32, 14); + *q13s16 = vcombine_s16(d26s16, d27s16); + *q15s16 = vcombine_s16(d30s16, d31s16); + + q0s16 = vaddq_s16(*q9s16, *q15s16); + q1s16 = vaddq_s16(*q11s16, *q13s16); + q2s16 = vsubq_s16(*q11s16, *q13s16); + q3s16 = vsubq_s16(*q9s16, *q15s16); + + *q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + *q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + *q8s16 = vaddq_s16(q0s16, q7s16); + *q9s16 = vaddq_s16(q1s16, q6s16); + *q10s16 = vaddq_s16(q2s16, q5s16); + *q11s16 = vaddq_s16(q3s16, q4s16); + *q12s16 = vsubq_s16(q3s16, q4s16); + *q13s16 = vsubq_s16(q2s16, q5s16); + *q14s16 = vsubq_s16(q1s16, q6s16); + *q15s16 = vsubq_s16(q0s16, q7s16); + return; +} + +void vp9_idct8x8_64_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; +} + +void vp9_idct8x8_12_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // First transform rows + // stage 1 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + + q0s16 = vdupq_n_s16(-cospi_20_64 * 2); + + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + q1s16 = vdupq_n_s16(cospi_12_64 * 2); + + q5s16 = vqrdmulhq_s16(q11s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_16_64 * 2); + + q6s16 = vqrdmulhq_s16(q11s16, q1s16); + + // stage 2 & stage 3 - even half + q1s16 = vdupq_n_s16(cospi_24_64 * 2); + + q9s16 = vqrdmulhq_s16(q8s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_8_64 * 2); + + q13s16 = vqrdmulhq_s16(q10s16, q1s16); + + q15s16 = vqrdmulhq_s16(q10s16, q0s16); + + // stage 3 -odd half + q0s16 = vaddq_s16(q9s16, q15s16); + q1s16 = vaddq_s16(q9s16, q13s16); + q2s16 = vsubq_s16(q9s16, q13s16); + q3s16 = vsubq_s16(q9s16, q15s16); + + // stage 2 - odd half + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + q8s16 = vaddq_s16(q0s16, q7s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q7s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; +} diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c new file mode 100644 index 000000000..6babf54d0 --- /dev/null +++ b/vpx_dsp/inv_txfm.c @@ -0,0 +1,2476 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_dsp/inv_txfm.h" + +void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { +/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = WRAPLOW(a1, 8); + op[1] = WRAPLOW(b1, 8); + op[2] = WRAPLOW(c1, 8); + op[3] = WRAPLOW(d1, 8); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1); + + ip++; + dest++; + } +} + +void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = WRAPLOW(a1, 8); + op[1] = op[2] = op[3] = WRAPLOW(e1, 8); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); + dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); + dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); + dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); + ip++; + dest++; + } +} + +void idct4_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = WRAPLOW(dct_const_round_shift(temp1), 8); + step[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = WRAPLOW(dct_const_round_shift(temp1), 8); + step[3] = WRAPLOW(dct_const_round_shift(temp2), 8); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3], 8); + output[1] = WRAPLOW(step[1] + step[2], 8); + output[2] = WRAPLOW(step[1] - step[2], 8); + output[3] = WRAPLOW(step[0] - step[3], 8); +} + +void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + idct4_c(input, outptr); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + idct4_c(temp_in, temp_out); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 4)); + } + } +} + +void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, + int dest_stride) { + int i; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = clip_pixel_add(dest[0], a1); + dest[1] = clip_pixel_add(dest[1], a1); + dest[2] = clip_pixel_add(dest[2], a1); + dest[3] = clip_pixel_add(dest[3], a1); + dest += dest_stride; + } +} + +void idct8_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + + // stage 2 & stage 3 - even half + idct4_c(step1, step1); + + // stage 2 - odd half + step2[4] = WRAPLOW(step1[4] + step1[5], 8); + step2[5] = WRAPLOW(step1[4] - step1[5], 8); + step2[6] = WRAPLOW(-step1[6] + step1[7], 8); + step2[7] = WRAPLOW(step1[6] + step1[7], 8); + + // stage 3 -odd half + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7], 8); + output[1] = WRAPLOW(step1[1] + step1[6], 8); + output[2] = WRAPLOW(step1[2] + step1[5], 8); + output[3] = WRAPLOW(step1[3] + step1[4], 8); + output[4] = WRAPLOW(step1[3] - step1[4], 8); + output[5] = WRAPLOW(step1[2] - step1[5], 8); + output[6] = WRAPLOW(step1[1] - step1[6], 8); + output[7] = WRAPLOW(step1[0] - step1[7], 8); +} + +void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + for (i = 0; i < 8; ++i) { + idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) + dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void iadst4_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8); + output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8); + output[2] = WRAPLOW(dct_const_round_shift(s2), 8); + output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8); +} + +void iadst8_c(const tran_low_t *input, tran_low_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); + s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); + s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); + s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); + s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); + s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); + s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); + s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); + + x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8); + + // stage 2 + s0 = (int)x0; + s1 = (int)x1; + s2 = (int)x2; + s3 = (int)x3; + s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); + s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); + s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); + s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); + + x0 = WRAPLOW(s0 + s2, 8); + x1 = WRAPLOW(s1 + s3, 8); + x2 = WRAPLOW(s0 - s2, 8); + x3 = WRAPLOW(s1 - s3, 8); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); + + // stage 3 + s2 = (int)(cospi_16_64 * (x2 + x3)); + s3 = (int)(cospi_16_64 * (x2 - x3)); + s6 = (int)(cospi_16_64 * (x6 + x7)); + s7 = (int)(cospi_16_64 * (x6 - x7)); + + x2 = WRAPLOW(dct_const_round_shift(s2), 8); + x3 = WRAPLOW(dct_const_round_shift(s3), 8); + x6 = WRAPLOW(dct_const_round_shift(s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s7), 8); + + output[0] = WRAPLOW(x0, 8); + output[1] = WRAPLOW(-x4, 8); + output[2] = WRAPLOW(x6, 8); + output[3] = WRAPLOW(-x2, 8); + output[4] = WRAPLOW(x3, 8); + output[5] = WRAPLOW(-x7, 8); + output[6] = WRAPLOW(x5, 8); + output[7] = WRAPLOW(-x1, 8); +} + +void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void idct16_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[16], step2[16]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = input[0/2]; + step1[1] = input[16/2]; + step1[2] = input[8/2]; + step1[3] = input[24/2]; + step1[4] = input[4/2]; + step1[5] = input[20/2]; + step1[6] = input[12/2]; + step1[7] = input[28/2]; + step1[8] = input[2/2]; + step1[9] = input[18/2]; + step1[10] = input[10/2]; + step1[11] = input[26/2]; + step1[12] = input[6/2]; + step1[13] = input[22/2]; + step1[14] = input[14/2]; + step1[15] = input[30/2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + + step1[8] = WRAPLOW(step2[8] + step2[9], 8); + step1[9] = WRAPLOW(step2[8] - step2[9], 8); + step1[10] = WRAPLOW(-step2[10] + step2[11], 8); + step1[11] = WRAPLOW(step2[10] + step2[11], 8); + step1[12] = WRAPLOW(step2[12] + step2[13], 8); + step1[13] = WRAPLOW(step2[12] - step2[13], 8); + step1[14] = WRAPLOW(-step2[14] + step2[15], 8); + step1[15] = WRAPLOW(step2[14] + step2[15], 8); + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[4] = WRAPLOW(step1[4] + step1[5], 8); + step2[5] = WRAPLOW(step1[4] - step1[5], 8); + step2[6] = WRAPLOW(-step1[6] + step1[7], 8); + step2[7] = WRAPLOW(step1[6] + step1[7], 8); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3], 8); + step1[1] = WRAPLOW(step2[1] + step2[2], 8); + step1[2] = WRAPLOW(step2[1] - step2[2], 8); + step1[3] = WRAPLOW(step2[0] - step2[3], 8); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11], 8); + step1[9] = WRAPLOW(step2[9] + step2[10], 8); + step1[10] = WRAPLOW(step2[9] - step2[10], 8); + step1[11] = WRAPLOW(step2[8] - step2[11], 8); + step1[12] = WRAPLOW(-step2[12] + step2[15], 8); + step1[13] = WRAPLOW(-step2[13] + step2[14], 8); + step1[14] = WRAPLOW(step2[13] + step2[14], 8); + step1[15] = WRAPLOW(step2[12] + step2[15], 8); + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7], 8); + step2[1] = WRAPLOW(step1[1] + step1[6], 8); + step2[2] = WRAPLOW(step1[2] + step1[5], 8); + step2[3] = WRAPLOW(step1[3] + step1[4], 8); + step2[4] = WRAPLOW(step1[3] - step1[4], 8); + step2[5] = WRAPLOW(step1[2] - step1[5], 8); + step2[6] = WRAPLOW(step1[1] - step1[6], 8); + step2[7] = WRAPLOW(step1[0] - step1[7], 8); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = WRAPLOW(step2[0] + step2[15], 8); + output[1] = WRAPLOW(step2[1] + step2[14], 8); + output[2] = WRAPLOW(step2[2] + step2[13], 8); + output[3] = WRAPLOW(step2[3] + step2[12], 8); + output[4] = WRAPLOW(step2[4] + step2[11], 8); + output[5] = WRAPLOW(step2[5] + step2[10], 8); + output[6] = WRAPLOW(step2[6] + step2[9], 8); + output[7] = WRAPLOW(step2[7] + step2[8], 8); + output[8] = WRAPLOW(step2[7] - step2[8], 8); + output[9] = WRAPLOW(step2[6] - step2[9], 8); + output[10] = WRAPLOW(step2[5] - step2[10], 8); + output[11] = WRAPLOW(step2[4] - step2[11], 8); + output[12] = WRAPLOW(step2[3] - step2[12], 8); + output[13] = WRAPLOW(step2[2] - step2[13], 8); + output[14] = WRAPLOW(step2[1] - step2[14], 8); + output[15] = WRAPLOW(step2[0] - step2[15], 8); +} + +void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void iadst16_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = output[8] + = output[9] = output[10] = output[11] = output[12] + = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = WRAPLOW(s0 + s4, 8); + x1 = WRAPLOW(s1 + s5, 8); + x2 = WRAPLOW(s2 + s6, 8); + x3 = WRAPLOW(s3 + s7, 8); + x4 = WRAPLOW(s0 - s4, 8); + x5 = WRAPLOW(s1 - s5, 8); + x6 = WRAPLOW(s2 - s6, 8); + x7 = WRAPLOW(s3 - s7, 8); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = WRAPLOW(check_range(s0 + s2), 8); + x1 = WRAPLOW(check_range(s1 + s3), 8); + x2 = WRAPLOW(check_range(s0 - s2), 8); + x3 = WRAPLOW(check_range(s1 - s3), 8); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); + x8 = WRAPLOW(check_range(s8 + s10), 8); + x9 = WRAPLOW(check_range(s9 + s11), 8); + x10 = WRAPLOW(check_range(s8 - s10), 8); + x11 = WRAPLOW(check_range(s9 - s11), 8); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = WRAPLOW(dct_const_round_shift(s2), 8); + x3 = WRAPLOW(dct_const_round_shift(s3), 8); + x6 = WRAPLOW(dct_const_round_shift(s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s7), 8); + x10 = WRAPLOW(dct_const_round_shift(s10), 8); + x11 = WRAPLOW(dct_const_round_shift(s11), 8); + x14 = WRAPLOW(dct_const_round_shift(s14), 8); + x15 = WRAPLOW(dct_const_round_shift(s15), 8); + + output[0] = WRAPLOW(x0, 8); + output[1] = WRAPLOW(-x8, 8); + output[2] = WRAPLOW(x12, 8); + output[3] = WRAPLOW(-x4, 8); + output[4] = WRAPLOW(x6, 8); + output[5] = WRAPLOW(x14, 8); + output[6] = WRAPLOW(x10, 8); + output[7] = WRAPLOW(x2, 8); + output[8] = WRAPLOW(x3, 8); + output[9] = WRAPLOW(x11, 8); + output[10] = WRAPLOW(x15, 8); + output[11] = WRAPLOW(x7, 8); + output[12] = WRAPLOW(x5, 8); + output[13] = WRAPLOW(-x13, 8); + output[14] = WRAPLOW(x9, 8); + output[15] = WRAPLOW(-x1, 8); +} + +void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + for (i = 0; i < 4; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) + dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void idct32_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + + step2[16] = WRAPLOW(step1[16] + step1[17], 8); + step2[17] = WRAPLOW(step1[16] - step1[17], 8); + step2[18] = WRAPLOW(-step1[18] + step1[19], 8); + step2[19] = WRAPLOW(step1[18] + step1[19], 8); + step2[20] = WRAPLOW(step1[20] + step1[21], 8); + step2[21] = WRAPLOW(step1[20] - step1[21], 8); + step2[22] = WRAPLOW(-step1[22] + step1[23], 8); + step2[23] = WRAPLOW(step1[22] + step1[23], 8); + step2[24] = WRAPLOW(step1[24] + step1[25], 8); + step2[25] = WRAPLOW(step1[24] - step1[25], 8); + step2[26] = WRAPLOW(-step1[26] + step1[27], 8); + step2[27] = WRAPLOW(step1[26] + step1[27], 8); + step2[28] = WRAPLOW(step1[28] + step1[29], 8); + step2[29] = WRAPLOW(step1[28] - step1[29], 8); + step2[30] = WRAPLOW(-step1[30] + step1[31], 8); + step2[31] = WRAPLOW(step1[30] + step1[31], 8); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + + step1[8] = WRAPLOW(step2[8] + step2[9], 8); + step1[9] = WRAPLOW(step2[8] - step2[9], 8); + step1[10] = WRAPLOW(-step2[10] + step2[11], 8); + step1[11] = WRAPLOW(step2[10] + step2[11], 8); + step1[12] = WRAPLOW(step2[12] + step2[13], 8); + step1[13] = WRAPLOW(step2[12] - step2[13], 8); + step1[14] = WRAPLOW(-step2[14] + step2[15], 8); + step1[15] = WRAPLOW(step2[14] + step2[15], 8); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[4] = WRAPLOW(step1[4] + step1[5], 8); + step2[5] = WRAPLOW(step1[4] - step1[5], 8); + step2[6] = WRAPLOW(-step1[6] + step1[7], 8); + step2[7] = WRAPLOW(step1[6] + step1[7], 8); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = WRAPLOW(step1[16] + step1[19], 8); + step2[17] = WRAPLOW(step1[17] + step1[18], 8); + step2[18] = WRAPLOW(step1[17] - step1[18], 8); + step2[19] = WRAPLOW(step1[16] - step1[19], 8); + step2[20] = WRAPLOW(-step1[20] + step1[23], 8); + step2[21] = WRAPLOW(-step1[21] + step1[22], 8); + step2[22] = WRAPLOW(step1[21] + step1[22], 8); + step2[23] = WRAPLOW(step1[20] + step1[23], 8); + + step2[24] = WRAPLOW(step1[24] + step1[27], 8); + step2[25] = WRAPLOW(step1[25] + step1[26], 8); + step2[26] = WRAPLOW(step1[25] - step1[26], 8); + step2[27] = WRAPLOW(step1[24] - step1[27], 8); + step2[28] = WRAPLOW(-step1[28] + step1[31], 8); + step2[29] = WRAPLOW(-step1[29] + step1[30], 8); + step2[30] = WRAPLOW(step1[29] + step1[30], 8); + step2[31] = WRAPLOW(step1[28] + step1[31], 8); + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3], 8); + step1[1] = WRAPLOW(step2[1] + step2[2], 8); + step1[2] = WRAPLOW(step2[1] - step2[2], 8); + step1[3] = WRAPLOW(step2[0] - step2[3], 8); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11], 8); + step1[9] = WRAPLOW(step2[9] + step2[10], 8); + step1[10] = WRAPLOW(step2[9] - step2[10], 8); + step1[11] = WRAPLOW(step2[8] - step2[11], 8); + step1[12] = WRAPLOW(-step2[12] + step2[15], 8); + step1[13] = WRAPLOW(-step2[13] + step2[14], 8); + step1[14] = WRAPLOW(step2[13] + step2[14], 8); + step1[15] = WRAPLOW(step2[12] + step2[15], 8); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7], 8); + step2[1] = WRAPLOW(step1[1] + step1[6], 8); + step2[2] = WRAPLOW(step1[2] + step1[5], 8); + step2[3] = WRAPLOW(step1[3] + step1[4], 8); + step2[4] = WRAPLOW(step1[3] - step1[4], 8); + step2[5] = WRAPLOW(step1[2] - step1[5], 8); + step2[6] = WRAPLOW(step1[1] - step1[6], 8); + step2[7] = WRAPLOW(step1[0] - step1[7], 8); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = WRAPLOW(step1[16] + step1[23], 8); + step2[17] = WRAPLOW(step1[17] + step1[22], 8); + step2[18] = WRAPLOW(step1[18] + step1[21], 8); + step2[19] = WRAPLOW(step1[19] + step1[20], 8); + step2[20] = WRAPLOW(step1[19] - step1[20], 8); + step2[21] = WRAPLOW(step1[18] - step1[21], 8); + step2[22] = WRAPLOW(step1[17] - step1[22], 8); + step2[23] = WRAPLOW(step1[16] - step1[23], 8); + + step2[24] = WRAPLOW(-step1[24] + step1[31], 8); + step2[25] = WRAPLOW(-step1[25] + step1[30], 8); + step2[26] = WRAPLOW(-step1[26] + step1[29], 8); + step2[27] = WRAPLOW(-step1[27] + step1[28], 8); + step2[28] = WRAPLOW(step1[27] + step1[28], 8); + step2[29] = WRAPLOW(step1[26] + step1[29], 8); + step2[30] = WRAPLOW(step1[25] + step1[30], 8); + step2[31] = WRAPLOW(step1[24] + step1[31], 8); + + // stage 7 + step1[0] = WRAPLOW(step2[0] + step2[15], 8); + step1[1] = WRAPLOW(step2[1] + step2[14], 8); + step1[2] = WRAPLOW(step2[2] + step2[13], 8); + step1[3] = WRAPLOW(step2[3] + step2[12], 8); + step1[4] = WRAPLOW(step2[4] + step2[11], 8); + step1[5] = WRAPLOW(step2[5] + step2[10], 8); + step1[6] = WRAPLOW(step2[6] + step2[9], 8); + step1[7] = WRAPLOW(step2[7] + step2[8], 8); + step1[8] = WRAPLOW(step2[7] - step2[8], 8); + step1[9] = WRAPLOW(step2[6] - step2[9], 8); + step1[10] = WRAPLOW(step2[5] - step2[10], 8); + step1[11] = WRAPLOW(step2[4] - step2[11], 8); + step1[12] = WRAPLOW(step2[3] - step2[12], 8); + step1[13] = WRAPLOW(step2[2] - step2[13], 8); + step1[14] = WRAPLOW(step2[1] - step2[14], 8); + step1[15] = WRAPLOW(step2[0] - step2[15], 8); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = WRAPLOW(step1[0] + step1[31], 8); + output[1] = WRAPLOW(step1[1] + step1[30], 8); + output[2] = WRAPLOW(step1[2] + step1[29], 8); + output[3] = WRAPLOW(step1[3] + step1[28], 8); + output[4] = WRAPLOW(step1[4] + step1[27], 8); + output[5] = WRAPLOW(step1[5] + step1[26], 8); + output[6] = WRAPLOW(step1[6] + step1[25], 8); + output[7] = WRAPLOW(step1[7] + step1[24], 8); + output[8] = WRAPLOW(step1[8] + step1[23], 8); + output[9] = WRAPLOW(step1[9] + step1[22], 8); + output[10] = WRAPLOW(step1[10] + step1[21], 8); + output[11] = WRAPLOW(step1[11] + step1[20], 8); + output[12] = WRAPLOW(step1[12] + step1[19], 8); + output[13] = WRAPLOW(step1[13] + step1[18], 8); + output[14] = WRAPLOW(step1[14] + step1[17], 8); + output[15] = WRAPLOW(step1[15] + step1[16], 8); + output[16] = WRAPLOW(step1[15] - step1[16], 8); + output[17] = WRAPLOW(step1[14] - step1[17], 8); + output[18] = WRAPLOW(step1[13] - step1[18], 8); + output[19] = WRAPLOW(step1[12] - step1[19], 8); + output[20] = WRAPLOW(step1[11] - step1[20], 8); + output[21] = WRAPLOW(step1[10] - step1[21], 8); + output[22] = WRAPLOW(step1[9] - step1[22], 8); + output[23] = WRAPLOW(step1[8] - step1[23], 8); + output[24] = WRAPLOW(step1[7] - step1[24], 8); + output[25] = WRAPLOW(step1[6] - step1[25], 8); + output[26] = WRAPLOW(step1[5] - step1[26], 8); + output[27] = WRAPLOW(step1[4] - step1[27], 8); + output[28] = WRAPLOW(step1[3] - step1[28], 8); + output[29] = WRAPLOW(step1[2] - step1[29], 8); + output[30] = WRAPLOW(step1[1] - step1[30], 8); + output[31] = WRAPLOW(step1[0] - step1[31], 8); +} + +void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + for (i = 0; i < 32; ++i) { + int16_t zero_coeff[16]; + for (j = 0; j < 16; ++j) + zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 8; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 4; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 2; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + + if (zero_coeff[0] | zero_coeff[1]) + idct32_c(input, outptr); + else + memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32] = {0}; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + a1 = ROUND_POWER_OF_TWO(out, 6); + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) + dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = WRAPLOW(a1, bd); + op[1] = WRAPLOW(b1, bd); + op[2] = WRAPLOW(c1, bd); + op[3] = WRAPLOW(d1, bd); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); + + ip++; + dest++; + } +} + +void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void) bd; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = WRAPLOW(a1, bd); + op[1] = op[2] = op[3] = WRAPLOW(e1, bd); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = highbd_clip_pixel_add( + dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = highbd_clip_pixel_add( + dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = highbd_clip_pixel_add( + dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = highbd_clip_pixel_add( + dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + +void vp9_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + (void) bd; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3], bd); + output[1] = WRAPLOW(step[1] + step[2], bd); + output[2] = WRAPLOW(step[1] - step[2], bd); + output[3] = WRAPLOW(step[0] - step[3], bd); +} + +void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 4; ++i) { + vp9_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + vp9_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } +} + +void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1; + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); + dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); + dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); + dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); + dest += dest_stride; + } +} + +void vp9_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + // stage 2 & stage 3 - even half + vp9_highbd_idct4_c(step1, step1, bd); + + // stage 2 - odd half + step2[4] = WRAPLOW(step1[4] + step1[5], bd); + step2[5] = WRAPLOW(step1[4] - step1[5], bd); + step2[6] = WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = WRAPLOW(step1[6] + step1[7], bd); + + // stage 3 - odd half + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7], bd); + output[1] = WRAPLOW(step1[1] + step1[6], bd); + output[2] = WRAPLOW(step1[2] + step1[5], bd); + output[3] = WRAPLOW(step1[3] + step1[4], bd); + output[4] = WRAPLOW(step1[3] - step1[4], bd); + output[5] = WRAPLOW(step1[2] - step1[5], bd); + output[6] = WRAPLOW(step1[1] - step1[6], bd); + output[7] = WRAPLOW(step1[0] - step1[7], bd); +} + +void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. + for (i = 0; i < 8; ++i) { + vp9_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Then transform columns. + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vp9_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) + dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} + +void highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + (void) bd; + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = (tran_high_t)(x0 - x2 + x3); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd); + output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd); + output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); + output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd); +} + +void highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[7]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[5]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[3]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[1]; + tran_low_t x7 = input[6]; + (void) bd; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd); + x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = WRAPLOW(s0 + s2, bd); + x1 = WRAPLOW(s1 + s3, bd); + x2 = WRAPLOW(s0 - s2, bd); + x3 = WRAPLOW(s1 - s3, bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); + + output[0] = WRAPLOW(x0, bd); + output[1] = WRAPLOW(-x4, bd); + output[2] = WRAPLOW(x6, bd); + output[3] = WRAPLOW(-x2, bd); + output[4] = WRAPLOW(x3, bd); + output[5] = WRAPLOW(-x7, bd); + output[6] = WRAPLOW(x5, bd); + output[7] = WRAPLOW(-x1, bd); +} + +void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. + // Only first 4 row has non-zero coefs. + for (i = 0; i < 4; ++i) { + vp9_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + // Then transform columns. + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vp9_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +void vp9_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[16], step2[16]; + tran_high_t temp1, temp2; + (void) bd; + + // stage 1 + step1[0] = input[0/2]; + step1[1] = input[16/2]; + step1[2] = input[8/2]; + step1[3] = input[24/2]; + step1[4] = input[4/2]; + step1[5] = input[20/2]; + step1[6] = input[12/2]; + step1[7] = input[28/2]; + step1[8] = input[2/2]; + step1[9] = input[18/2]; + step1[10] = input[10/2]; + step1[11] = input[26/2]; + step1[12] = input[6/2]; + step1[13] = input[22/2]; + step1[14] = input[14/2]; + step1[15] = input[30/2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + step1[8] = WRAPLOW(step2[8] + step2[9], bd); + step1[9] = WRAPLOW(step2[8] - step2[9], bd); + step1[10] = WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = WRAPLOW(step2[10] + step2[11], bd); + step1[12] = WRAPLOW(step2[12] + step2[13], bd); + step1[13] = WRAPLOW(step2[12] - step2[13], bd); + step1[14] = WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = WRAPLOW(step2[14] + step2[15], bd); + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[4] = WRAPLOW(step1[4] + step1[5], bd); + step2[5] = WRAPLOW(step1[4] - step1[5], bd); + step2[6] = WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = WRAPLOW(step1[6] + step1[7], bd); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3], bd); + step1[1] = WRAPLOW(step2[1] + step2[2], bd); + step1[2] = WRAPLOW(step2[1] - step2[2], bd); + step1[3] = WRAPLOW(step2[0] - step2[3], bd); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11], bd); + step1[9] = WRAPLOW(step2[9] + step2[10], bd); + step1[10] = WRAPLOW(step2[9] - step2[10], bd); + step1[11] = WRAPLOW(step2[8] - step2[11], bd); + step1[12] = WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = WRAPLOW(step2[13] + step2[14], bd); + step1[15] = WRAPLOW(step2[12] + step2[15], bd); + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7], bd); + step2[1] = WRAPLOW(step1[1] + step1[6], bd); + step2[2] = WRAPLOW(step1[2] + step1[5], bd); + step2[3] = WRAPLOW(step1[3] + step1[4], bd); + step2[4] = WRAPLOW(step1[3] - step1[4], bd); + step2[5] = WRAPLOW(step1[2] - step1[5], bd); + step2[6] = WRAPLOW(step1[1] - step1[6], bd); + step2[7] = WRAPLOW(step1[0] - step1[7], bd); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = WRAPLOW(step2[0] + step2[15], bd); + output[1] = WRAPLOW(step2[1] + step2[14], bd); + output[2] = WRAPLOW(step2[2] + step2[13], bd); + output[3] = WRAPLOW(step2[3] + step2[12], bd); + output[4] = WRAPLOW(step2[4] + step2[11], bd); + output[5] = WRAPLOW(step2[5] + step2[10], bd); + output[6] = WRAPLOW(step2[6] + step2[9], bd); + output[7] = WRAPLOW(step2[7] + step2[8], bd); + output[8] = WRAPLOW(step2[7] - step2[8], bd); + output[9] = WRAPLOW(step2[6] - step2[9], bd); + output[10] = WRAPLOW(step2[5] - step2[10], bd); + output[11] = WRAPLOW(step2[4] - step2[11], bd); + output[12] = WRAPLOW(step2[3] - step2[12], bd); + output[13] = WRAPLOW(step2[2] - step2[13], bd); + output[14] = WRAPLOW(step2[1] - step2[14], bd); + output[15] = WRAPLOW(step2[0] - step2[15], bd); +} + +void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. + for (i = 0; i < 16; ++i) { + vp9_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns. + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + vp9_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_low_t x0 = input[15]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[13]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[11]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[9]; + tran_low_t x7 = input[6]; + tran_low_t x8 = input[7]; + tran_low_t x9 = input[8]; + tran_low_t x10 = input[5]; + tran_low_t x11 = input[10]; + tran_low_t x12 = input[3]; + tran_low_t x13 = input[12]; + tran_low_t x14 = input[1]; + tran_low_t x15 = input[14]; + (void) bd; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd); + x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd); + x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd); + x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd); + x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd); + x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd); + x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd); + x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = WRAPLOW(s0 + s4, bd); + x1 = WRAPLOW(s1 + s5, bd); + x2 = WRAPLOW(s2 + s6, bd); + x3 = WRAPLOW(s3 + s7, bd); + x4 = WRAPLOW(s0 - s4, bd); + x5 = WRAPLOW(s1 - s5, bd); + x6 = WRAPLOW(s2 - s6, bd); + x7 = WRAPLOW(s3 - s7, bd); + x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd); + x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd); + x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd); + x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd); + x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd); + x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = WRAPLOW(s0 + s2, bd); + x1 = WRAPLOW(s1 + s3, bd); + x2 = WRAPLOW(s0 - s2, bd); + x3 = WRAPLOW(s1 - s3, bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); + x8 = WRAPLOW(s8 + s10, bd); + x9 = WRAPLOW(s9 + s11, bd); + x10 = WRAPLOW(s8 - s10, bd); + x11 = WRAPLOW(s9 - s11, bd); + x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd); + x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); + x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd); + x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd); + + output[0] = WRAPLOW(x0, bd); + output[1] = WRAPLOW(-x8, bd); + output[2] = WRAPLOW(x12, bd); + output[3] = WRAPLOW(-x4, bd); + output[4] = WRAPLOW(x6, bd); + output[5] = WRAPLOW(x14, bd); + output[6] = WRAPLOW(x10, bd); + output[7] = WRAPLOW(x2, bd); + output[8] = WRAPLOW(x3, bd); + output[9] = WRAPLOW(x11, bd); + output[10] = WRAPLOW(x15, bd); + output[11] = WRAPLOW(x7, bd); + output[12] = WRAPLOW(x5, bd); + output[13] = WRAPLOW(-x13, bd); + output[14] = WRAPLOW(x9, bd); + output[15] = WRAPLOW(-x1, bd); +} + +void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + for (i = 0; i < 4; ++i) { + vp9_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns. + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + vp9_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) + dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} + +void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; + (void) bd; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + step2[16] = WRAPLOW(step1[16] + step1[17], bd); + step2[17] = WRAPLOW(step1[16] - step1[17], bd); + step2[18] = WRAPLOW(-step1[18] + step1[19], bd); + step2[19] = WRAPLOW(step1[18] + step1[19], bd); + step2[20] = WRAPLOW(step1[20] + step1[21], bd); + step2[21] = WRAPLOW(step1[20] - step1[21], bd); + step2[22] = WRAPLOW(-step1[22] + step1[23], bd); + step2[23] = WRAPLOW(step1[22] + step1[23], bd); + step2[24] = WRAPLOW(step1[24] + step1[25], bd); + step2[25] = WRAPLOW(step1[24] - step1[25], bd); + step2[26] = WRAPLOW(-step1[26] + step1[27], bd); + step2[27] = WRAPLOW(step1[26] + step1[27], bd); + step2[28] = WRAPLOW(step1[28] + step1[29], bd); + step2[29] = WRAPLOW(step1[28] - step1[29], bd); + step2[30] = WRAPLOW(-step1[30] + step1[31], bd); + step2[31] = WRAPLOW(step1[30] + step1[31], bd); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + + step1[8] = WRAPLOW(step2[8] + step2[9], bd); + step1[9] = WRAPLOW(step2[8] - step2[9], bd); + step1[10] = WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = WRAPLOW(step2[10] + step2[11], bd); + step1[12] = WRAPLOW(step2[12] + step2[13], bd); + step1[13] = WRAPLOW(step2[12] - step2[13], bd); + step1[14] = WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = WRAPLOW(step2[14] + step2[15], bd); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[4] = WRAPLOW(step1[4] + step1[5], bd); + step2[5] = WRAPLOW(step1[4] - step1[5], bd); + step2[6] = WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = WRAPLOW(step1[6] + step1[7], bd); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = WRAPLOW(step1[16] + step1[19], bd); + step2[17] = WRAPLOW(step1[17] + step1[18], bd); + step2[18] = WRAPLOW(step1[17] - step1[18], bd); + step2[19] = WRAPLOW(step1[16] - step1[19], bd); + step2[20] = WRAPLOW(-step1[20] + step1[23], bd); + step2[21] = WRAPLOW(-step1[21] + step1[22], bd); + step2[22] = WRAPLOW(step1[21] + step1[22], bd); + step2[23] = WRAPLOW(step1[20] + step1[23], bd); + + step2[24] = WRAPLOW(step1[24] + step1[27], bd); + step2[25] = WRAPLOW(step1[25] + step1[26], bd); + step2[26] = WRAPLOW(step1[25] - step1[26], bd); + step2[27] = WRAPLOW(step1[24] - step1[27], bd); + step2[28] = WRAPLOW(-step1[28] + step1[31], bd); + step2[29] = WRAPLOW(-step1[29] + step1[30], bd); + step2[30] = WRAPLOW(step1[29] + step1[30], bd); + step2[31] = WRAPLOW(step1[28] + step1[31], bd); + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3], bd); + step1[1] = WRAPLOW(step2[1] + step2[2], bd); + step1[2] = WRAPLOW(step2[1] - step2[2], bd); + step1[3] = WRAPLOW(step2[0] - step2[3], bd); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11], bd); + step1[9] = WRAPLOW(step2[9] + step2[10], bd); + step1[10] = WRAPLOW(step2[9] - step2[10], bd); + step1[11] = WRAPLOW(step2[8] - step2[11], bd); + step1[12] = WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = WRAPLOW(step2[13] + step2[14], bd); + step1[15] = WRAPLOW(step2[12] + step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7], bd); + step2[1] = WRAPLOW(step1[1] + step1[6], bd); + step2[2] = WRAPLOW(step1[2] + step1[5], bd); + step2[3] = WRAPLOW(step1[3] + step1[4], bd); + step2[4] = WRAPLOW(step1[3] - step1[4], bd); + step2[5] = WRAPLOW(step1[2] - step1[5], bd); + step2[6] = WRAPLOW(step1[1] - step1[6], bd); + step2[7] = WRAPLOW(step1[0] - step1[7], bd); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = WRAPLOW(step1[16] + step1[23], bd); + step2[17] = WRAPLOW(step1[17] + step1[22], bd); + step2[18] = WRAPLOW(step1[18] + step1[21], bd); + step2[19] = WRAPLOW(step1[19] + step1[20], bd); + step2[20] = WRAPLOW(step1[19] - step1[20], bd); + step2[21] = WRAPLOW(step1[18] - step1[21], bd); + step2[22] = WRAPLOW(step1[17] - step1[22], bd); + step2[23] = WRAPLOW(step1[16] - step1[23], bd); + + step2[24] = WRAPLOW(-step1[24] + step1[31], bd); + step2[25] = WRAPLOW(-step1[25] + step1[30], bd); + step2[26] = WRAPLOW(-step1[26] + step1[29], bd); + step2[27] = WRAPLOW(-step1[27] + step1[28], bd); + step2[28] = WRAPLOW(step1[27] + step1[28], bd); + step2[29] = WRAPLOW(step1[26] + step1[29], bd); + step2[30] = WRAPLOW(step1[25] + step1[30], bd); + step2[31] = WRAPLOW(step1[24] + step1[31], bd); + + // stage 7 + step1[0] = WRAPLOW(step2[0] + step2[15], bd); + step1[1] = WRAPLOW(step2[1] + step2[14], bd); + step1[2] = WRAPLOW(step2[2] + step2[13], bd); + step1[3] = WRAPLOW(step2[3] + step2[12], bd); + step1[4] = WRAPLOW(step2[4] + step2[11], bd); + step1[5] = WRAPLOW(step2[5] + step2[10], bd); + step1[6] = WRAPLOW(step2[6] + step2[9], bd); + step1[7] = WRAPLOW(step2[7] + step2[8], bd); + step1[8] = WRAPLOW(step2[7] - step2[8], bd); + step1[9] = WRAPLOW(step2[6] - step2[9], bd); + step1[10] = WRAPLOW(step2[5] - step2[10], bd); + step1[11] = WRAPLOW(step2[4] - step2[11], bd); + step1[12] = WRAPLOW(step2[3] - step2[12], bd); + step1[13] = WRAPLOW(step2[2] - step2[13], bd); + step1[14] = WRAPLOW(step2[1] - step2[14], bd); + step1[15] = WRAPLOW(step2[0] - step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = WRAPLOW(step1[0] + step1[31], bd); + output[1] = WRAPLOW(step1[1] + step1[30], bd); + output[2] = WRAPLOW(step1[2] + step1[29], bd); + output[3] = WRAPLOW(step1[3] + step1[28], bd); + output[4] = WRAPLOW(step1[4] + step1[27], bd); + output[5] = WRAPLOW(step1[5] + step1[26], bd); + output[6] = WRAPLOW(step1[6] + step1[25], bd); + output[7] = WRAPLOW(step1[7] + step1[24], bd); + output[8] = WRAPLOW(step1[8] + step1[23], bd); + output[9] = WRAPLOW(step1[9] + step1[22], bd); + output[10] = WRAPLOW(step1[10] + step1[21], bd); + output[11] = WRAPLOW(step1[11] + step1[20], bd); + output[12] = WRAPLOW(step1[12] + step1[19], bd); + output[13] = WRAPLOW(step1[13] + step1[18], bd); + output[14] = WRAPLOW(step1[14] + step1[17], bd); + output[15] = WRAPLOW(step1[15] + step1[16], bd); + output[16] = WRAPLOW(step1[15] - step1[16], bd); + output[17] = WRAPLOW(step1[14] - step1[17], bd); + output[18] = WRAPLOW(step1[13] - step1[18], bd); + output[19] = WRAPLOW(step1[12] - step1[19], bd); + output[20] = WRAPLOW(step1[11] - step1[20], bd); + output[21] = WRAPLOW(step1[10] - step1[21], bd); + output[22] = WRAPLOW(step1[9] - step1[22], bd); + output[23] = WRAPLOW(step1[8] - step1[23], bd); + output[24] = WRAPLOW(step1[7] - step1[24], bd); + output[25] = WRAPLOW(step1[6] - step1[25], bd); + output[26] = WRAPLOW(step1[5] - step1[26], bd); + output[27] = WRAPLOW(step1[4] - step1[27], bd); + output[28] = WRAPLOW(step1[3] - step1[28], bd); + output[29] = WRAPLOW(step1[2] - step1[29], bd); + output[30] = WRAPLOW(step1[1] - step1[30], bd); + output[31] = WRAPLOW(step1[0] - step1[31], bd); +} + +void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 32; ++i) { + tran_low_t zero_coeff[16]; + for (j = 0; j < 16; ++j) + zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 8; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 4; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 2; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + + if (zero_coeff[0] | zero_coeff[1]) + highbd_idct32_c(input, outptr, bd); + else + memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vp9_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[32 * 32] = {0}; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + // Only upper-left 8x8 has non-zero coeff. + for (i = 0; i < 8; ++i) { + highbd_idct32_c(input, outptr, bd); + input += 32; + outptr += 32; + } + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + int a1; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + a1 = ROUND_POWER_OF_TWO(out, 6); + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) + dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h new file mode 100644 index 000000000..b11039ab8 --- /dev/null +++ b/vpx_dsp/inv_txfm.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_INV_TXFM_H_ +#define VPX_DSP_INV_TXFM_H_ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE tran_low_t check_range(tran_high_t input) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid VP9 input streams, intermediate stage coefficients should always + // stay within the range of a signed 16 bit integer. Coefficients can go out + // of this range for invalid/corrupt VP9 streams. However, strictly checking + // this range for every intermediate coefficient can burdensome for a decoder, + // therefore the following assertion is only enabled when configured with + // --enable-coefficient-range-checking. + assert(INT16_MIN <= input); + assert(input <= INT16_MAX); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + return (tran_low_t)input; +} + +static INLINE tran_low_t dct_const_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return check_range(rv); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE tran_low_t highbd_check_range(tran_high_t input, + int bd) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid highbitdepth VP9 streams, intermediate stage coefficients will + // stay within the ranges: + // - 8 bit: signed 16 bit integer + // - 10 bit: signed 18 bit integer + // - 12 bit: signed 20 bit integer + const int32_t int_max = (1 << (7 + bd)) - 1; + const int32_t int_min = -int_max - 1; + assert(int_min <= input); + assert(input <= int_max); + (void) int_min; +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + (void) bd; + return (tran_low_t)input; +} + +static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input, + int bd) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return highbd_check_range(rv, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_EMULATE_HARDWARE +// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a +// non-normative method to handle overflows. A stream that causes +// overflows in the inverse transform is considered invalid in VP9, +// and a hardware implementer is free to choose any reasonable +// method to handle overflows. However to aid in hardware +// verification they can use a specific implementation of the +// WRAPLOW() macro below that is identical to their intended +// hardware implementation (and also use configure options to trigger +// the C-implementation of the transform). +// +// The particular WRAPLOW implementation below performs strict +// overflow wrapping to match common hardware implementations. +// bd of 8 uses trans_low with 16bits, need to remove 16bits +// bd of 10 uses trans_low with 18bits, need to remove 14bits +// bd of 12 uses trans_low with 20bits, need to remove 12bits +// bd of x uses trans_low with 8+x bits, need to remove 24-x bits +#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd)) +#else +#define WRAPLOW(x, bd) ((int32_t)(x)) +#endif // CONFIG_EMULATE_HARDWARE + +void idct4_c(const tran_low_t *input, tran_low_t *output); +void idct8_c(const tran_low_t *input, tran_low_t *output); +void idct16_c(const tran_low_t *input, tran_low_t *output); +void idct32_c(const tran_low_t *input, tran_low_t *output); +void iadst4_c(const tran_low_t *input, tran_low_t *output); +void iadst8_c(const tran_low_t *input, tran_low_t *output); +void iadst16_c(const tran_low_t *input, tran_low_t *output); + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd); +void vp9_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd); +void vp9_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd); +void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd); + +void highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd); +void highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd); +void highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); + +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + trans = WRAPLOW(trans, bd); + return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd); +} +#endif + +static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { + trans = WRAPLOW(trans, 8); + return clip_pixel(WRAPLOW(dest + trans, 8)); +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_INV_TXFM_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 4c5545c1a..1416b98d4 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -169,6 +169,43 @@ DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c endif # CONFIG_VP9_ENCODER +# inverse transform +ifeq ($(CONFIG_VP9),yes) +DSP_SRCS-yes += inv_txfm.h +DSP_SRCS-yes += inv_txfm.c +DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.asm +ifeq ($(ARCH_X86_64),yes) +ifeq ($(CONFIG_USE_X86INC),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm +endif # CONFIG_USE_X86INC +endif # ARCH_X86_64 + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) +DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM) +else +ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/idct4x4_1_add_neon.c +DSP_SRCS-yes += arm/idct4x4_add_neon.c +DSP_SRCS-yes += arm/idct8x8_1_add_neon.c +DSP_SRCS-yes += arm/idct8x8_add_neon.c +DSP_SRCS-yes += arm/idct16x16_1_add_neon.c +DSP_SRCS-yes += arm/idct16x16_add_neon.c +DSP_SRCS-yes += arm/idct32x32_1_add_neon.c +DSP_SRCS-yes += arm/idct32x32_add_neon.c +endif # HAVE_NEON +endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c +endif # CONFIG_VP9 + # quantization ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 3b732b9f3..ca564bcf2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -592,6 +592,193 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER +# +# Inverse transform +if (vpx_config("CONFIG_VP9") eq "yes") { +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + # Note as optimized versions of these functions are added we need to add a check to ensure + # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. + add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_1_add/; + + add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_16_add/; + + add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_1_add/; + + add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_64_add/; + + add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_12_add/; + + add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_1_add/; + + add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_256_add/; + + add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_10_add/; + + add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1024_add/; + + add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_34_add/; + + add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1_add/; + + add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_1_add/; + + add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_16_add/; + + add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct4x4_1_add/; + + add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct8x8_1_add/; + + add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct16x16_1_add/; + + add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct32x32_1024_add/; + + add_proto qw/void vp9_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct32x32_34_add/; + + add_proto qw/void vp9_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct32x32_1_add/; + + add_proto qw/void vp9_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_iwht4x4_1_add/; + + add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_iwht4x4_16_add/; + + # Force C versions if CONFIG_EMULATE_HARDWARE is 1 + if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { + add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct4x4_16_add/; + + add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct8x8_64_add/; + + add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct8x8_10_add/; + + add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct16x16_256_add/; + + add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct16x16_10_add/; + } else { + add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct4x4_16_add sse2/; + + add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct8x8_64_add sse2/; + + add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct8x8_10_add sse2/; + + add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct16x16_256_add sse2/; + + add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_highbd_idct16x16_10_add sse2/; + } # CONFIG_EMULATE_HARDWARE +} else { + # Force C versions if CONFIG_EMULATE_HARDWARE is 1 + if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { + add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_1_add/; + + add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_16_add/; + + add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_1_add/; + + add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_64_add/; + + add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_12_add/; + + add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_1_add/; + + add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_256_add/; + + add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_10_add/; + + add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1024_add/; + + add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_34_add/; + + add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1_add/; + + add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_1_add/; + + add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_16_add/; + } else { + add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_1_add sse2 neon dspr2 msa/; + + add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_16_add sse2 neon dspr2 msa/; + + add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_1_add sse2 neon dspr2 msa/; + + add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc"; + + add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc"; + + add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/; + + add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_256_add sse2 neon dspr2 msa/; + + add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_10_add sse2 neon dspr2 msa/; + + add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2 msa/; + + add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2 msa/; + # Need to add 34 eob idct32x32 neon implementation. + $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon; + + add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/; + + add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_1_add msa/; + + add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc"; + } # CONFIG_EMULATE_HARDWARE +} # CONFIG_VP9_HIGHBITDEPTH +} # CONFIG_VP9 + # # Quantization # diff --git a/vpx_dsp/x86/inv_txfm_sse2.asm b/vpx_dsp/x86/inv_txfm_sse2.asm new file mode 100644 index 000000000..69b68e6d8 --- /dev/null +++ b/vpx_dsp/x86/inv_txfm_sse2.asm @@ -0,0 +1,102 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro REORDER_INPUTS 0 + ; a c d b to a b c d + SWAP 1, 3, 2 +%endmacro + +%macro TRANSFORM_COLS 0 + ; input: + ; m0 a + ; m1 b + ; m2 c + ; m3 d + paddw m0, m2 + psubw m3, m1 + + ; wide subtract + punpcklwd m4, m0 + punpcklwd m5, m3 + psrad m4, 16 + psrad m5, 16 + psubd m4, m5 + psrad m4, 1 + packssdw m4, m4 ; e + + psubw m5, m4, m1 ; b + psubw m4, m2 ; c + psubw m0, m5 + paddw m3, m4 + ; m0 a + SWAP 1, 5 ; m1 b + SWAP 2, 4 ; m2 c + ; m3 d +%endmacro + +%macro TRANSPOSE_4X4 0 + punpcklwd m0, m2 + punpcklwd m1, m3 + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 +%macro TRANSPOSE_4X4_WIDE 0 + mova m3, m0 + punpcklwd m0, m1 + punpckhwd m3, m1 + mova m2, m0 + punpcklwd m0, m3 + punpckhwd m2, m3 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero + movq m%3, [outputq] + movq m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%1, m%3 + paddw m%2, m%4 + packuswb m%1, m%5 + packuswb m%2, m%5 + movd [outputq], m%1 + movd [outputq + strideq], m%2 +%endmacro + +INIT_XMM sse2 +cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride + mova m0, [inputq + 0] + mova m1, [inputq + 16] + + psraw m0, 2 + psraw m1, 2 + + TRANSPOSE_4X4_WIDE + REORDER_INPUTS + TRANSFORM_COLS + TRANSPOSE_4X4 + REORDER_INPUTS + TRANSFORM_COLS + + pxor m4, m4 + ADD_STORE_4P_2X 0, 1, 5, 6, 4 + lea outputq, [outputq + 2 * strideq] + ADD_STORE_4P_2X 2, 3, 5, 6, 4 + + RET diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c new file mode 100644 index 000000000..f894ee678 --- /dev/null +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -0,0 +1,4053 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +#define RECON_AND_STORE4X4(dest, in_x) \ +{ \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)(dest) = _mm_cvtsi128_si32(d0); \ +} + +void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + const __m128i cst = _mm_setr_epi16( + (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i input0, input1, input2, input3; + + // Rows + input0 = _mm_load_si128((const __m128i *)input); + input2 = _mm_load_si128((const __m128i *)(input + 8)); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input0, 0xd8); + input0 = _mm_shufflehi_epi16(input0, 0xd8); + input2 = _mm_shufflelo_epi16(input2, 0xd8); + input2 = _mm_shufflehi_epi16(input2, 0xd8); + + input1 = _mm_unpackhi_epi32(input0, input0); + input0 = _mm_unpacklo_epi32(input0, input0); + input3 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpacklo_epi32(input2, input2); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input1); + input1 = _mm_packs_epi32(input2, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. + input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Columns + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_unpacklo_epi32(input2, input2); + input1 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpackhi_epi32(input3, input3); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input2); + input1 = _mm_packs_epi32(input1, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. + input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Final round and shift + input2 = _mm_add_epi16(input2, eight); + input3 = _mm_add_epi16(input3, eight); + + input2 = _mm_srai_epi16(input2, 4); + input3 = _mm_srai_epi16(input3, 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d2 = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, input2); + d2 = _mm_add_epi16(d2, input3); + d0 = _mm_packus_epi16(d0, d2); + // store input0 + *(int *)dest = _mm_cvtsi128_si32(d0); + // store input1 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store input2 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + // store input3 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + } +} + +void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE4X4(dest + 0 * stride, dc_value); + RECON_AND_STORE4X4(dest + 1 * stride, dc_value); + RECON_AND_STORE4X4(dest + 2 * stride, dc_value); + RECON_AND_STORE4X4(dest + 3 * stride, dc_value); +} + +static INLINE void transpose_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + +void idct4_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + + transpose_4x4(in); + // stage 1 + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[3], v[2]); + + // stage 2 + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); +} + +void iadst4_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); + const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8], in7; + + transpose_4x4(in); + in7 = _mm_srli_si128(in[1], 8); + in7 = _mm_add_epi16(in7, in[0]); + in7 = _mm_sub_epi16(in7, in[1]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpackhi_epi16(in[0], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 + v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(v[3], v[4]); + u[2] = v[2]; + u[3] = _mm_add_epi32(u[0], u[1]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_add_epi32(u[3], v[5]); + u[6] = _mm_sub_epi32(u[5], u[4]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); +} + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + } + +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ + out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + } + +#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + } + +// Define Macro for multiplying elements by constants and adding them together. +#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ + cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + } + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_4, \ + stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_0, \ + stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + } + +void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. + in0 = _mm_load_si128((const __m128i *)input); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 5); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE(dest + 0 * stride, dc_value); + RECON_AND_STORE(dest + 1 * stride, dc_value); + RECON_AND_STORE(dest + 2 * stride, dc_value); + RECON_AND_STORE(dest + 3 * stride, dc_value); + RECON_AND_STORE(dest + 4 * stride, dc_value); + RECON_AND_STORE(dest + 5 * stride, dc_value); + RECON_AND_STORE(dest + 6 * stride, dc_value); + RECON_AND_STORE(dest + 7 * stride, dc_value); +} + +void idct8_sse2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], + in0, in1, in2, in3, in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); +} + +void iadst8_sse2(__m128i *in) { + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // transpose + array_transpose_8x8(in, in); + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = _mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); +} + +void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // Rows. Load 4-row input data. + in0 = _mm_load_si128((const __m128i *)input); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + + // 8x4 Transpose + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + // Stage1 + { + const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + stp1_5 = _mm_packs_epi32(tmp4, tmp6); + } + + // Stage2 + { + const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_04, stg2_0); + tmp2 = _mm_madd_epi16(lo_04, stg2_1); + tmp4 = _mm_madd_epi16(lo_26, stg2_2); + tmp6 = _mm_madd_epi16(lo_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, tmp2); + stp2_2 = _mm_packs_epi32(tmp6, tmp4); + + tmp0 = _mm_adds_epi16(stp1_4, stp1_5); + tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage3 + { + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); + + tmp4 = _mm_adds_epi16(stp2_0, stp2_2); + tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); + stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); + + tmp0 = _mm_madd_epi16(lo_56, stg3_0); + tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + tmp0 = _mm_adds_epi16(stp1_3, stp2_4); + tmp1 = _mm_adds_epi16(stp1_2, stp1_5); + tmp2 = _mm_subs_epi16(stp1_3, stp2_4); + tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, + in0, in1, in2, in3, in4, in5, in6, in7); + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +#define IDCT16 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ + stg2_0, stg2_1, stg2_2, stg2_3, \ + stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ + stg2_4, stg2_5, stg2_6, stg2_7, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ + stg3_0, stg3_1, stg3_2, stg3_3, \ + stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ + stg4_0, stg4_1, stg4_2, stg4_3, \ + stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ + stg2_0, stg2_1, stg2_6, stg2_7, \ + stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ + stg3_0, stg3_1, \ + stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ + stg4_0, stg4_1, \ + stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + +void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[16], l[16], r[16], *curr1; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + curr1 = l; + for (i = 0; i < 2; i++) { + // 1-D idct + + // Load input data. + in[0] = _mm_load_si128((const __m128i *)input); + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); + in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); + in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); + in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); + in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); + in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); + in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); + in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); + in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); + + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + + IDCT16 + + // Stage7 + curr1[0] = _mm_add_epi16(stp2_0, stp1_15); + curr1[1] = _mm_add_epi16(stp2_1, stp1_14); + curr1[2] = _mm_add_epi16(stp2_2, stp2_13); + curr1[3] = _mm_add_epi16(stp2_3, stp2_12); + curr1[4] = _mm_add_epi16(stp2_4, stp2_11); + curr1[5] = _mm_add_epi16(stp2_5, stp2_10); + curr1[6] = _mm_add_epi16(stp2_6, stp1_9); + curr1[7] = _mm_add_epi16(stp2_7, stp1_8); + curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); + curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); + curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); + curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); + curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); + curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); + curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); + curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); + + curr1 = r; + input += 128; + } + for (i = 0; i < 2; i++) { + int j; + // 1-D idct + array_transpose_8x8(l + i * 8, in); + array_transpose_8x8(r + i * 8, in + 8); + + IDCT16 + + // 2-D + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 2; ++i) { + RECON_AND_STORE(dest + 0 * stride, dc_value); + RECON_AND_STORE(dest + 1 * stride, dc_value); + RECON_AND_STORE(dest + 2 * stride, dc_value); + RECON_AND_STORE(dest + 3 * stride, dc_value); + RECON_AND_STORE(dest + 4 * stride, dc_value); + RECON_AND_STORE(dest + 5 * stride, dc_value); + RECON_AND_STORE(dest + 6 * stride, dc_value); + RECON_AND_STORE(dest + 7 * stride, dc_value); + RECON_AND_STORE(dest + 8 * stride, dc_value); + RECON_AND_STORE(dest + 9 * stride, dc_value); + RECON_AND_STORE(dest + 10 * stride, dc_value); + RECON_AND_STORE(dest + 11 * stride, dc_value); + RECON_AND_STORE(dest + 12 * stride, dc_value); + RECON_AND_STORE(dest + 13 * stride, dc_value); + RECON_AND_STORE(dest + 14 * stride, dc_value); + RECON_AND_STORE(dest + 15 * stride, dc_value); + dest += 8; + } +} + +static void iadst16_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +static void idct16_8col(__m128i *in) { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] = in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = _mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(t[0], t[1]); + u[1] = _mm_unpackhi_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = _mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_unpacklo_epi16(s[5], s[6]); + u[1] = _mm_unpackhi_epi16(s[5], s[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + t[5] = _mm_packs_epi32(u[0], u[1]); + t[6] = _mm_packs_epi32(u[2], u[3]); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = _mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +void idct16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idct16_8col(in0); + idct16_8col(in1); +} + +void iadst16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + iadst16_8col(in0); + iadst16_8col(in1); +} + +void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i in[16], l[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + // First 1-D inverse DCT + // Load input data. + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); + + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_11 = _mm_packs_epi32(tmp5, tmp7); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); + + tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); + + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp1_0 = _mm_packs_epi32(tmp0, tmp0); + stp1_1 = _mm_packs_epi32(tmp2, tmp2); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp5, tmp7); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); + } + + // Stage5 and Stage6 + { + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); + tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_6 = _mm_packs_epi32(tmp3, tmp1); + + stp2_10 = _mm_packs_epi32(tmp0, zero); + stp2_13 = _mm_packs_epi32(tmp2, zero); + stp2_11 = _mm_packs_epi32(tmp4, zero); + stp2_12 = _mm_packs_epi32(tmp6, zero); + + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage7. Left 8x16 only. + l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 2; i++) { + int j; + array_transpose_4X8(l + 8 * i, in); + + IDCT16_10 + + // Stage7 + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +#define LOAD_DQCOEFF(reg, input) \ + { \ + reg = _mm_load_si128((const __m128i *) input); \ + input += 8; \ + } \ + +#define IDCT32_34 \ +/* Stage1 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ + \ + const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ + \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ + stg1_1, stp1_16, stp1_31); \ + MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ + stg1_7, stp1_19, stp1_28); \ + MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ + stg1_9, stp1_20, stp1_27); \ + MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ + stg1_15, stp1_23, stp1_24); \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ + stg2_1, stp2_8, stp2_15); \ + MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ + stg2_7, stp2_11, stp2_12); \ + \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ + \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ + \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ + \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ + \ + MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ + stg3_1, stp1_4, stp1_7); \ + \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ + stg4_1, stp2_0, stp2_1); \ + \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + + +#define IDCT32 \ +/* Stage1 */ \ +{ \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ + stp1_17, stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ + \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ + stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ + stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + +// Only upper-left 8x8 has non-zero coeff +void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[32], col[32]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. Only need to load the top left 8x8 block. + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 32)); + in[2] = _mm_load_si128((const __m128i *)(input + 64)); + in[3] = _mm_load_si128((const __m128i *)(input + 96)); + in[4] = _mm_load_si128((const __m128i *)(input + 128)); + in[5] = _mm_load_si128((const __m128i *)(input + 160)); + in[6] = _mm_load_si128((const __m128i *)(input + 192)); + in[7] = _mm_load_si128((const __m128i *)(input + 224)); + + for (i = 8; i < 32; ++i) { + in[i] = _mm_setzero_si128(); + } + + array_transpose_8x8(in, in); + // TODO(hkuang): Following transposes are unnecessary. But remove them will + // lead to performance drop on some devices. + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + IDCT32_34 + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[0] = _mm_add_epi16(stp1_0, stp1_31); + col[1] = _mm_add_epi16(stp1_1, stp1_30); + col[2] = _mm_add_epi16(stp1_2, stp1_29); + col[3] = _mm_add_epi16(stp1_3, stp1_28); + col[4] = _mm_add_epi16(stp1_4, stp1_27); + col[5] = _mm_add_epi16(stp1_5, stp1_26); + col[6] = _mm_add_epi16(stp1_6, stp1_25); + col[7] = _mm_add_epi16(stp1_7, stp1_24); + col[8] = _mm_add_epi16(stp1_8, stp1_23); + col[9] = _mm_add_epi16(stp1_9, stp1_22); + col[10] = _mm_add_epi16(stp1_10, stp1_21); + col[11] = _mm_add_epi16(stp1_11, stp1_20); + col[12] = _mm_add_epi16(stp1_12, stp1_19); + col[13] = _mm_add_epi16(stp1_13, stp1_18); + col[14] = _mm_add_epi16(stp1_14, stp1_17); + col[15] = _mm_add_epi16(stp1_15, stp1_16); + col[16] = _mm_sub_epi16(stp1_15, stp1_16); + col[17] = _mm_sub_epi16(stp1_14, stp1_17); + col[18] = _mm_sub_epi16(stp1_13, stp1_18); + col[19] = _mm_sub_epi16(stp1_12, stp1_19); + col[20] = _mm_sub_epi16(stp1_11, stp1_20); + col[21] = _mm_sub_epi16(stp1_10, stp1_21); + col[22] = _mm_sub_epi16(stp1_9, stp1_22); + col[23] = _mm_sub_epi16(stp1_8, stp1_23); + col[24] = _mm_sub_epi16(stp1_7, stp1_24); + col[25] = _mm_sub_epi16(stp1_6, stp1_25); + col[26] = _mm_sub_epi16(stp1_5, stp1_26); + col[27] = _mm_sub_epi16(stp1_4, stp1_27); + col[28] = _mm_sub_epi16(stp1_3, stp1_28); + col[29] = _mm_sub_epi16(stp1_2, stp1_29); + col[30] = _mm_sub_epi16(stp1_1, stp1_30); + col[31] = _mm_sub_epi16(stp1_0, stp1_31); + for (i = 0; i < 4; i++) { + int j; + const __m128i zero = _mm_setzero_si128(); + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + i * 8, in); + IDCT32_34 + + // 2_D: Calculate the results and store them to destination. + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[32], col[128], zero_idx[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j, i32; + + for (i = 0; i < 4; i++) { + i32 = (i << 5); + // First 1-D idct + // Load input data. + LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); + + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); + + // checking if all entries are zero + zero_idx[0] = _mm_or_si128(in[0], in[1]); + zero_idx[1] = _mm_or_si128(in[2], in[3]); + zero_idx[2] = _mm_or_si128(in[4], in[5]); + zero_idx[3] = _mm_or_si128(in[6], in[7]); + zero_idx[4] = _mm_or_si128(in[8], in[9]); + zero_idx[5] = _mm_or_si128(in[10], in[11]); + zero_idx[6] = _mm_or_si128(in[12], in[13]); + zero_idx[7] = _mm_or_si128(in[14], in[15]); + zero_idx[8] = _mm_or_si128(in[16], in[17]); + zero_idx[9] = _mm_or_si128(in[18], in[19]); + zero_idx[10] = _mm_or_si128(in[20], in[21]); + zero_idx[11] = _mm_or_si128(in[22], in[23]); + zero_idx[12] = _mm_or_si128(in[24], in[25]); + zero_idx[13] = _mm_or_si128(in[26], in[27]); + zero_idx[14] = _mm_or_si128(in[28], in[29]); + zero_idx[15] = _mm_or_si128(in[30], in[31]); + + zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); + zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); + + zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); + + if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] = _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = _mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 31] = _mm_setzero_si128(); + continue; + } + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + IDCT32 + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } + for (i = 0; i < 4; i++) { + // Second 1-D idct + j = i << 3; + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + j, in); + array_transpose_8x8(col + j + 32, in + 8); + array_transpose_8x8(col + j + 64, in + 16); + array_transpose_8x8(col + j + 96, in + 24); + + IDCT32 + + // 2_D: Calculate the results and store them to destination. + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 4; ++i) { + int j; + for (j = 0; j < 32; ++j) { + RECON_AND_STORE(dest + j * stride, dc_value); + } + dest += 8; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { + __m128i ubounded, retval; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + ubounded = _mm_cmpgt_epi16(value, max); + retval = _mm_andnot_si128(ubounded, value); + ubounded = _mm_and_si128(ubounded, max); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); + return retval; +} + +void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + __m128i inptr[4]; + __m128i sign_bits[2]; + __m128i temp_mm, min_input, max_input; + int test; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int optimised_cols = 0; + const __m128i zero = _mm_set1_epi16(0); + const __m128i eight = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(12043); + const __m128i min = _mm_set1_epi16(-12043); + // Load input into __m128i + inptr[0] = _mm_loadu_si128((const __m128i *)input); + inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); + inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); + inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); + + // Pack to 16 bits + inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); + inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); + + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (!test) { + // Do the row transform + idct4_sse2(inptr); + + // Check the min & max values + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (test) { + transpose_4x4(inptr); + sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); + sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); + inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); + inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); + inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); + inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); + _mm_storeu_si128((__m128i *)outptr, inptr[0]); + _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); + _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); + _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vp9_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + } + + if (optimised_cols) { + idct4_sse2(inptr); + + // Final round and shift + inptr[0] = _mm_add_epi16(inptr[0], eight); + inptr[1] = _mm_add_epi16(inptr[1], eight); + + inptr[0] = _mm_srai_epi16(inptr[0], 4); + inptr[1] = _mm_srai_epi16(inptr[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); + __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi64( + d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); + d2 = _mm_unpacklo_epi64( + d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); + d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); + d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); + // store input0 + _mm_storel_epi64((__m128i *)dest, d0); + // store input1 + d0 = _mm_srli_si128(d0, 8); + _mm_storel_epi64((__m128i *)(dest + stride), d0); + // store input2 + _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); + // store input3 + d2 = _mm_srli_si128(d2, 8); + _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[4], temp_out[4]; + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + vp9_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } + } +} + +void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_8x8(inptr, inptr); + for (i = 0; i < 8; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 8; ++i) { + vp9_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vp9_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} + +void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // only first 4 row has non-zero coefs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the min & max for the column transform + // N.B. Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_4X8(inptr, inptr); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vp9_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vp9_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} + +void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_16x16(inptr, inptr + 16); + for (i = 0; i < 16; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 16; ++i) { + vp9_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i ] = _mm_add_epi16(inptr[i ], rounding); + inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); + inptr[i ] = _mm_srai_epi16(inptr[i ], 6); + inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + vp9_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} + +void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // Since all non-zero dct coefficients are in upper-left 4x4 area, + // we only need to consider first 4 rows here. + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform (N.B. This transposes inptr) + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + // N.B. Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 16; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_8x8(inptr, inptr); + array_transpose_8x8(inptr + 8, inptr + 16); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vp9_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i ] = _mm_add_epi16(inptr[i ], rounding); + inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); + inptr[i ] = _mm_srai_epi16(inptr[i ], 6); + inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + vp9_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h new file mode 100644 index 000000000..658a91487 --- /dev/null +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_ +#define VPX_DSP_X86_INV_TXFM_SSE2_H_ + +#include // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" + +// perform 8x8 transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + } + +static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); + in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); +} + +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ + } + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); + RECON_AND_STORE(dest + 8 * stride, in[8]); + RECON_AND_STORE(dest + 9 * stride, in[9]); + RECON_AND_STORE(dest + 10 * stride, in[10]); + RECON_AND_STORE(dest + 11 * stride, in[11]); + RECON_AND_STORE(dest + 12 * stride, in[12]); + RECON_AND_STORE(dest + 13 * stride, in[13]); + RECON_AND_STORE(dest + 14 * stride, in[14]); + RECON_AND_STORE(dest + 15 * stride, in[15]); +} + +void idct4_sse2(__m128i *in); +void idct8_sse2(__m128i *in); +void idct16_sse2(__m128i *in0, __m128i *in1); +void iadst4_sse2(__m128i *in); +void iadst8_sse2(__m128i *in); +void iadst16_sse2(__m128i *in0, __m128i *in1); + +#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm new file mode 100644 index 000000000..2c1060710 --- /dev/null +++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm @@ -0,0 +1,300 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the inverse transformation. Part +; of the functions are originally derived from the ffmpeg project. +; Note that the current version applies to x86 64-bit only. + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 +pw_16: times 8 dw 16 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 +%endmacro + +TRANSFORM_COEFFS 6270, 15137 +TRANSFORM_COEFFS 3196, 16069 +TRANSFORM_COEFFS 13623, 9102 + +%macro PAIR_PP_COEFFS 2 +dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MP_COEFFS 2 +dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MM_COEFFS 2 +dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 +%endmacro + +PAIR_PP_COEFFS 30274, 12540 +PAIR_PP_COEFFS 6392, 32138 +PAIR_MP_COEFFS 18204, 27246 + +PAIR_PP_COEFFS 12540, 12540 +PAIR_PP_COEFFS 30274, 30274 +PAIR_PP_COEFFS 6392, 6392 +PAIR_PP_COEFFS 32138, 32138 +PAIR_MM_COEFFS 18204, 18204 +PAIR_PP_COEFFS 27246, 27246 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +%macro IDCT8_1D 0 + SUM_SUB 0, 4, 9 + BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 + pmulhrsw m0, m12 + pmulhrsw m4, m12 + BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 + BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 + + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + SUM_SUB 0, 6, 9 + SUM_SUB 4, 2, 9 + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 4, 3, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 4 +%endmacro + +; This macro handles 8 pixels per line +%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero + paddw m%1, m11 + paddw m%2, m11 + psraw m%1, 5 + psraw m%2, 5 + + movh m%3, [outputq] + movh m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%3, m%1 + paddw m%4, m%2 + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [outputq], m%3 + movh [outputq + strideq], m%4 +%endmacro + +INIT_XMM ssse3 +; full inverse 8x8 2D-DCT transform +cglobal idct8x8_64_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + mova m4, [inputq + 64] + mova m5, [inputq + 80] + mova m6, [inputq + 96] + mova m7, [inputq + 112] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero +cglobal idct8x8_12_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + + punpcklwd m0, m1 + punpcklwd m2, m3 + punpckhdq m9, m0, m2 + punpckldq m0, m2 + SWAP 2, 9 + + ; m0 -> [0], [0] + ; m1 -> [1], [1] + ; m2 -> [2], [2] + ; m3 -> [3], [3] + punpckhqdq m10, m0, m0 + punpcklqdq m0, m0 + punpckhqdq m9, m2, m2 + punpcklqdq m2, m2 + SWAP 1, 10 + SWAP 3, 9 + + pmulhrsw m0, m12 + pmulhrsw m2, [dpw_30274_12540] + pmulhrsw m1, [dpw_6392_32138] + pmulhrsw m3, [dpw_m18204_27246] + + SUM_SUB 0, 2, 9 + SUM_SUB 1, 3, 9 + + punpcklqdq m9, m3, m3 + punpckhqdq m5, m3, m9 + + SUM_SUB 3, 5, 9 + punpckhqdq m5, m3 + pmulhrsw m5, m12 + + punpckhqdq m9, m1, m5 + punpcklqdq m1, m5 + SWAP 5, 9 + + SUM_SUB 0, 5, 9 + SUM_SUB 2, 1, 9 + + punpckhqdq m3, m0, m0 + punpckhqdq m4, m1, m1 + punpckhqdq m6, m5, m5 + punpckhqdq m7, m2, m2 + + punpcklwd m0, m3 + punpcklwd m7, m2 + punpcklwd m1, m4 + punpcklwd m6, m5 + + punpckhdq m4, m0, m7 + punpckldq m0, m7 + punpckhdq m10, m1, m6 + punpckldq m5, m1, m6 + + punpckhqdq m1, m0, m5 + punpcklqdq m0, m5 + punpckhqdq m3, m4, m10 + punpcklqdq m2, m4, m10 + + + pmulhrsw m0, m12 + pmulhrsw m6, m2, [dpw_30274_30274] + pmulhrsw m4, m2, [dpw_12540_12540] + + pmulhrsw m7, m1, [dpw_32138_32138] + pmulhrsw m1, [dpw_6392_6392] + pmulhrsw m5, m3, [dpw_m18204_m18204] + pmulhrsw m3, [dpw_27246_27246] + + mova m2, m0 + SUM_SUB 0, 6, 9 + SUM_SUB 2, 4, 9 + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 2, 3, 9 + SUM_SUB 4, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 2 + SWAP 2, 4 + + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +%endif -- cgit v1.2.3