author     James Yu <james.yu@linaro.org>      2014-01-13 16:44:08 +0800
committer  Johann <johannkoenig@google.com>    2014-08-20 09:25:29 -0700
commit     eed005b07603e325efa5d1da9d758b7f09b16aae (patch)
tree       58a1718a71f1ccc413871323593becda53171204 /vp8/encoder/arm
parent     6d6fdd9c3d763c6fbcd4f79a07ed2ec131500bfd (diff)
VP8 encoder for ARMv8 using NEON intrinsics 6
Add shortfdct_neon.c
- vp8_short_fdct4x4_neon
- vp8_short_fdct8x4_neon
Change-Id: I90152c803b484f5fab839473d632c50af0524e68
Signed-off-by: James Yu <james.yu@linaro.org>
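
For reference, both the removed assembly and the new intrinsics compute the VP8 short 4x4 forward DCT in two passes (rows, then columns). The following scalar sketch is reconstructed from the op[] formulas documented in the assembly comments below; the function name is illustrative (libvpx's actual scalar reference is vp8_short_fdct4x4_c), and `pitch` is in bytes, so rows are pitch/2 int16_t apart.

#include <stdint.h>

/* Hypothetical scalar restatement of the FDCT the NEON code vectorizes. */
void vp8_short_fdct4x4_scalar(int16_t *input, int16_t *output, int pitch) {
    int i, a1, b1, c1, d1;
    int16_t *ip = input;
    int16_t *op = output;

    for (i = 0; i < 4; i++) {  /* Part one: transform each row. */
        a1 = (ip[0] + ip[3]) << 3;
        b1 = (ip[1] + ip[2]) << 3;
        c1 = (ip[1] - ip[2]) << 3;
        d1 = (ip[0] - ip[3]) << 3;

        op[0] = (int16_t)(a1 + b1);
        op[2] = (int16_t)(a1 - b1);
        op[1] = (int16_t)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
        op[3] = (int16_t)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

        ip += pitch / 2;  /* pitch is in bytes; step one row of shorts */
        op += 4;
    }

    ip = output;
    op = output;
    for (i = 0; i < 4; i++) {  /* Part two: transform each column. */
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0]  = (int16_t)((a1 + b1 + 7) >> 4);
        op[8]  = (int16_t)((a1 - b1 + 7) >> 4);
        op[4]  = (int16_t)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
        op[12] = (int16_t)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

        ip++;
        op++;
    }
}

vp8_short_fdct8x4 applies the same transform to two horizontally adjacent 4x4 blocks; the NEON version below processes them together as the A and B halves of the q registers.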
Diffstat (limited to 'vp8/encoder/arm')
-rw-r--r--  vp8/encoder/arm/neon/shortfdct_neon.asm | 221 ---
-rw-r--r--  vp8/encoder/arm/neon/shortfdct_neon.c   | 269 +++
2 files changed, 269 insertions(+), 221 deletions(-)
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm
deleted file mode 100644
index 5ea8dd83d..000000000
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ /dev/null
@@ -1,221 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_fdct4x4_neon|
-    EXPORT  |vp8_short_fdct8x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-
-    ALIGN 16    ; enable use of @128 bit aligned loads
-coeff
-    DCW      5352,  5352,  5352,  5352
-    DCW      2217,  2217,  2217,  2217
-    DCD     14500, 14500, 14500, 14500
-    DCD      7500,  7500,  7500,  7500
-    DCD     12000, 12000, 12000, 12000
-    DCD     51000, 51000, 51000, 51000
-
-;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_neon| PROC
-
-    ; Part one
-    vld1.16         {d0}, [r0@64], r2
-    adr             r12, coeff
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {d2}, [r0@64], r2
-    vld1.32         {q9, q10}, [r12@128]!   ; q9=14500,  q10=7500
-    vld1.16         {d3}, [r0@64], r2
-
-    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
-    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
-    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
-    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
-    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
-
-    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
-    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
-    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
-
-
-    ; Part two
-
-    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vmov.s16        d26, #7
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
-    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
-    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
-    vadd.s16        d4, d4, d26     ; a1 + 7
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
-    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
-
-    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
-    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
-
-    vceq.s16        d4, d7, #0
-
-    vshr.s16        d0, d0, #4
-    vshr.s16        d2, d2, #4
-
-    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
-
-    vmvn            d4, d4
-    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
-    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct8x4_neon| PROC
-
-    ; Part one
-
-    vld1.16         {q0}, [r0@128], r2
-    adr             r12, coeff
-    vld1.16         {q1}, [r0@128], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {q2}, [r0@128], r2
-    vld1.32         {q9, q10}, [r12@128]!   ; q9=14500,  q10=7500
-    vld1.16         {q3}, [r0@128], r2
-
-    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
-    vtrn.32         q0, q2          ; [A0|B0]
-    vtrn.32         q1, q3          ; [A1|B1]
-    vtrn.16         q0, q1          ; [A2|B2]
-    vtrn.16         q2, q3          ; [A3|B3]
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
-    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
-    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q11, q11, #3    ; a1 << 3
-    vshl.s16        q12, q12, #3    ; b1 << 3
-    vshl.s16        q13, q13, #3    ; c1 << 3
-    vshl.s16        q14, q14, #3    ; d1 << 3
-
-    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
-    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
-
-    vmov.s16        q11, q9         ; 14500
-    vmov.s16        q12, q10        ; 7500
-
-    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
-    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
-    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
-    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
-
-    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
-    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
-    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
-
-
-    ; Part two
-    vld1.32         {q9,q10}, [r12@128]     ; q9=12000, q10=51000
-
-    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
-    vtrn.32         q0, q2          ; q0=[A0 | B0]
-    vtrn.32         q1, q3          ; q1=[A4 | B4]
-    vtrn.16         q0, q1          ; q2=[A8 | B8]
-    vtrn.16         q2, q3          ; q3=[A12|B12]
-
-    vmov.s16        q15, #7
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
-    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
-    vadd.s16        q11, q11, q15   ; a1 + 7
-    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
-
-    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
-    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
-
-    vmov.s16        q11, q9         ; 12000
-    vmov.s16        q12, q10        ; 51000
-
-    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d2, d2, #4      ; A[8] = (a1 + b1 + 7)>>4
-    vshr.s16        d6, d3, #4      ; B[8] = (a1 + b1 + 7)>>4
-
-
-    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
-    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
-    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
-    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
-
-    vceq.s16        q14, q14, #0
-
-    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
-    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
-
-    vmvn            q14, q14
-
-    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
-
-    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
-
-    vst1.16         {q0, q1}, [r1@128]!     ; block A
-    vst1.16         {q2, q3}, [r1@128]!     ; block B
-
-    bx              lr
-
-    ENDP
-
-    END
-
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.c b/vp8/encoder/arm/neon/shortfdct_neon.c
new file mode 100644
index 000000000..391e5f990
--- /dev/null
+++ b/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -0,0 +1,269 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_fdct4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+    uint16x4_t d4u16;
+    int16x8_t q0s16, q1s16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+    q11s32 = vdupq_n_s32(12000);
+    q12s32 = vdupq_n_s32(51000);
+
+    // Part one
+    pitch >>= 1;
+    d0s16 = vld1_s16(input);
+    input += pitch;
+    d1s16 = vld1_s16(input);
+    input += pitch;
+    d2s16 = vld1_s16(input);
+    input += pitch;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d4s16 = vshl_n_s16(d4s16, 3);
+    d5s16 = vshl_n_s16(d5s16, 3);
+    d6s16 = vshl_n_s16(d6s16, 3);
+    d7s16 = vshl_n_s16(d7s16, 3);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+    q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 12);
+    d3s16 = vshrn_n_s32(q10s32, 12);
+
+    // Part two
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d26s16 = vdup_n_s16(7);
+    d4s16 = vadd_s16(d4s16, d26s16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+
+    dEmptys16 = vdup_n_s16(0);
+    d4u16 = vceq_s16(d7s16, dEmptys16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+
+    q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+
+    d4u16 = vmvn_u16(d4u16);
+    d1s16 = vshrn_n_s32(q11s32, 16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+    d3s16 = vshrn_n_s32(q12s32, 16);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
+
+void vp8_short_fdct8x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
+    uint16x4_t d28u16, d29u16;
+    uint16x8_t q14u16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x8x2_t v2tmp0, v2tmp1;
+    int32x4x2_t v2tmp2, v2tmp3;
+
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+
+    // Part one
+    pitch >>= 1;
+    q0s16 = vld1q_s16(input);
+    input += pitch;
+    q1s16 = vld1q_s16(input);
+    input += pitch;
+    q2s16 = vld1q_s16(input);
+    input += pitch;
+    q3s16 = vld1q_s16(input);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q11s16 = vshlq_n_s16(q11s16, 3);
+    q12s16 = vshlq_n_s16(q12s16, 3);
+    q13s16 = vshlq_n_s16(q13s16, 3);
+    q14s16 = vshlq_n_s16(q14s16, 3);
+
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q2s16 = vsubq_s16(q11s16, q12s16);
+
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d2s16 = vshrn_n_s32(q9s32, 12);
+    d6s16 = vshrn_n_s32(q10s32, 12);
+    d3s16 = vshrn_n_s32(q11s32, 12);
+    d7s16 = vshrn_n_s32(q12s32, 12);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    // Part two
+    q9s32 = vdupq_n_s32(12000);
+    q10s32 = vdupq_n_s32(51000);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q15s16 = vdupq_n_s16(7);
+    q11s16 = vaddq_s16(q11s16, q15s16);
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q1s16 = vsubq_s16(q11s16, q12s16);
+
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d0s16 = vget_low_s16(q0s16);
+    d1s16 = vget_high_s16(q0s16);
+    d2s16 = vget_low_s16(q1s16);
+    d3s16 = vget_high_s16(q1s16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d4s16 = vshr_n_s16(d1s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+    d6s16 = vshr_n_s16(d3s16, 4);
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 16);
+    d3s16 = vshrn_n_s32(q10s32, 16);
+    d5s16 = vshrn_n_s32(q11s32, 16);
+    d7s16 = vshrn_n_s32(q12s32, 16);
+
+    qEmptys16 = vdupq_n_s16(0);
+    q14u16 = vceqq_s16(q14s16, qEmptys16);
+    q14u16 = vmvnq_u16(q14u16);
+
+    d28u16 = vget_low_u16(q14u16);
+    d29u16 = vget_high_u16(q14u16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
+    d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    vst1q_s16(output + 16, q2s16);
+    vst1q_s16(output + 24, q3s16);
+    return;
+}
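
The intrinsics keep the calling convention of the assembly: pitch is in bytes (halved internally to step int16_t rows), and the 4x4 output is 16 contiguous coefficients. Below is a hypothetical self-check harness (the harness name and test pattern are illustrative, not part of this commit) comparing the NEON path against the scalar sketch given above, assuming both are linked into the same program:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch);
void vp8_short_fdct4x4_scalar(int16_t *input, int16_t *output, int pitch);

int main(void) {
    /* 4 rows of 4 coefficients, rows 8 bytes (4 shorts) apart. */
    int16_t in[16], out_scalar[16], out_neon[16];
    int i;

    for (i = 0; i < 16; i++)
        in[i] = (int16_t)((i * 13) % 64 - 32);  /* arbitrary residual-range pattern */

    vp8_short_fdct4x4_scalar(in, out_scalar, 8);  /* pitch in bytes */
    vp8_short_fdct4x4_neon(in, out_neon, 8);

    if (memcmp(out_scalar, out_neon, sizeof(out_neon)) != 0) {
        printf("fdct4x4: NEON/scalar mismatch\n");
        return 1;
    }
    printf("fdct4x4: ok\n");
    return 0;
}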