diff options
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/highbd_idct4x4_add_neon.c | 177 | ||||
-rw-r--r-- | vpx_dsp/arm/idct4x4_add_neon.c | 39 | ||||
-rw-r--r-- | vpx_dsp/arm/idct_neon.h | 34 | ||||
-rw-r--r-- | vpx_dsp/arm/transpose_neon.h | 38 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 2 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 3 | ||||
-rw-r--r-- | vpx_dsp/x86/inv_txfm_sse2.c | 52 |
7 files changed, 282 insertions, 63 deletions
diff --git a/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/vpx_dsp/arm/highbd_idct4x4_add_neon.c new file mode 100644 index 000000000..96625b98b --- /dev/null +++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, + int dest_stride, int bd) { + int i; + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const tran_low_t out0 = dct_const_round_shift(input[0] * cospi_16_64); + const tran_low_t out1 = dct_const_round_shift(out0 * cospi_16_64); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int16x8_t a; + uint16x8_t b; + uint16x4_t d0, d1; + + for (i = 0; i < 2; i++) { + d0 = vld1_u16(dest); + d1 = vld1_u16(dest + dest_stride); + a = vreinterpretq_s16_u16(vcombine_u16(d0, d1)); + a = vaddq_s16(dc, a); + a = vminq_s16(a, max); + b = vqshluq_n_s16(a, 0); + vst1_u16(dest, vget_low_u16(b)); + dest += dest_stride; + vst1_u16(dest, vget_high_u16(b)); + dest += dest_stride; + } +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, 14); + b1 = vrshrq_n_s32(b1, 14); + b2 = vrshrq_n_s32(b2, 14); + b3 = vrshrq_n_s32(b3, 14); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); + c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); + c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); + c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); + c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); + c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); + c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); + c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); + c4 = vsubq_s64(c4, c8); + c5 = vsubq_s64(c5, c9); + c6 = vaddq_s64(c6, c10); + c7 = vaddq_s64(c7, c11); + b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14)); + b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14)); + b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14)); + b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14)); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8, + int dest_stride, int bd) { + DECLARE_ALIGNED(16, static const int32_t, kCospi32[4]) = { 0, 15137, 11585, + 6270 }; + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int32x4_t c0 = vld1q_s32(input); + int32x4_t c1 = vld1q_s32(input + 4); + int32x4_t c2 = vld1q_s32(input + 8); + int32x4_t c3 = vld1q_s32(input + 12); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const uint16_t *dst = dest; + int16x8_t a0, a1, d01, d32; + int16x4_t d0, d1, d2, d3; + uint16x8_t d01_u16, d32_u16; + + if (bd == 8) { + const int16x4_t cospis = vld1_s16(kCospi); + + // Rows + a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); + a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + + // Columns + a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a0 = vrshrq_n_s16(a0, 4); + a1 = vrshrq_n_s16(a1, 4); + } else { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + } else { + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + } + // Note: In some profile tests, a0 and a1 are quite close to +/-32767. + // We use saturating narrow shift in case they could be even larger. + a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); + a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + } + + d0 = vreinterpret_s16_u16(vld1_u16(dst)); + dst += dest_stride; + d1 = vreinterpret_s16_u16(vld1_u16(dst)); + dst += dest_stride; + d2 = vreinterpret_s16_u16(vld1_u16(dst)); + dst += dest_stride; + d3 = vreinterpret_s16_u16(vld1_u16(dst)); + d01 = vcombine_s16(d0, d1); + d32 = vcombine_s16(d3, d2); + + // Note: In some profile tests, a0 and a1 is quite close to +/-32767. + // We use saturating addition. + d01 = vqaddq_s16(a0, d01); + d32 = vqaddq_s16(a1, d32); + d01 = vminq_s16(d01, max); + d32 = vminq_s16(d32, max); + d01_u16 = vqshluq_n_s16(d01, 0); + d32_u16 = vqshluq_n_s16(d32, 0); + + vst1_u16(dest, vget_low_u16(d01_u16)); + dest += dest_stride; + vst1_u16(dest, vget_high_u16(d01_u16)); + dest += dest_stride; + vst1_u16(dest, vget_high_u16(d32_u16)); + dest += dest_stride; + vst1_u16(dest, vget_low_u16(d32_u16)); +} diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index 6ac516140..5ccc95ce0 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -13,45 +13,12 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" -#include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void idct4x4_16_kernel(const int16x4_t cospis, int16x8_t *a0, - int16x8_t *a1) { - int16x4_t b0, b1, b2, b3; - int32x4_t c0, c1, c2, c3; - int16x8_t d0, d1; - - transpose_s16_4x4q(a0, a1); - b0 = vget_low_s16(*a0); - b1 = vget_high_s16(*a0); - b2 = vget_low_s16(*a1); - b3 = vget_high_s16(*a1); - c0 = vmull_lane_s16(b0, cospis, 2); - c2 = vmull_lane_s16(b1, cospis, 2); - c1 = vsubq_s32(c0, c2); - c0 = vaddq_s32(c0, c2); - c2 = vmull_lane_s16(b2, cospis, 3); - c3 = vmull_lane_s16(b2, cospis, 1); - c2 = vmlsl_lane_s16(c2, b3, cospis, 1); - c3 = vmlal_lane_s16(c3, b3, cospis, 3); - b0 = vrshrn_n_s32(c0, 14); - b1 = vrshrn_n_s32(c1, 14); - b2 = vrshrn_n_s32(c2, 14); - b3 = vrshrn_n_s32(c3, 14); - d0 = vcombine_s16(b0, b1); - d1 = vcombine_s16(b3, b2); - *a0 = vaddq_s16(d0, d1); - *a1 = vsubq_s16(d0, d1); -} - void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride) { - DECLARE_ALIGNED(16, static const int16_t, cospi[4]) = { - 0, (int16_t)cospi_8_64, (int16_t)cospi_16_64, (int16_t)cospi_24_64 - }; const uint8_t *dst = dest; - const int16x4_t cospis = vld1_s16(cospi); + const int16x4_t cospis = vld1_s16(kCospi); uint32x2_t dest01_u32 = vdup_n_u32(0); uint32x2_t dest32_u32 = vdup_n_u32(0); int16x8_t a0, a1; @@ -64,11 +31,11 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, // Rows a0 = load_tran_low_to_s16q(input); a1 = load_tran_low_to_s16q(input + 8); - idct4x4_16_kernel(cospis, &a0, &a1); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); // Columns a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel(cospis, &a0, &a1); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); a0 = vrshrq_n_s16(a0, 4); a1 = vrshrq_n_s16(a1, 4); diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index e4493a105..51eba062b 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -17,6 +17,9 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/vpx_dsp_common.h" +DECLARE_ALIGNED(16, static const int16_t, kCospi[4]) = { 0, 15137, 11585, + 6270 }; + //------------------------------------------------------------------------------ // Helper functions used to load tran_low_t into int16, narrowing if necessary. @@ -180,4 +183,35 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, b += b_stride; vst1_u8(b, b7); } + +static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, + int16x8_t *const a0, + int16x8_t *const a1) { + int16x4_t b0, b1, b2, b3; + int32x4_t c0, c1, c2, c3; + int16x8_t d0, d1; + + transpose_s16_4x4q(a0, a1); + b0 = vget_low_s16(*a0); + b1 = vget_high_s16(*a0); + b2 = vget_low_s16(*a1); + b3 = vget_high_s16(*a1); + c0 = vmull_lane_s16(b0, cospis, 2); + c2 = vmull_lane_s16(b1, cospis, 2); + c1 = vsubq_s32(c0, c2); + c0 = vaddq_s32(c0, c2); + c2 = vmull_lane_s16(b2, cospis, 3); + c3 = vmull_lane_s16(b2, cospis, 1); + c2 = vmlsl_lane_s16(c2, b3, cospis, 1); + c3 = vmlal_lane_s16(c3, b3, cospis, 3); + b0 = vrshrn_n_s32(c0, 14); + b1 = vrshrn_n_s32(c1, 14); + b2 = vrshrn_n_s32(c2, 14); + b3 = vrshrn_n_s32(c3, 14); + d0 = vcombine_s16(b0, b1); + d1 = vcombine_s16(b3, b2); + *a0 = vaddq_s16(d0, d1); + *a1 = vsubq_s16(d0, d1); +} + #endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 29144a7e8..d0634fd0a 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -30,6 +30,13 @@ static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { return b0; } +static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { + int32x4x2_t b0; + b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); + b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); + return b0; +} + static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)), @@ -172,6 +179,37 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { *a1 = d0.val[1]; } +static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, + int32x4_t *a2, int32x4_t *a3) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + + *a0 = c0.val[0]; + *a1 = c1.val[0]; + *a2 = c0.val[1]; + *a3 = c1.val[1]; +} + static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, const int16x4_t a2, const int16x4_t a3, const int16x4_t a4, const int16x4_t a5, diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 200ef07f1..f04736ae0 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -226,6 +226,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c +else # CONFIG_VP9_HIGHBITDEPTH +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d78a35757..58969695b 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -618,6 +618,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_iwht4x4_16_add sse2/; add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vpx_highbd_idct4x4_1_add neon/; add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; @@ -709,7 +710,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct32x32_1_add neon sse2/; add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vpx_highbd_idct4x4_16_add sse2/; + specialize qw/vpx_highbd_idct4x4_16_add neon sse2/; add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vpx_highbd_idct8x8_64_add sse2/; diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index d5fc1440c..487a474a6 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -402,10 +402,10 @@ void iadst4_sse2(__m128i *in) { MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ } \ \ /* Stage3 */ \ @@ -413,10 +413,10 @@ void iadst4_sse2(__m128i *in) { const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ \ tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ @@ -438,14 +438,14 @@ void iadst4_sse2(__m128i *in) { } \ \ /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + out0 = _mm_add_epi16(stp1_0, stp2_7); \ + out1 = _mm_add_epi16(stp1_1, stp1_6); \ + out2 = _mm_add_epi16(stp1_2, stp1_5); \ + out3 = _mm_add_epi16(stp1_3, stp2_4); \ + out4 = _mm_sub_epi16(stp1_3, stp2_4); \ + out5 = _mm_sub_epi16(stp1_2, stp1_5); \ + out6 = _mm_sub_epi16(stp1_1, stp1_6); \ + out7 = _mm_sub_epi16(stp1_0, stp2_7); \ } void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, @@ -866,8 +866,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, stp2_0 = _mm_packs_epi32(tmp0, tmp2); stp2_2 = _mm_packs_epi32(tmp6, tmp4); - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + tmp0 = _mm_add_epi16(stp1_4, stp1_5); + tmp1 = _mm_sub_epi16(stp1_4, stp1_5); stp2_4 = tmp0; stp2_5 = _mm_unpacklo_epi64(tmp1, zero); @@ -878,8 +878,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, { const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + tmp4 = _mm_add_epi16(stp2_0, stp2_2); + tmp6 = _mm_sub_epi16(stp2_0, stp2_2); stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); @@ -896,10 +896,10 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, } // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + tmp0 = _mm_add_epi16(stp1_3, stp2_4); + tmp1 = _mm_add_epi16(stp1_2, stp1_5); + tmp2 = _mm_sub_epi16(stp1_3, stp2_4); + tmp3 = _mm_sub_epi16(stp1_2, stp1_5); TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) @@ -3449,7 +3449,7 @@ static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { __m128i ubounded, retval; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); ubounded = _mm_cmpgt_epi16(value, max); retval = _mm_andnot_si128(ubounded, value); ubounded = _mm_and_si128(ubounded, max); @@ -4012,7 +4012,7 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, __m128i dc_value, d; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); int a, i, j; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); tran_low_t out; |