diff options
Diffstat (limited to 'vpx_dsp')
29 files changed, 1423 insertions, 1380 deletions
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index a0a1e6dd5..1e338516f 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -20,12 +20,12 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -38,8 +38,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -65,10 +65,10 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff = vandq_s16(qcoeff, zbin_mask); // Set non-zero elements to -1 and use that to extract values for eob. - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; @@ -90,8 +90,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, do { // Add one because the eob is not its index. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -118,10 +118,10 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Set non-zero elements to -1 and use that to extract values for eob. eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; @@ -150,17 +150,19 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - (void)scan_ptr; + (void)scan; (void)n_coeffs; // Because we will always calculate 32*32. (void)skip_block; assert(!skip_block); @@ -174,8 +176,8 @@ void vpx_quantize_b_32x32_neon( const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -203,10 +205,10 @@ void vpx_quantize_b_32x32_neon( qcoeff = vandq_s16(qcoeff, zbin_mask); // Set non-zero elements to -1 and use that to extract values for eob. - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; @@ -234,8 +236,8 @@ void vpx_quantize_b_32x32_neon( for (i = 1; i < 32 * 32 / 8; ++i) { // Add one because the eob is not its index. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -264,10 +266,10 @@ void vpx_quantize_b_32x32_neon( // Set non-zero elements to -1 and use that to extract values for eob. eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 535ec0f0d..06443c699 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -28,24 +28,25 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, return vreinterpret_u8_u32(aa); } -static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride, - const uint8_t *const ref[4], const int ref_stride, - const int height, uint32_t *const res) { +static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, + const uint8_t *const ref_array[4], + const int ref_stride, const int height, + uint32_t *const res) { int i; uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; uint16x4_t a[2]; uint32x4_t r; - assert(!((intptr_t)src % sizeof(uint32_t))); + assert(!((intptr_t)src_ptr % sizeof(uint32_t))); assert(!(src_stride % sizeof(uint32_t))); for (i = 0; i < height; ++i) { const uint8x8_t s = vreinterpret_u8_u32( - vld1_dup_u32((const uint32_t *)(src + i * src_stride))); - const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride, - ref[1] + i * ref_stride); - const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride, - ref[3] + i * ref_stride); + vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride))); + const uint8x8_t ref01 = load_unaligned_2_buffers( + ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride); + const uint8x8_t ref23 = load_unaligned_2_buffers( + ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride); abs[0] = vabal_u8(abs[0], s, ref01); abs[1] = vabal_u8(abs[1], s, ref23); } @@ -56,16 +57,16 @@ static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride, vst1q_u32(res, r); } -void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad4x_4d(src, src_stride, ref, ref_stride, 4, res); + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res); } -void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad4x_4d(src, src_stride, ref, ref_stride, 8, res); + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res); } //////////////////////////////////////////////////////////////////////////////// @@ -137,17 +138,18 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, vst1q_u32(res, vcombine_u32(d0, d1)); } -static INLINE void sad8x_4d(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res, const int height) { int i, j; - const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; for (i = 0; i < height; ++i) { - const uint8x8_t s = vld1_u8(src); - src += src_stride; + const uint8x8_t s = vld1_u8(src_ptr); + src_ptr += src_stride; for (j = 0; j < 4; ++j) { const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); ref_loop[j] += ref_stride; @@ -158,44 +160,45 @@ static INLINE void sad8x_4d(const uint8_t *src, int src_stride, sad_512_pel_final_neon(sum, res); } -void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad8x_4d(src, src_stride, ref, ref_stride, res, 4); + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4); } -void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad8x_4d(src, src_stride, ref, ref_stride, res, 8); + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); } -void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad8x_4d(src, src_stride, ref, ref_stride, res, 16); + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); } //////////////////////////////////////////////////////////////////////////////// -static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src, +static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint16x8_t *const sum) { - const uint8x16_t r = vld1q_u8(ref); - *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r)); - *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r)); + const uint8x16_t r = vld1q_u8(ref_ptr); + *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r)); + *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r)); } -static INLINE void sad16x_4d(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res, const int height) { int i, j; - const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src); - src += src_stride; + const uint8x16_t s = vld1q_u8(src_ptr); + src_ptr += src_stride; for (j = 0; j < 4; ++j) { sad16_neon(ref_loop[j], s, &sum[j]); ref_loop[j] += ref_stride; @@ -205,50 +208,51 @@ static INLINE void sad16x_4d(const uint8_t *src, int src_stride, sad_512_pel_final_neon(sum, res); } -void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad16x_4d(src, src_stride, ref, ref_stride, res, 8); + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); } -void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad16x_4d(src, src_stride, ref, ref_stride, res, 16); + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); } -void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad16x_4d(src, src_stride, ref, ref_stride, res, 32); + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); } //////////////////////////////////////////////////////////////////////////////// -static INLINE void sad32x_4d(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, const int height, uint16x8_t *const sum) { int i; - const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); for (i = 0; i < height; ++i) { uint8x16_t s; - s = vld1q_u8(src + 0 * 16); + s = vld1q_u8(src_ptr + 0 * 16); sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - s = vld1q_u8(src + 1 * 16); + s = vld1q_u8(src_ptr + 1 * 16); sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - src += src_stride; + src_ptr += src_stride; ref_loop[0] += ref_stride; ref_loop[1] += ref_stride; ref_loop[2] += ref_stride; @@ -256,68 +260,69 @@ static INLINE void sad32x_4d(const uint8_t *src, int src_stride, } } -void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { uint16x8_t sum[4]; - sad32x_4d(src, src_stride, ref, ref_stride, 16, sum); + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); sad_512_pel_final_neon(sum, res); } -void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { uint16x8_t sum[4]; - sad32x_4d(src, src_stride, ref, ref_stride, 32, sum); + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); sad_1024_pel_final_neon(sum, res); } -void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { uint16x8_t sum[4]; - sad32x_4d(src, src_stride, ref, ref_stride, 64, sum); + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); sad_2048_pel_final_neon(sum, res); } //////////////////////////////////////////////////////////////////////////////// -void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { int i; - const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; for (i = 0; i < 32; ++i) { uint8x16_t s; - s = vld1q_u8(src + 0 * 16); + s = vld1q_u8(src_ptr + 0 * 16); sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - s = vld1q_u8(src + 1 * 16); + s = vld1q_u8(src_ptr + 1 * 16); sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - s = vld1q_u8(src + 2 * 16); + s = vld1q_u8(src_ptr + 2 * 16); sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - s = vld1q_u8(src + 3 * 16); + s = vld1q_u8(src_ptr + 3 * 16); sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - src += src_stride; + src_ptr += src_stride; ref_loop[0] += ref_stride; ref_loop[1] += ref_stride; ref_loop[2] += ref_stride; @@ -327,11 +332,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, sad_2048_pel_final_neon(sum, res); } -void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { int i; - const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; @@ -339,31 +345,31 @@ void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, for (i = 0; i < 64; ++i) { uint8x16_t s; - s = vld1q_u8(src + 0 * 16); + s = vld1q_u8(src_ptr + 0 * 16); sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - s = vld1q_u8(src + 1 * 16); + s = vld1q_u8(src_ptr + 1 * 16); sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - s = vld1q_u8(src + 2 * 16); + s = vld1q_u8(src_ptr + 2 * 16); sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - s = vld1q_u8(src + 3 * 16); + s = vld1q_u8(src_ptr + 3 * 16); sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - src += src_stride; + src_ptr += src_stride; ref_loop[0] += ref_stride; ref_loop[1] += ref_stride; ref_loop[2] += ref_stride; diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 9518a166b..1ce66d3e8 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -73,128 +73,132 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } -static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, const int height) { +static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(a); - const uint8x8_t b_u8 = vld1_u8(b); - a += a_stride; - b += b_stride; + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + src_ptr += src_stride; + ref_ptr += ref_stride; abs = vabal_u8(abs, a_u8, b_u8); } return abs; } -static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(a); - const uint8x8_t b_u8 = vld1_u8(b); - const uint8x8_t c_u8 = vld1_u8(c); + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t c_u8 = vld1_u8(second_pred); const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); - a += a_stride; - b += b_stride; - c += 8; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; abs = vabal_u8(abs, a_u8, avg); } return abs; } -#define sad8xN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ +#define sad8xN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad8xN(4); sad8xN(8); sad8xN(16); -static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - const uint8x16_t b_u8 = vld1q_u8(b); - a += a_stride; - b += b_stride; + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + src_ptr += src_stride; + ref_ptr += ref_stride; abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); } return abs; } -static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - const uint8x16_t b_u8 = vld1q_u8(b); - const uint8x16_t c_u8 = vld1q_u8(c); + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + const uint8x16_t c_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); - a += a_stride; - b += b_stride; - c += 16; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); } return abs; } -#define sad16xN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ +#define sad16xN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint16x8_t abs = \ + sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad16xN(8); sad16xN(16); sad16xN(32); -static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(a); - const uint8x16_t a_hi = vld1q_u8(a + 16); - const uint8x16_t b_lo = vld1q_u8(b); - const uint8x16_t b_hi = vld1q_u8(b + 16); - a += a_stride; - b += b_stride; + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + src_ptr += src_stride; + ref_ptr += ref_stride; abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); @@ -203,24 +207,25 @@ static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, return abs; } -static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(a); - const uint8x16_t a_hi = vld1q_u8(a + 16); - const uint8x16_t b_lo = vld1q_u8(b); - const uint8x16_t b_hi = vld1q_u8(b + 16); - const uint8x16_t c_lo = vld1q_u8(c); - const uint8x16_t c_hi = vld1q_u8(c + 16); + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t c_lo = vld1q_u8(second_pred); + const uint8x16_t c_hi = vld1q_u8(second_pred + 16); const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); - a += a_stride; - b += b_stride; - c += 32; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); @@ -229,43 +234,44 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, return abs; } -#define sad32xN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ +#define sad32xN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint16x8_t abs = \ + sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad32xN(16); sad32xN(32); sad32xN(64); -static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const int height) { int i; uint16x8_t abs_0 = vdupq_n_u16(0); uint16x8_t abs_1 = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - const uint8x16_t b_0 = vld1q_u8(b); - const uint8x16_t b_1 = vld1q_u8(b + 16); - const uint8x16_t b_2 = vld1q_u8(b + 32); - const uint8x16_t b_3 = vld1q_u8(b + 48); - a += a_stride; - b += b_stride; + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + src_ptr += src_stride; + ref_ptr += ref_stride; abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); @@ -282,33 +288,34 @@ static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, } } -static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs_0 = vdupq_n_u16(0); uint16x8_t abs_1 = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - const uint8x16_t b_0 = vld1q_u8(b); - const uint8x16_t b_1 = vld1q_u8(b + 16); - const uint8x16_t b_2 = vld1q_u8(b + 32); - const uint8x16_t b_3 = vld1q_u8(b + 48); - const uint8x16_t c_0 = vld1q_u8(c); - const uint8x16_t c_1 = vld1q_u8(c + 16); - const uint8x16_t c_2 = vld1q_u8(c + 32); - const uint8x16_t c_3 = vld1q_u8(c + 48); + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t c_0 = vld1q_u8(second_pred); + const uint8x16_t c_1 = vld1q_u8(second_pred + 16); + const uint8x16_t c_2 = vld1q_u8(second_pred + 32); + const uint8x16_t c_3 = vld1q_u8(second_pred + 48); const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - a += a_stride; - b += b_stride; - c += 64; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); @@ -325,19 +332,20 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, } } -#define sad64xN(n) \ - uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ - } \ - \ - uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t abs = \ - sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ +#define sad64xN(n) \ + uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t abs = \ + sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } \ + \ + uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t abs = \ + sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ } sad64xN(32); diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h index ec3975e94..11579c9a9 100644 --- a/vpx_dsp/bitwriter.h +++ b/vpx_dsp/bitwriter.h @@ -27,8 +27,8 @@ typedef struct vpx_writer { uint8_t *buffer; } vpx_writer; -void vpx_start_encode(vpx_writer *bc, uint8_t *buffer); -void vpx_stop_encode(vpx_writer *bc); +void vpx_start_encode(vpx_writer *br, uint8_t *source); +void vpx_stop_encode(vpx_writer *br); static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { unsigned int split; diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c index 94acbb391..455b73bbc 100644 --- a/vpx_dsp/deblock.c +++ b/vpx_dsp/deblock.c @@ -39,11 +39,10 @@ const int16_t vpx_rv[] = { 9, 10, 13, }; -void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, int cols, - unsigned char *f, int size) { +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, + unsigned char *dst, int src_pitch, + int dst_pitch, int cols, + unsigned char *flimits, int size) { unsigned char *p_src, *p_dst; int row; int col; @@ -55,19 +54,21 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, for (row = 0; row < size; row++) { /* post_proc_down for one row */ - p_src = src_ptr; - p_dst = dst_ptr; + p_src = src; + p_dst = dst; for (col = 0; col < cols; col++) { - unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line]; - unsigned char p_above1 = p_src[col - src_pixels_per_line]; - unsigned char p_below1 = p_src[col + src_pixels_per_line]; - unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line]; + unsigned char p_above2 = p_src[col - 2 * src_pitch]; + unsigned char p_above1 = p_src[col - src_pitch]; + unsigned char p_below1 = p_src[col + src_pitch]; + unsigned char p_below2 = p_src[col + 2 * src_pitch]; v = p_src[col]; - if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) && - (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) { + if ((abs(v - p_above2) < flimits[col]) && + (abs(v - p_above1) < flimits[col]) && + (abs(v - p_below1) < flimits[col]) && + (abs(v - p_below2) < flimits[col])) { unsigned char k1, k2, k3; k1 = (p_above2 + p_above1 + 1) >> 1; k2 = (p_below2 + p_below1 + 1) >> 1; @@ -79,8 +80,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, } /* now post_proc_across */ - p_src = dst_ptr; - p_dst = dst_ptr; + p_src = dst; + p_dst = dst; p_src[-2] = p_src[-1] = p_src[0]; p_src[cols] = p_src[cols + 1] = p_src[cols - 1]; @@ -88,10 +89,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, for (col = 0; col < cols; col++) { v = p_src[col]; - if ((abs(v - p_src[col - 2]) < f[col]) && - (abs(v - p_src[col - 1]) < f[col]) && - (abs(v - p_src[col + 1]) < f[col]) && - (abs(v - p_src[col + 2]) < f[col])) { + if ((abs(v - p_src[col - 2]) < flimits[col]) && + (abs(v - p_src[col - 1]) < flimits[col]) && + (abs(v - p_src[col + 1]) < flimits[col]) && + (abs(v - p_src[col + 2]) < flimits[col])) { unsigned char k1, k2, k3; k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; @@ -109,8 +110,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, p_dst[col - 1] = d[(col - 1) & 3]; /* next row */ - src_ptr += src_pixels_per_line; - dst_ptr += dst_pixels_per_line; + src += src_pitch; + dst += dst_pitch; } } diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c index 6dcb3ba66..ef66de024 100644 --- a/vpx_dsp/fwd_txfm.c +++ b/vpx_dsp/fwd_txfm.c @@ -87,11 +87,11 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { output[0] = sum * 2; } -void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; tran_low_t intermediate[64]; int pass; - tran_low_t *output = intermediate; + tran_low_t *out = intermediate; const tran_low_t *in = NULL; // Transform columns @@ -133,10 +133,10 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { t1 = (x0 - x1) * cospi_16_64; t2 = x2 * cospi_24_64 + x3 * cospi_8_64; t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - output[0] = (tran_low_t)fdct_round_shift(t0); - output[2] = (tran_low_t)fdct_round_shift(t2); - output[4] = (tran_low_t)fdct_round_shift(t1); - output[6] = (tran_low_t)fdct_round_shift(t3); + out[0] = (tran_low_t)fdct_round_shift(t0); + out[2] = (tran_low_t)fdct_round_shift(t2); + out[4] = (tran_low_t)fdct_round_shift(t1); + out[6] = (tran_low_t)fdct_round_shift(t3); // Stage 2 t0 = (s6 - s5) * cospi_16_64; @@ -155,19 +155,19 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { t1 = x1 * cospi_12_64 + x2 * cospi_20_64; t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - output[1] = (tran_low_t)fdct_round_shift(t0); - output[3] = (tran_low_t)fdct_round_shift(t2); - output[5] = (tran_low_t)fdct_round_shift(t1); - output[7] = (tran_low_t)fdct_round_shift(t3); - output += 8; + out[1] = (tran_low_t)fdct_round_shift(t0); + out[3] = (tran_low_t)fdct_round_shift(t2); + out[5] = (tran_low_t)fdct_round_shift(t1); + out[7] = (tran_low_t)fdct_round_shift(t3); + out += 8; } in = intermediate; - output = final_output; + out = output; } // Rows for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; + for (j = 0; j < 8; ++j) output[j + i * 8] /= 2; } } @@ -705,9 +705,9 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; - tran_high_t output[32 * 32]; + tran_high_t out[32 * 32]; // Columns for (i = 0; i < 32; ++i) { @@ -715,16 +715,16 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) - out[j + i * 32] = + output[j + i * 32] = (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); } } @@ -732,9 +732,9 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { // Note that although we use dct_32_round in dct32 computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. -void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; - tran_high_t output[32 * 32]; + tran_high_t out[32 * 32]; // Columns for (i = 0; i < 32; ++i) { @@ -745,15 +745,15 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { // TODO(cd): see quality impact of only doing // output[j * 32 + i] = (temp_out[j] + 1) >> 2; // PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; vpx_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j]; } } @@ -772,14 +772,14 @@ void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, vpx_fdct4x4_c(input, output, stride); } -void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct8x8_c(input, final_output, stride); + vpx_fdct8x8_c(input, output, stride); } -void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct8x8_1_c(input, final_output, stride); + vpx_fdct8x8_1_c(input, output, stride); } void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, @@ -792,17 +792,18 @@ void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, vpx_fdct16x16_1_c(input, output, stride); } -void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - vpx_fdct32x32_c(input, out, stride); +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_c(input, output, stride); } -void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct32x32_rd_c(input, out, stride); + vpx_fdct32x32_rd_c(input, output, stride); } -void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct32x32_1_c(input, out, stride); + vpx_fdct32x32_1_c(input, output, stride); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c index 0194aa1e1..69de05e71 100644 --- a/vpx_dsp/inv_txfm.c +++ b/vpx_dsp/inv_txfm.c @@ -67,11 +67,11 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; - const tran_low_t *ip = in; + const tran_low_t *ip = input; tran_low_t *op = tmp; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1346,12 +1346,12 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, } } -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest, +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; - const tran_low_t *ip = in; + const tran_low_t *ip = input; tran_low_t *op = tmp; (void)bd; diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 9866ea37d..47f30c96a 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -109,29 +109,30 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; } -void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch); ++s; } } -void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); + vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1); } void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, @@ -178,31 +179,33 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, } } -void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p); + filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch, + s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch); ++s; } } -void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); + vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, @@ -283,7 +286,8 @@ static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, } } -static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, +static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; @@ -291,34 +295,37 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8 * count; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = - flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, - s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - - filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, - s + 7 * p); + const int8_t flat2 = flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]); + + filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, + s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch, + s + 7 * pitch); ++s; } } -void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1); } -void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2); } -static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, +static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; @@ -335,18 +342,18 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); - s += p; + s += pitch; } } -void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8); } -void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16); } #if CONFIG_VP9_HIGHBITDEPTH @@ -440,7 +447,7 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); } -void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; @@ -448,27 +455,28 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, + s + 1 * pitch, bd); ++s; } } void vpx_highbd_lpf_horizontal_4_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); + vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, @@ -517,33 +525,36 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, } } -void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, - s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, bd); ++s; } } void vpx_highbd_lpf_horizontal_8_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); + vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, @@ -639,7 +650,7 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, } } -static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, @@ -649,44 +660,45 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8 * count; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat2 = - highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, - s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); - - highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, - s + 6 * p, s + 7 * p, bd); + const int8_t flat2 = highbd_flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, + s + 6 * pitch, s + 7 * pitch, bd); ++s; } } -void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); } -void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p, +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd); } -static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, @@ -712,20 +724,20 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, bd); - s += p; + s += pitch; } } -void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd); } -void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c index 9ef04836a..1707d3284 100644 --- a/vpx_dsp/mips/deblock_msa.c +++ b/vpx_dsp/mips/deblock_msa.c @@ -508,11 +508,11 @@ void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst, } } -void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, - int32_t rows, int32_t cols, int32_t flimit) { +void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows, + int32_t cols, int32_t flimit) { int32_t row, col, cnt; - uint8_t *src_dup = src_ptr; - v16u8 src0, src, tmp_orig; + uint8_t *src_dup = src; + v16u8 src0, src1, tmp_orig; v16u8 tmp = { 0 }; v16i8 zero = { 0 }; v8u16 sum_h, src_r_h, src_l_h; @@ -531,13 +531,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, src_dup[cols + 16] = src_dup[cols - 1]; tmp_orig = (v16u8)__msa_ldi_b(0); tmp_orig[15] = tmp[15]; - src = LD_UB(src_dup - 8); - src[15] = 0; - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src1 = LD_UB(src_dup - 8); + src1[15] = 0; + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); sum_sq = HADD_SW_S32(src_r_w) + 16; - sum_h = __msa_hadd_u_h(src, src); + sum_h = __msa_hadd_u_h(src1, src1); sum = HADD_UH_U32(sum_h); { v16u8 src7, src8, src_r, src_l; @@ -566,8 +566,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1]; } sum = sum_l[7]; - src = LD_UB(src_dup + 16 * col); - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src1 = LD_UB(src_dup + 16 * col); + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4); src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4); tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7); @@ -613,7 +613,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, total3 = (total3 < flimit_vec); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); - tmp = __msa_bmz_v(tmp, src, (v16u8)mask); + tmp = __msa_bmz_v(tmp, src1, (v16u8)mask); if (col == 0) { uint64_t src_d; diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index e37ca92ad..82a659592 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -17,7 +17,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int rc = 0; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); @@ -31,7 +31,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 16; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; if (tmp) eob = 0; } *eob_ptr = eob + 1; @@ -41,7 +41,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr) { int eob = -1; @@ -55,7 +55,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int64_t tmp = abs_coeff + round_ptr[0]; const int abs_qcoeff = (int)((tmp * quant) >> 16); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; @@ -65,7 +65,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int n_coeffs = 1024; const int rc = 0; const int coeff = coeff_ptr[rc]; @@ -81,7 +81,7 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2; if (tmp) eob = 0; } *eob_ptr = eob + 1; @@ -92,8 +92,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int n_coeffs = 1024; int eob = -1; @@ -107,7 +106,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); const int abs_qcoeff = (int)((tmp * quant) >> 15); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2; if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 94c8206d9..7cac140e9 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -19,26 +19,25 @@ extern "C" { #endif void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, - const int16_t quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, - const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); #endif #ifdef __cplusplus diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index d4a532968..873ddca09 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -17,54 +17,55 @@ #include "vpx_ports/mem.h" /* Sum the difference between every corresponding element of the buffers. */ -static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { +static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } return sad; } -#define sadMxN(m, n) \ - unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return sad(src, src_stride, ref, ref_stride, m, n); \ - } \ - unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ - vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return sad(src, src_stride, comp_pred, m, m, n); \ +#define sadMxN(m, n) \ + unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } \ + unsigned int vpx_sad##m##x##n##_avg_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ + vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ + return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } // depending on call sites, pass **ref_array to avoid & in subsequent call and // de-dup with 4D below. -#define sadMxNxK(m, n, k) \ - void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ +#define sadMxNxK(m, n, k) \ + void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \ } // This appears to be equivalent to the above when k == 4 and refs is const -#define sadMxNx4D(m, n) \ - void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ +#define sadMxNx4D(m, n) \ + void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ } /* clang-format off */ @@ -133,60 +134,61 @@ sadMxNx4D(4, 4) #if CONFIG_VP9_HIGHBITDEPTH static INLINE - unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, - int b_stride, int width, int height) { + unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int width, + int height) { int y, x; unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src += src_stride; + ref_ptr += ref_stride; } return sad; } -static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, - const uint16_t *b, int b_stride, +static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, int width, int height) { int y, x; unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src += src_stride; + ref_ptr += ref_stride; } return sad; } #define highbd_sadMxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, \ - int ref_stride) { \ - return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + unsigned int vpx_highbd_sad##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ } \ unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \ vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ - n, CONVERT_TO_SHORTPTR(ref), ref_stride); \ - return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ + n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ + return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ - ref_array[i], ref_stride); \ - } \ +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/subtract.c b/vpx_dsp/subtract.c index 95e7071b2..45c819e67 100644 --- a/vpx_dsp/subtract.c +++ b/vpx_dsp/subtract.c @@ -16,37 +16,37 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -void vpx_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src, - ptrdiff_t src_stride, const uint8_t *pred, +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; + for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c]; - diff += diff_stride; - pred += pred_stride; - src += src_stride; + diff_ptr += diff_stride; + pred_ptr += pred_stride; + src_ptr += src_stride; } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src8, - ptrdiff_t src_stride, const uint8_t *pred8, +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src8_ptr, + ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd) { int r, c; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); (void)bd; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) { - diff[c] = src[c] - pred[c]; + diff_ptr[c] = src[c] - pred[c]; } - diff += diff_stride; + diff_ptr += diff_stride; pred += pred_stride; src += src_stride; } diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 13d4b3d9b..f4f89b9f4 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -37,322 +37,322 @@ if ($opts{arch} eq "x86_64") { # Intra prediction # -add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_4x4 sse2/; -add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_4x4 neon sse2/; -add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d63_predictor_4x4 ssse3/; -add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; -add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_4x4 ssse3/; -add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_4x4 neon msa sse2/; -add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/; -add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/; -add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/; -add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/; -add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_8x8 ssse3/; -add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_8x8 neon sse2 vsx/; -add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d63_predictor_8x8 ssse3 vsx/; -add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; -add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_8x8 ssse3/; -add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_16x16 ssse3/; -add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; -add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/; -add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_16x16 neon/; -add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_16x16 ssse3/; -add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_32x32 ssse3/; -add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/; -add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/; -add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; -add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_32x32 neon/; -add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_32x32 ssse3/; -add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/; -add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/; -add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/; -add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/; -add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/; -add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/; # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d207_predictor_4x4 sse2/; - add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; - add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; - add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d117_predictor_4x4 sse2/; - add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d153_predictor_4x4 sse2/; - add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/; - add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/; - add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; - add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; - add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/; - add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/; - add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/; - add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/; - add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/; - add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; - add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; - add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/; - add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/; - add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/; - add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/; - add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/; - add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; - add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; - add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/; - add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/; - add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/; - add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/; - add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/; } # CONFIG_VP9_HIGHBITDEPTH @@ -400,28 +400,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Sub Pixel Filters # - add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64"; } # CONFIG_VP9_HIGHBITDEPTH @@ -888,43 +888,43 @@ specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/; # # Multi-block SAD, comparing a reference to N independent blocks # -add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; @@ -945,7 +945,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Block subtraction # - add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; + add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; # # Single block SAD @@ -990,13 +990,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Avg # - add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p"; + add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p"; specialize qw/vpx_highbd_avg_8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p"; + add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p"; specialize qw/vpx_highbd_avg_4x4 sse2/; - add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad64x64_avg sse2/; @@ -1038,43 +1038,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Multi-block SAD, comparing a reference to N independent blocks # - add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad64x64x4d sse2/; - add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad64x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad32x64x4d sse2/; - add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad32x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad32x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad16x32x4d sse2/; - add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad16x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad16x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad8x16x4d sse2/; - add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad8x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad8x4x4d sse2/; - add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad4x8x4d sse2/; - add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_highbd_sad4x4x4d sse2/; # @@ -1610,7 +1610,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/; - add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; + add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *src, int pitch, int rows, int cols,int flimit"; specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/; add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; diff --git a/vpx_dsp/x86/avg_pred_sse2.c b/vpx_dsp/x86/avg_pred_sse2.c index e7db75559..e4e1e0e7a 100644 --- a/vpx_dsp/x86/avg_pred_sse2.c +++ b/vpx_dsp/x86/avg_pred_sse2.c @@ -15,10 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/mem_sse2.h" -void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { - /* comp and pred must be 16 byte aligned. */ - assert(((intptr_t)comp & 0xf) == 0); + /* comp_pred and pred must be 16 byte aligned. */ + assert(((intptr_t)comp_pred & 0xf) == 0); assert(((intptr_t)pred & 0xf) == 0); if (width > 8) { int x, y; @@ -27,17 +27,17 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); const __m128i avg = _mm_avg_epu8(p, r); - _mm_store_si128((__m128i *)(comp + x), avg); + _mm_store_si128((__m128i *)(comp_pred + x), avg); } - comp += width; + comp_pred += width; pred += width; ref += ref_stride; } } else { // width must be 4 or 8. int i; - // Process 16 elements at a time. comp and pred have width == stride and - // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all - // divisible by 16 so just ref needs to be massaged when loading. + // Process 16 elements at a time. comp_pred and pred have width == stride + // and therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are + // all divisible by 16 so just ref needs to be massaged when loading. for (i = 0; i < width * height; i += 16) { const __m128i p = _mm_load_si128((const __m128i *)pred); __m128i r; @@ -60,10 +60,10 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, ref += 2 * ref_stride; } avg = _mm_avg_epu8(p, r); - _mm_store_si128((__m128i *)comp, avg); + _mm_store_si128((__m128i *)comp_pred, avg); pred += 16; - comp += 16; + comp_pred += 16; } } } diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h index aa60f44f7..8398ec3c1 100644 --- a/vpx_dsp/x86/convolve.h +++ b/vpx_dsp/x86/convolve.h @@ -23,59 +23,59 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, #define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ void vpx_convolve8_##name##_##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ - const int16_t *filter = filter_kernel[offset]; \ + const int16_t *filter_row = filter[offset]; \ (void)x0_q4; \ (void)x_step_q4; \ (void)y0_q4; \ (void)y_step_q4; \ - assert(filter[3] != 128); \ + assert(filter_row[3] != 128); \ assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[6] | filter[7]) { \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ while (w >= 16) { \ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ src += 16; \ dst += 16; \ w -= 16; \ } \ if (w == 8) { \ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } else if (w == 4) { \ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } \ - } else if (filter[2] | filter[5]) { \ + } else if (filter_row[2] | filter_row[5]) { \ while (w >= 16) { \ vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ src += 16; \ dst += 16; \ w -= 16; \ } \ if (w == 8) { \ vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } else if (w == 4) { \ vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } \ } else { \ while (w >= 16) { \ vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ src += 16; \ dst += 16; \ w -= 16; \ } \ if (w == 8) { \ vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } else if (w == 4) { \ vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } \ } \ } @@ -121,86 +121,86 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, unsigned int output_height, const int16_t *filter, int bd); -#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt( \ - const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ - ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ - int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ - const int16_t *filter = filter_kernel[offset]; \ - if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] | filter[1] | filter[6] | filter[7]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else if (filter[2] | filter[5]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ - filter_kernel, x0_q4, x_step_q4, y0_q4, \ - y_step_q4, w, h, bd); \ - } \ +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ + void vpx_highbd_convolve8_##name##_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_row = filter[offset]; \ + if (step_q4 == 16 && filter_row[3] != 128) { \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter_row[2] | filter_row[5]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h, bd); \ + } \ } #define HIGH_FUN_CONV_2D(avg, opt) \ diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index ff5ef5f85..0ffa7f2d4 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -20,7 +20,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, - int width, int h, int bd) { + int w, int h, int bd) { (void)filter; (void)x0_q4; (void)x_step_q4; @@ -28,8 +28,8 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, (void)y_step_q4; (void)bd; - assert(width % 4 == 0); - if (width > 32) { // width = 64 + assert(w % 4 == 0); + if (w > 32) { // w = 64 do { const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); @@ -43,7 +43,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 16) { // width = 32 + } else if (w > 16) { // w = 32 do { const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); @@ -53,7 +53,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 8) { // width = 16 + } else if (w > 8) { // w = 16 __m256i p0, p1; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -67,7 +67,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h -= 2; } while (h > 0); - } else if (width > 4) { // width = 8 + } else if (w > 4) { // w = 8 __m128i p0, p1; do { p0 = _mm_loadu_si128((const __m128i *)src); @@ -81,7 +81,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h -= 2; } while (h > 0); - } else { // width = 4 + } else { // w = 4 __m128i p0, p1; do { p0 = _mm_loadl_epi64((const __m128i *)src); @@ -102,7 +102,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, - int width, int h, int bd) { + int w, int h, int bd) { (void)filter; (void)x0_q4; (void)x_step_q4; @@ -110,8 +110,8 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, (void)y_step_q4; (void)bd; - assert(width % 4 == 0); - if (width > 32) { // width = 64 + assert(w % 4 == 0); + if (w > 32) { // w = 64 __m256i p0, p1, p2, p3, u0, u1, u2, u3; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -130,7 +130,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 16) { // width = 32 + } else if (w > 16) { // w = 32 __m256i p0, p1, u0, u1; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -143,7 +143,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 8) { // width = 16 + } else if (w > 8) { // w = 16 __m256i p0, p1, u0, u1; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -158,7 +158,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride << 1; h -= 2; } while (h > 0); - } else if (width > 4) { // width = 8 + } else if (w > 4) { // w = 8 __m128i p0, p1, u0, u1; do { p0 = _mm_loadu_si128((const __m128i *)src); @@ -172,7 +172,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride << 1; h -= 2; } while (h > 0); - } else { // width = 4 + } else { // w = 4 __m128i p0, p1, u0, u1; do { p0 = _mm_loadl_epi64((const __m128i *)src); diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm index c61b62104..caf506ac0 100644 --- a/vpx_dsp/x86/highbd_intrapred_sse2.asm +++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm @@ -256,7 +256,7 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps +cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd movd m1, [aboveq-2] movq m0, [aboveq] pshuflw m1, m1, 0x0 @@ -264,7 +264,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps movlhps m1, m1 ; tl tl tl tl tl tl tl tl ; Get the values to compute the maximum value at this bit depth pcmpeqw m3, m3 - movd m4, bpsd + movd m4, bdd psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl psllw m3, m4 pcmpeqw m2, m2 @@ -295,7 +295,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps RET INIT_XMM sse2 -cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one +cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one movd m1, [aboveq-2] mova m0, [aboveq] pshuflw m1, m1, 0x0 @@ -304,7 +304,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one pxor m3, m3 pxor m4, m4 pinsrw m3, oned, 0 - pinsrw m4, bpsd, 0 + pinsrw m4, bdd, 0 pshuflw m3, m3, 0x0 DEFINE_ARGS dst, stride, line, left punpcklqdq m3, m3 @@ -339,14 +339,14 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps +cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd movd m2, [aboveq-2] mova m0, [aboveq] mova m1, [aboveq+16] pshuflw m2, m2, 0x0 ; Get the values to compute the maximum value at this bit depth pcmpeqw m3, m3 - movd m4, bpsd + movd m4, bdd punpcklqdq m2, m2 psllw m3, m4 pcmpeqw m5, m5 @@ -386,7 +386,7 @@ cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps +cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd movd m0, [aboveq-2] mova m1, [aboveq] mova m2, [aboveq+16] @@ -395,7 +395,7 @@ cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps pshuflw m0, m0, 0x0 ; Get the values to compute the maximum value at this bit depth pcmpeqw m5, m5 - movd m6, bpsd + movd m6, bdd psllw m5, m6 pcmpeqw m7, m7 pxor m6, m6 ; min possible value diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c index ec22db9f4..f7fb40d51 100644 --- a/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -47,13 +47,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { // TODO(debargha, peter): Break up large functions into smaller ones // in this file. -void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; __m128i ps1, qs1, ps0, qs0; @@ -70,35 +70,35 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, __m128i eight, four; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); } - q4 = _mm_load_si128((__m128i *)(s + 4 * p)); - p4 = _mm_load_si128((__m128i *)(s - 5 * p)); - q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - q0 = _mm_load_si128((__m128i *)(s + 0 * p)); - p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + q4 = _mm_load_si128((__m128i *)(s + 4 * pitch)); + p4 = _mm_load_si128((__m128i *)(s - 5 * pitch)); + q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); + p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); // highbd_filter_mask abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); @@ -111,14 +111,14 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // highbd_hev_mask (in C code this is actually called from highbd_filter4) flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); work = _mm_max_epi16( _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); @@ -132,7 +132,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // return ~mask // lp filter @@ -207,12 +207,12 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // (because, in both vars, each block of 16 either all 1s or all 0s) flat = _mm_and_si128(flat, mask); - p5 = _mm_load_si128((__m128i *)(s - 6 * p)); - q5 = _mm_load_si128((__m128i *)(s + 5 * p)); - p6 = _mm_load_si128((__m128i *)(s - 7 * p)); - q6 = _mm_load_si128((__m128i *)(s + 6 * p)); - p7 = _mm_load_si128((__m128i *)(s - 8 * p)); - q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + p5 = _mm_load_si128((__m128i *)(s - 6 * pitch)); + q5 = _mm_load_si128((__m128i *)(s + 5 * pitch)); + p6 = _mm_load_si128((__m128i *)(s - 7 * pitch)); + q6 = _mm_load_si128((__m128i *)(s + 6 * pitch)); + p7 = _mm_load_si128((__m128i *)(s - 8 * pitch)); + q7 = _mm_load_si128((__m128i *)(s + 7 * pitch)); // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 // but referred to as p0-p4 & q0-q4 in fn) @@ -389,8 +389,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q6 = _mm_and_si128(flat2, flat2_q6); // get values for when (flat2 && flat && mask) q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values - _mm_store_si128((__m128i *)(s - 7 * p), p6); - _mm_store_si128((__m128i *)(s + 6 * p), q6); + _mm_store_si128((__m128i *)(s - 7 * pitch), p6); + _mm_store_si128((__m128i *)(s + 6 * pitch), q6); p5 = _mm_andnot_si128(flat2, p5); // p5 remains unchanged if !(flat2 && flat && mask) @@ -404,8 +404,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // get values for when (flat2 && flat && mask) q5 = _mm_or_si128(q5, flat2_q5); // full list of q5 values - _mm_store_si128((__m128i *)(s - 6 * p), p5); - _mm_store_si128((__m128i *)(s + 5 * p), q5); + _mm_store_si128((__m128i *)(s - 6 * pitch), p5); + _mm_store_si128((__m128i *)(s + 5 * pitch), q5); p4 = _mm_andnot_si128(flat2, p4); // p4 remains unchanged if !(flat2 && flat && mask) @@ -417,8 +417,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q4 = _mm_and_si128(flat2, flat2_q4); // get values for when (flat2 && flat && mask) q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values - _mm_store_si128((__m128i *)(s - 5 * p), p4); - _mm_store_si128((__m128i *)(s + 4 * p), q4); + _mm_store_si128((__m128i *)(s - 5 * pitch), p4); + _mm_store_si128((__m128i *)(s + 4 * pitch), q4); p3 = _mm_andnot_si128(flat2, p3); // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -430,8 +430,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q3 = _mm_and_si128(flat2, flat2_q3); // get values for when (flat2 && flat && mask) q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values - _mm_store_si128((__m128i *)(s - 4 * p), p3); - _mm_store_si128((__m128i *)(s + 3 * p), q3); + _mm_store_si128((__m128i *)(s - 4 * pitch), p3); + _mm_store_si128((__m128i *)(s + 3 * pitch), q3); p2 = _mm_andnot_si128(flat2, p2); // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -444,8 +444,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q2 = _mm_and_si128(flat2, flat2_q2); // get values for when (flat2 && flat && mask) q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); p1 = _mm_andnot_si128(flat2, p1); // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -457,8 +457,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q1 = _mm_and_si128(flat2, flat2_q1); // get values for when (flat2 && flat && mask) q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s + 1 * p), q1); + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); p0 = _mm_andnot_si128(flat2, p0); // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -470,22 +470,22 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q0 = _mm_and_si128(flat2, flat2_q0); // get values for when (flat2 && flat && mask) q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s - 0 * p), q0); + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s - 0 * pitch), q0); } -void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd); - vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd); + vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd); } -void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); @@ -493,16 +493,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); const __m128i zero = _mm_set1_epi16(0); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; - __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_cmpeq_epi16(one, one); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -519,25 +519,25 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, __m128i filter1, filter2; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); t80 = _mm_set1_epi16(0x80); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); t80 = _mm_set1_epi16(0x200); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); t80 = _mm_set1_epi16(0x800); } @@ -553,16 +553,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); mask = _mm_max_epi16(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; mask = _mm_max_epi16(abs_q1q0, mask); @@ -576,7 +576,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // flat_mask4 @@ -674,7 +674,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -694,43 +694,43 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s + 0 * p), q0); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s + 0 * pitch), q0); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); } void vpx_highbd_lpf_horizontal_8_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); } -void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; - __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); const __m128i abs_q1q0 = @@ -760,33 +760,33 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, __m128i filter1, filter2; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); t80 = _mm_set1_epi16(0x80); tff80 = _mm_set1_epi16(0xff80); tffe0 = _mm_set1_epi16(0xffe0); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); @@ -794,23 +794,23 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); } - ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); + ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); + qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); + qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); // filter_mask and hev_mask flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); mask = _mm_max_epi16(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; @@ -822,7 +822,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // filter4 @@ -872,18 +872,18 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); } void vpx_highbd_lpf_horizontal_4_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); } static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], @@ -998,9 +998,9 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, highbd_transpose(src1, in_p, dest1, out_p, 1); } -void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; @@ -1009,7 +1009,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, src[0] = s - 4; dst[0] = t_dst; - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); @@ -1018,11 +1018,11 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[0] = s - 4; // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose(src, 8, dst, pitch, 1); } void vpx_highbd_lpf_vertical_4_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); @@ -1030,7 +1030,7 @@ void vpx_highbd_lpf_vertical_4_dual_sse2( uint16_t *dst[2]; // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, @@ -1038,15 +1038,15 @@ void vpx_highbd_lpf_vertical_4_dual_sse2( src[0] = t_dst; src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose(src, 16, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; @@ -1055,7 +1055,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, src[0] = s - 4; dst[0] = t_dst; - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); @@ -1064,11 +1064,11 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[0] = s - 4; // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose(src, 8, dst, pitch, 1); } void vpx_highbd_lpf_vertical_8_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); @@ -1076,7 +1076,7 @@ void vpx_highbd_lpf_vertical_8_dual_sse2( uint16_t *dst[2]; // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, @@ -1085,13 +1085,14 @@ void vpx_highbd_lpf_vertical_8_dual_sse2( src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose(src, 16, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); @@ -1104,7 +1105,7 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[1] = t_dst + 8 * 8; // Transpose 16x8 - highbd_transpose(src, p, dst, 8, 2); + highbd_transpose(src, pitch, dst, 8, 2); // Loop filtering vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, @@ -1115,24 +1116,25 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[1] = s; // Transpose back - highbd_transpose(src, 8, dst, p, 2); + highbd_transpose(src, 8, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[256]); // Transpose 16x16 - highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); // Loop filtering vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); // Transpose back - highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, + pitch); } diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c index 6652a62dc..85a731426 100644 --- a/vpx_dsp/x86/loopfilter_avx2.c +++ b/vpx_dsp/x86/loopfilter_avx2.c @@ -13,38 +13,38 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { @@ -59,12 +59,12 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -76,7 +76,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -136,21 +136,21 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch)); q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); flat2 = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), @@ -321,44 +321,44 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); } } @@ -367,10 +367,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -380,32 +380,32 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); - - p256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); - p256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); - p256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); - p256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); - p256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); - q256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); - q256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); - q256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); - q256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); - q256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); + + p256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch))); + p256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch))); + p256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch))); + p256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch))); + p256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch))); + q256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch))); + q256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch))); + q256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch))); + q256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch))); + q256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch))); p4 = _mm256_castsi256_si128(p256_4); p3 = _mm256_castsi256_si128(p256_3); @@ -431,12 +431,12 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); @@ -450,7 +450,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -532,9 +532,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat = _mm_and_si128(flat, mask); p256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch))); q256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch))); p5 = _mm256_castsi256_si128(p256_5); q5 = _mm256_castsi256_si128(q256_5); flat2 = _mm_max_epu8( @@ -543,9 +543,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat2 = _mm_max_epu8(work, flat2); p256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch))); q256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch))); p6 = _mm256_castsi256_si128(p256_6); q6 = _mm256_castsi256_si128(q256_6); work = _mm_max_epu8( @@ -555,9 +555,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat2 = _mm_max_epu8(work, flat2); p256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch))); q256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch))); p7 = _mm256_castsi256_si128(p256_7); q7 = _mm256_castsi256_si128(q256_7); work = _mm_max_epu8( @@ -843,71 +843,71 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, p6 = _mm_andnot_si128(flat2, p6); flat2_p6 = _mm_and_si128(flat2, flat2_p6); p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); p5 = _mm_andnot_si128(flat2, p5); flat2_p5 = _mm_and_si128(flat2, flat2_p5); p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); p4 = _mm_andnot_si128(flat2, p4); flat2_p4 = _mm_and_si128(flat2, flat2_p4); p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); p3 = _mm_andnot_si128(flat2, p3); flat2_p3 = _mm_and_si128(flat2, flat2_p3); p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); p2 = _mm_andnot_si128(flat2, p2); flat2_p2 = _mm_and_si128(flat2, flat2_p2); p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); p1 = _mm_andnot_si128(flat2, p1); flat2_p1 = _mm_and_si128(flat2, flat2_p1); p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); p0 = _mm_andnot_si128(flat2, p0); flat2_p0 = _mm_and_si128(flat2, flat2_p0); p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); q0 = _mm_andnot_si128(flat2, q0); flat2_q0 = _mm_and_si128(flat2, flat2_q0); q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0); q1 = _mm_andnot_si128(flat2, q1); flat2_q1 = _mm_and_si128(flat2, flat2_q1); q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); q2 = _mm_andnot_si128(flat2, q2); flat2_q2 = _mm_and_si128(flat2, flat2_q2); q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); q3 = _mm_andnot_si128(flat2, q3); flat2_q3 = _mm_and_si128(flat2, flat2_q3); q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); q4 = _mm_andnot_si128(flat2, q4); flat2_q4 = _mm_and_si128(flat2, flat2_q4); q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); q5 = _mm_andnot_si128(flat2, q5); flat2_q5 = _mm_and_si128(flat2, flat2_q5); q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); q6 = _mm_andnot_si128(flat2, q6); flat2_q6 = _mm_and_si128(flat2, flat2_q6); q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); } } diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index 853c4d270..20dcb0d22 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -31,7 +31,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ hev = \ _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_cmpgt_epi16(hev, thresh_v); \ hev = _mm_packs_epi16(hev, hev); \ \ /* const int8_t mask = filter_mask(*limit, *blimit, */ \ @@ -52,7 +52,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { flat = _mm_max_epu8(work, flat); \ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_subs_epu8(mask, limit_v); \ mask = _mm_cmpeq_epi8(mask, zero); \ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ } while (0) @@ -104,27 +104,26 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ } while (0) -void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s + 0 * p))); - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); + p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 4 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 0 * pitch))); + q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); @@ -133,41 +132,40 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, FILTER_HEV_MASK; FILTER4; - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 + _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0)); // *op1 + _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0); // *oq0 + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0)); // *oq1 } -void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i x0, x1, x2, x3; __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); + q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4))); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4))); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4))); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4))); // Transpose 8x8 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 @@ -213,52 +211,52 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - storeu_uint32(s + 0 * p - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 1 * p - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 2 * p - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 3 * p - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); - storeu_uint32(s + 4 * p - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 5 * p - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 6 * p - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 7 * p - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); } -void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat, flat2; __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { @@ -270,12 +268,12 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -285,7 +283,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -343,18 +341,18 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch)); q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); @@ -521,44 +519,44 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); } } @@ -592,15 +590,15 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat, flat2; __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; @@ -610,22 +608,22 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, __m128i max_abs_p1p0q1q0; - p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch)); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch)); + p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); { const __m128i abs_p1p0 = abs_diff(p1, p0); @@ -639,7 +637,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); @@ -649,7 +647,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, mask = _mm_max_epu8(work, mask); work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -695,7 +693,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, oq0 = _mm_xor_si128(q0, t80); oq1 = _mm_xor_si128(q1, t80); - hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); @@ -852,82 +850,82 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2); f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ } } -void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -935,21 +933,21 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 2 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 0 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); p0q0 = _mm_shuffle_epi32(q0p0, 78); @@ -965,12 +963,12 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -980,7 +978,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); // flat_mask4 @@ -998,14 +996,22 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, unsigned char *src = s; { __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); @@ -1051,13 +1057,13 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, const __m128i t80 = _mm_set1_epi8(0x80); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i ps1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1103,7 +1109,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_loadl_epi64((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -1121,27 +1127,25 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_loadl_epi64((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0); + _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0); + _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1); + _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2); } } -void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { +void vpx_lpf_horizontal_8_dual_sse2( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -1150,26 +1154,26 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); const __m128i zero = _mm_set1_epi16(0); const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); @@ -1228,14 +1232,22 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, do { __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); @@ -1287,13 +1299,13 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1345,7 +1357,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -1363,49 +1375,49 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); } } -void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { +void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit0, + const unsigned char *limit0, + const unsigned char *thresh0, + const unsigned char *blimit1, + const unsigned char *limit1, + const unsigned char *thresh1) { const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); const __m128i zero = _mm_set1_epi16(0); __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i mask, hev, flat; - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); // filter_mask and hev_mask { @@ -1456,13 +1468,13 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1507,10 +1519,10 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); } } @@ -1650,7 +1662,7 @@ static INLINE void transpose(unsigned char *src[], int in_p, } while (++idx8x8 < num_8x8_to_transpose); } -void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { @@ -1659,7 +1671,7 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, unsigned char *dst[2]; // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, @@ -1667,13 +1679,13 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, src[0] = t_dst; src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - transpose(src, 16, dst, p, 2); + transpose(src, 16, dst, pitch, 2); } -void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { @@ -1685,7 +1697,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, src[0] = s - 4; dst[0] = t_dst; - transpose(src, p, dst, 8, 1); + transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); @@ -1694,10 +1706,10 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, dst[0] = s - 4; // Transpose back - transpose(src, 8, dst, p, 1); + transpose(src, 8, dst, pitch, 1); } -void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { @@ -1706,7 +1718,7 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, unsigned char *dst[2]; // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, @@ -1715,13 +1727,13 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - transpose(src, 16, dst, p, 2); + transpose(src, 16, dst, pitch, 2); } -void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { @@ -1735,7 +1747,7 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, dst[1] = t_dst + 8 * 8; // Transpose 16x8 - transpose(src, p, dst, 8, 2); + transpose(src, pitch, dst, 8, 2); // Loop filtering vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); @@ -1746,22 +1758,22 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, dst[1] = s; // Transpose back - transpose(src, 8, dst, p, 2); + transpose(src, 8, dst, pitch, 2); } -void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { DECLARE_ALIGNED(16, unsigned char, t_dst[256]); // Transpose 16x16 - transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); // Loop filtering vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); // Transpose back - transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch); } diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 6f4489004..692cf59d9 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -24,8 +24,8 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -37,7 +37,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -97,8 +97,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(coeff0, dqcoeff_ptr); store_tran_low(coeff1, dqcoeff_ptr + 8); - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, - zero); + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. @@ -141,20 +140,22 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(coeff0, dqcoeff_ptr + index); store_tran_low(coeff1, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); } *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); @@ -167,7 +168,7 @@ void vpx_quantize_b_32x32_avx( __m128i all_zero; __m128i eob = zero, eob0; - (void)scan_ptr; + (void)scan; (void)n_coeffs; (void)skip_block; assert(!skip_block); @@ -253,8 +254,7 @@ void vpx_quantize_b_32x32_avx( store_tran_low(coeff0, dqcoeff_ptr); store_tran_low(coeff1, dqcoeff_ptr + 8); - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, - zero); + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. @@ -306,8 +306,8 @@ void vpx_quantize_b_32x32_avx( store_tran_low(coeff0, dqcoeff_ptr + index); store_tran_low(coeff1, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index c020b398c..327fb052a 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -22,8 +22,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); int index = 16; @@ -33,7 +33,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -81,8 +81,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(coeff0, dqcoeff_ptr); store_tran_low(coeff1, dqcoeff_ptr + 8); - eob = - scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -115,8 +114,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(coeff0, dqcoeff_ptr + index); store_tran_low(coeff1, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); index += 16; diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 3f528e1a9..d8b6dc78a 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -22,7 +22,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); int index = 16; @@ -32,7 +32,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -74,8 +74,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(coeff0, dqcoeff_ptr); store_tran_low(coeff1, dqcoeff_ptr + 8); - eob = - scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -106,8 +105,8 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(coeff0, dqcoeff_ptr + index); store_tran_low(coeff1, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); index += 16; @@ -116,12 +115,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); int index; @@ -133,7 +134,7 @@ void vpx_quantize_b_32x32_ssse3( __m128i all_zero; __m128i eob = zero, eob0; - (void)scan_ptr; + (void)scan; (void)n_coeffs; (void)skip_block; assert(!skip_block); @@ -226,8 +227,7 @@ void vpx_quantize_b_32x32_ssse3( store_tran_low(coeff0, dqcoeff_ptr); store_tran_low(coeff1, dqcoeff_ptr + 8); - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, - zero); + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. @@ -283,8 +283,8 @@ void vpx_quantize_b_32x32_ssse3( store_tran_low(coeff0, dqcoeff_ptr + index); store_tran_low(coeff1, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_x86.h b/vpx_dsp/x86/quantize_x86.h index bb9e32f71..a6e2f274d 100644 --- a/vpx_dsp/x86/quantize_x86.h +++ b/vpx_dsp/x86/quantize_x86.h @@ -48,17 +48,17 @@ static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { return _mm_mullo_epi16(qcoeff, dequant); } -// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing -// to zbin to add 1 to the index in 'scan'. +// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to +// zbin to add 1 to the index in 'scan'. static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, const __m128i zbin_mask0, const __m128i zbin_mask1, - const int16_t *scan_ptr, const int index, + const int16_t *scan, const int index, const __m128i zero) { const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); - __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); - __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); __m128i eob0, eob1; // Add one to convert from indices to counts scan0 = _mm_sub_epi16(scan0, zbin_mask0); diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 2c6b36f17..b18fecf70 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -12,26 +12,26 @@ #include "vpx/vpx_integer.h" static INLINE void calc_final(const __m256i *const sums /*[4]*/, - uint32_t res[4]) { + uint32_t sad_array[4]) { const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); const __m256i t2 = _mm256_hadd_epi32(t0, t1); const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), _mm256_extractf128_si256(t2, 1)); - _mm_storeu_si128((__m128i *)res, sum); + _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; - refs[0] = ref[0]; - refs[1] = ref[1]; - refs[2] = ref[2]; - refs[3] = ref[3]; + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; sums[0] = _mm256_setzero_si256(); sums[1] = _mm256_setzero_si256(); sums[2] = _mm256_setzero_si256(); @@ -40,46 +40,46 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride, for (i = 0; i < 32; i++) { __m256i r[4]; - // load src and all refs - const __m256i s = _mm256_load_si256((const __m256i *)src); + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); - // sum of the absolute differences between every ref-i to src + // sum of the absolute differences between every ref[] to src r[0] = _mm256_sad_epu8(r[0], s); r[1] = _mm256_sad_epu8(r[1], s); r[2] = _mm256_sad_epu8(r[2], s); r[3] = _mm256_sad_epu8(r[3], s); - // sum every ref-i + // sum every ref[] sums[0] = _mm256_add_epi32(sums[0], r[0]); sums[1] = _mm256_add_epi32(sums[1], r[1]); sums[2] = _mm256_add_epi32(sums[2], r[2]); sums[3] = _mm256_add_epi32(sums[3], r[3]); - src += src_stride; + src_ptr += src_stride; refs[0] += ref_stride; refs[1] += ref_stride; refs[2] += ref_stride; refs[3] += ref_stride; } - calc_final(sums, res); + calc_final(sums, sad_array); } -void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t *refs[4]; - refs[0] = ref[0]; - refs[1] = ref[1]; - refs[2] = ref[2]; - refs[3] = ref[3]; + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; sums[0] = _mm256_setzero_si256(); sums[1] = _mm256_setzero_si256(); sums[2] = _mm256_setzero_si256(); @@ -87,9 +87,9 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, for (i = 0; i < 64; i++) { __m256i r_lo[4], r_hi[4]; - // load 64 bytes from src and all refs - const __m256i s_lo = _mm256_load_si256((const __m256i *)src); - const __m256i s_hi = _mm256_load_si256((const __m256i *)(src + 32)); + // load 64 bytes from src and all ref[] + const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); + const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32)); r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]); r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32)); r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]); @@ -99,7 +99,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]); r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32)); - // sum of the absolute differences between every ref-i to src + // sum of the absolute differences between every ref[] to src r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo); r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo); r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo); @@ -109,7 +109,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi); r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi); - // sum every ref-i + // sum every ref[] sums[0] = _mm256_add_epi32(sums[0], r_lo[0]); sums[1] = _mm256_add_epi32(sums[1], r_lo[1]); sums[2] = _mm256_add_epi32(sums[2], r_lo[2]); @@ -119,12 +119,12 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, sums[2] = _mm256_add_epi32(sums[2], r_hi[2]); sums[3] = _mm256_add_epi32(sums[3], r_hi[3]); - src += src_stride; + src_ptr += src_stride; refs[0] += ref_stride; refs[1] += ref_stride; refs[2] += ref_stride; refs[3] += ref_stride; } - calc_final(sums, res); + calc_final(sums, sad_array); } diff --git a/vpx_dsp/x86/sad4d_avx512.c b/vpx_dsp/x86/sad4d_avx512.c index 5f2ab6ea7..4c5d70464 100644 --- a/vpx_dsp/x86/sad4d_avx512.c +++ b/vpx_dsp/x86/sad4d_avx512.c @@ -11,8 +11,8 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t res[4]) { __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; @@ -20,33 +20,33 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, int i; const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; + ref0 = ref_array[0]; + ref1 = ref_array[1]; + ref2 = ref_array[2]; + ref3 = ref_array[3]; sum_ref0 = _mm512_set1_epi16(0); sum_ref1 = _mm512_set1_epi16(0); sum_ref2 = _mm512_set1_epi16(0); sum_ref3 = _mm512_set1_epi16(0); for (i = 0; i < 64; i++) { - // load src and all refs - src_reg = _mm512_loadu_si512((const __m512i *)src); + // load src and all ref[] + src_reg = _mm512_loadu_si512((const __m512i *)src_ptr); ref0_reg = _mm512_loadu_si512((const __m512i *)ref0); ref1_reg = _mm512_loadu_si512((const __m512i *)ref1); ref2_reg = _mm512_loadu_si512((const __m512i *)ref2); ref3_reg = _mm512_loadu_si512((const __m512i *)ref3); - // sum of the absolute differences between every ref-i to src + // sum of the absolute differences between every ref[] to src ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg); ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg); ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg); - // sum every ref-i + // sum every ref[] sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg); sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg); sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg); sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg); - src += src_stride; + src_ptr += src_stride; ref0 += ref_stride; ref1 += ref_stride; ref2 += ref_stride; @@ -55,7 +55,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, { __m256i sum256; __m128i sum128; - // in sum_ref-i the result is saved in the first 4 bytes + // in sum_ref[] the result is saved in the first 4 bytes // the other 4 bytes are zeroed. // sum_ref1 and sum_ref3 are shifted left by 4 bytes sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4); @@ -65,7 +65,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1); sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3); - // merge every 64 bit from each sum_ref-i + // merge every 64 bit from each sum_ref[] sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2); sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2); diff --git a/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index d83507dc9..e6e72b826 100644 --- a/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -45,7 +45,7 @@ ;Compute max and min values of a pixel mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b @@ -121,7 +121,7 @@ ;Compute max and min values of a pixel mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b diff --git a/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm index 9bffe504b..87bf75ebb 100644 --- a/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm +++ b/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -26,7 +26,7 @@ pshufd xmm3, xmm3, 0 mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm5, rdx movq xmm2, rcx pshufd xmm5, xmm5, 0b @@ -82,7 +82,7 @@ pshufd xmm4, xmm4, 0 mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm8, rdx movq xmm5, rcx pshufd xmm8, xmm8, 0b |