diff options
author | clang-format <noreply@google.com> | 2016-07-26 20:43:23 -0700 |
---|---|---|
committer | James Zern <jzern@google.com> | 2016-08-02 16:47:11 -0700 |
commit | e0cc52db3fc9b09c99d7bbee35153cf82964a860 (patch) | |
tree | 4988f1d3a21056339e2ffbd7a3b3d52fab54cb6b /vp9/encoder/x86 | |
parent | 3a04c9c9c4c4935925f4c00dcc70610100c5e9dd (diff) | |
download | libvpx-e0cc52db3fc9b09c99d7bbee35153cf82964a860.tar libvpx-e0cc52db3fc9b09c99d7bbee35153cf82964a860.tar.gz libvpx-e0cc52db3fc9b09c99d7bbee35153cf82964a860.tar.bz2 libvpx-e0cc52db3fc9b09c99d7bbee35153cf82964a860.zip |
vp9/encoder: apply clang-format
Change-Id: I45d9fb4013f50766b24363a86365e8063e8954c2
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r-- | vp9/encoder/x86/vp9_dct_intrin_sse2.c | 153 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_dct_ssse3.c | 80 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_denoiser_sse2.c | 119 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 66 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_error_intrin_avx2.c | 12 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_frame_scale_ssse3.c | 46 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c | 30 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_quantize_sse2.c | 71 |
8 files changed, 269 insertions, 308 deletions
diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index fa37b6fed..0712779b7 100644 --- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -78,8 +78,8 @@ static void fdct4_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[4], v[4]; - u[0]=_mm_unpacklo_epi16(in[0], in[1]); - u[1]=_mm_unpacklo_epi16(in[3], in[2]); + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpacklo_epi16(in[3], in[2]); v[0] = _mm_add_epi16(u[0], u[1]); v[1] = _mm_sub_epi16(u[0], u[1]); @@ -151,14 +151,12 @@ static void fadst4_sse2(__m128i *in) { transpose_4x4(in); } -void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, - int stride, int tx_type) { +void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { __m128i in[4]; switch (tx_type) { - case DCT_DCT: - vpx_fdct4x4_sse2(input, output, stride); - break; + case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break; case ADST_DCT: load_buffer_4x4(input, in, stride); fadst4_sse2(in); @@ -177,21 +175,18 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, fadst4_sse2(in); write_buffer_4x4(output, in); break; - default: - assert(0); - break; + default: assert(0); break; } } void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, - int16_t* coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t* zbin_ptr, - const int16_t* round_ptr, const int16_t* quant_ptr, - const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, - int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, - uint16_t* eob_ptr, - const int16_t* scan_ptr, - const int16_t* iscan_ptr) { + int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { __m128i zero; int pass; // Constants @@ -208,14 +203,14 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); __m128i *in[8]; int index = 0; @@ -469,9 +464,9 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, // Setup global values { - round = _mm_load_si128((const __m128i*)round_ptr); - quant = _mm_load_si128((const __m128i*)quant_ptr); - dequant = _mm_load_si128((const __m128i*)dequant_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); } { @@ -503,15 +498,15 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); } { @@ -524,8 +519,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -568,14 +563,14 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); } { @@ -588,8 +583,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -615,10 +610,10 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, } } else { do { - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; @@ -628,14 +623,14 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, // load 8x8 array static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, int stride) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); in[0] = _mm_slli_epi16(in[0], 2); in[1] = _mm_slli_epi16(in[1], 2); @@ -930,14 +925,14 @@ static void fadst8_sse2(__m128i *in) { __m128i in0, in1, in2, in3, in4, in5, in6, in7; // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; // column transformation // stage 1 @@ -1135,14 +1130,12 @@ static void fadst8_sse2(__m128i *in) { array_transpose_8x8(in, in); } -void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, - int stride, int tx_type) { +void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { __m128i in[8]; switch (tx_type) { - case DCT_DCT: - vpx_fdct8x8_sse2(input, output, stride); - break; + case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break; case ADST_DCT: load_buffer_8x8(input, in, stride); fadst8_sse2(in); @@ -1164,13 +1157,11 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - default: - assert(0); - break; + default: assert(0); break; } } -static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, +static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0, __m128i *in1, int stride) { // load first 8 columns load_buffer_8x8(input, in0, stride); @@ -1530,13 +1521,13 @@ static void fdct16_8col(__m128i *in) { v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - in[1] = _mm_packs_epi32(v[0], v[1]); - in[9] = _mm_packs_epi32(v[2], v[3]); - in[5] = _mm_packs_epi32(v[4], v[5]); + in[1] = _mm_packs_epi32(v[0], v[1]); + in[9] = _mm_packs_epi32(v[2], v[3]); + in[5] = _mm_packs_epi32(v[4], v[5]); in[13] = _mm_packs_epi32(v[6], v[7]); - in[3] = _mm_packs_epi32(v[8], v[9]); + in[3] = _mm_packs_epi32(v[8], v[9]); in[11] = _mm_packs_epi32(v[10], v[11]); - in[7] = _mm_packs_epi32(v[12], v[13]); + in[7] = _mm_packs_epi32(v[12], v[13]); in[15] = _mm_packs_epi32(v[14], v[15]); } @@ -2022,14 +2013,12 @@ static void fadst16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); } -void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, - int stride, int tx_type) { +void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { __m128i in0[16], in1[16]; switch (tx_type) { - case DCT_DCT: - vpx_fdct16x16_sse2(input, output, stride); - break; + case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break; case ADST_DCT: load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); @@ -2051,8 +2040,6 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - default: - assert(0); - break; + default: assert(0); break; } } diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c index 1a1d4eabc..fb2a92541 100644 --- a/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/vp9/encoder/x86/vp9_dct_ssse3.c @@ -15,16 +15,12 @@ #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, - int16_t* coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t* zbin_ptr, - const int16_t* round_ptr, const int16_t* quant_ptr, - const int16_t* quant_shift_ptr, - int16_t* qcoeff_ptr, - int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, - uint16_t* eob_ptr, - const int16_t* scan_ptr, - const int16_t* iscan_ptr) { +void vp9_fdct8x8_quant_ssse3( + const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; int pass; // Constants @@ -42,14 +38,14 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); __m128i *in[8]; int index = 0; @@ -298,9 +294,9 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, // Setup global values { - round = _mm_load_si128((const __m128i*)round_ptr); - quant = _mm_load_si128((const __m128i*)quant_ptr); - dequant = _mm_load_si128((const __m128i*)dequant_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); } { @@ -332,15 +328,15 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); } { @@ -353,8 +349,8 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -388,7 +384,7 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, round); @@ -402,20 +398,20 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); } else { - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); } } @@ -429,8 +425,8 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -456,10 +452,10 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, } } else { do { - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c index 883507af3..91d0602f9 100644 --- a/vp9/encoder/x86/vp9_denoiser_sse2.c +++ b/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -37,17 +37,11 @@ static INLINE int sum_diff_16x1(__m128i acc_diff) { } // Denoise a 16x1 vector. -static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig, - const uint8_t *mc_running_avg_y, - uint8_t *running_avg_y, - const __m128i *k_0, - const __m128i *k_4, - const __m128i *k_8, - const __m128i *k_16, - const __m128i *l3, - const __m128i *l32, - const __m128i *l21, - __m128i acc_diff) { +static INLINE __m128i vp9_denoiser_16x1_sse2( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const __m128i *k_0, const __m128i *k_4, const __m128i *k_8, + const __m128i *k_16, const __m128i *l3, const __m128i *l32, + const __m128i *l21, __m128i acc_diff) { // Calculate differences const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = @@ -69,7 +63,7 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig, __m128i adj2 = _mm_and_si128(mask2, *l32); const __m128i adj1 = _mm_and_si128(mask1, *l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); - __m128i adj, padj, nadj; + __m128i adj, padj, nadj; // Combine the adjustments and get absolute adjustments. adj2 = _mm_add_epi8(adj2, adj1); @@ -95,9 +89,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig, // Denoise a 16x1 vector with a weaker filter. static INLINE __m128i vp9_denoiser_adj_16x1_sse2( - const uint8_t *sig, const uint8_t *mc_running_avg_y, - uint8_t *running_avg_y, const __m128i k_0, - const __m128i k_delta, __m128i acc_diff) { + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const __m128i k_0, const __m128i k_delta, __m128i acc_diff) { __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); // Calculate differences. const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); @@ -108,8 +101,7 @@ static INLINE __m128i vp9_denoiser_adj_16x1_sse2( // Obtain the sign. FF if diff is negative. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); // Clamp absolute difference to delta to get the adjustment. - const __m128i adj = - _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); // Restore the sign and get positive and negative adjustments. __m128i padj, nadj; padj = _mm_andnot_si128(diff_sign, adj); @@ -126,14 +118,17 @@ static INLINE __m128i vp9_denoiser_adj_16x1_sse2( } // Denoise 8x8 and 8x16 blocks. -static int vp9_denoiser_NxM_sse2_small( - const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y, - int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride, - int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) { +static int vp9_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude, int width) { int sum_diff_thresh, r, sum_diff = 0; - const int shift_inc = (increase_denoising && - motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? - 1 : 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; __m128i acc_diff = _mm_setzero_si128(); const __m128i k_0 = _mm_setzero_si128(); @@ -153,15 +148,13 @@ static int vp9_denoiser_NxM_sse2_small( memcpy(sig_buffer[r], sig, width); memcpy(sig_buffer[r] + width, sig + sig_stride, width); memcpy(mc_running_buffer[r], mc_running_avg_y, width); - memcpy(mc_running_buffer[r] + width, - mc_running_avg_y + mc_avg_y_stride, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); memcpy(running_buffer[r], running_avg_y, width); memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); - acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], - mc_running_buffer[r], - running_buffer[r], - &k_0, &k_4, &k_8, &k_16, - &l3, &l32, &l21, acc_diff); + acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r], + running_buffer[r], &k_0, &k_4, &k_8, + &k_16, &l3, &l32, &l21, acc_diff); memcpy(running_avg_y, running_buffer[r], width); memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width); // Update pointers for next iteration. @@ -184,19 +177,19 @@ static int vp9_denoiser_NxM_sse2_small( // The delta is set by the excess of absolute pixel diff over the // threshold. - const int delta = ((abs(sum_diff) - sum_diff_thresh) >> - num_pels_log2_lookup[bs]) + 1; + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { const __m128i k_delta = _mm_set1_epi8(delta); running_avg_y -= avg_y_stride * (b_height << 1); for (r = 0; r < b_height; ++r) { acc_diff = vp9_denoiser_adj_16x1_sse2( - sig_buffer[r], mc_running_buffer[r], running_buffer[r], - k_0, k_delta, acc_diff); + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0, + k_delta, acc_diff); memcpy(running_avg_y, running_buffer[r], width); - memcpy(running_avg_y + avg_y_stride, - running_buffer[r] + width, width); + memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, + width); // Update pointers for next iteration. running_avg_y += (avg_y_stride << 1); } @@ -216,14 +209,14 @@ static int vp9_denoiser_NxM_sse2_small( static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y, int mc_avg_y_stride, - uint8_t *running_avg_y, - int avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude) { int sum_diff_thresh, r, c, sum_diff = 0; - const int shift_inc = (increase_denoising && - motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? - 1 : 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; __m128i acc_diff[4][4]; const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); @@ -248,9 +241,9 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, for (r = 0; r < b_height; ++r) { for (c = 0; c < b_width_shift4; ++c) { - acc_diff[c][r>>4] = vp9_denoiser_16x1_sse2( - sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, - &k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r>>4]); + acc_diff[c][r >> 4] = vp9_denoiser_16x1_sse2( + sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3, + &l32, &l21, acc_diff[c][r >> 4]); // Update pointers for next iteration. sig += 16; mc_running_avg_y += 16; @@ -259,7 +252,7 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { for (c = 0; c < b_width_shift4; ++c) { - sum_diff += sum_diff_16x1(acc_diff[c][r>>4]); + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); } } @@ -272,8 +265,8 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, { sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); if (abs(sum_diff) > sum_diff_thresh) { - const int delta = ((abs(sum_diff) - sum_diff_thresh) >> - num_pels_log2_lookup[bs]) + 1; + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { @@ -284,9 +277,9 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, sum_diff = 0; for (r = 0; r < b_height; ++r) { for (c = 0; c < b_width_shift4; ++c) { - acc_diff[c][r>>4] = vp9_denoiser_adj_16x1_sse2( - sig, mc_running_avg_y, running_avg_y, k_0, - k_delta, acc_diff[c][r>>4]); + acc_diff[c][r >> 4] = + vp9_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y, + k_0, k_delta, acc_diff[c][r >> 4]); // Update pointers for next iteration. sig += 16; mc_running_avg_y += 16; @@ -295,7 +288,7 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { for (c = 0; c < b_width_shift4; ++c) { - sum_diff += sum_diff_16x1(acc_diff[c][r>>4]); + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); } } sig = sig - b_width + sig_stride; @@ -314,27 +307,21 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, } int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride, - const uint8_t *mc_avg, - int mc_avg_stride, + const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, - int increase_denoising, - BLOCK_SIZE bs, + int increase_denoising, BLOCK_SIZE bs, int motion_magnitude) { // Rank by frequency of the block type to have an early termination. if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || bs == BLOCK_32X64 || bs == BLOCK_64X32) { - return vp9_denoiser_NxM_sse2_big(sig, sig_stride, - mc_avg, mc_avg_stride, - avg, avg_stride, - increase_denoising, - bs, motion_magnitude); + return vp9_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride, + avg, avg_stride, increase_denoising, bs, + motion_magnitude); } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { - return vp9_denoiser_NxM_sse2_small(sig, sig_stride, - mc_avg, mc_avg_stride, - avg, avg_stride, - increase_denoising, - bs, motion_magnitude, 8); + return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride, + avg, avg_stride, increase_denoising, bs, + motion_magnitude, 8); } else { return COPY_BLOCK; } diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index cd3e87ec8..36bcf1536 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -9,7 +9,7 @@ */ #if defined(_MSC_VER) -# include <intrin.h> +#include <intrin.h> #endif #include <emmintrin.h> #include <smmintrin.h> @@ -19,11 +19,11 @@ #include "vpx_ports/mem.h" #ifdef __GNUC__ -# define LIKELY(v) __builtin_expect(v, 1) -# define UNLIKELY(v) __builtin_expect(v, 0) +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) #else -# define LIKELY(v) (v) -# define UNLIKELY(v) (v) +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) #endif static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { @@ -40,19 +40,19 @@ static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { return mv.as_int == 0 ? 0 : 1; } -static INLINE int mv_cost(const int_mv mv, - const int *joint_cost, int *const comp_cost[2]) { - return joint_cost[get_mv_joint(mv)] + - comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col]; +static INLINE int mv_cost(const int_mv mv, const int *joint_cost, + int *const comp_cost[2]) { + return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + + comp_cost[1][mv.as_mv.col]; } static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, int sad_per_bit) { - const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row, - mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO((unsigned)mv_cost(diff, x->nmvjointsadcost, - x->nmvsadcost) * - sad_per_bit, VP9_PROB_COST_SHIFT); + const int_mv diff = + pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); } /***************************************************************************** @@ -71,10 +71,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, * which does not rely on these properties. * *****************************************************************************/ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, - const search_site_config *cfg, - MV *ref_mv, MV *best_mv, int search_param, - int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const search_site_config *cfg, MV *ref_mv, + MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max); const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); @@ -91,12 +90,12 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, // 0 = initial step (MAX_FIRST_STEP) pel // 1 = (MAX_FIRST_STEP/2) pel, // 2 = (MAX_FIRST_STEP/4) pel... - const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; + const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; const int tot_steps = cfg->total_steps - search_param; - const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, - center_mv->col >> 3); + const int_mv fcenter_mv = + pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); @@ -109,8 +108,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const int what_stride = x->plane[0].src.stride; const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf + - ref_row * in_what_stride + ref_col; + const uint8_t *const in_what = + x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; // Work out the start point for the search const uint8_t *best_address = in_what; @@ -181,10 +180,9 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]); __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]); // Set the ones falling outside to zero - v_bo10_q = _mm_and_si128(v_bo10_q, - _mm_cvtepi32_epi64(v_inside_d)); - v_bo32_q = _mm_and_si128(v_bo32_q, - _mm_unpackhi_epi32(v_inside_d, v_inside_d)); + v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); + v_bo32_q = + _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); // Compute the candidate addresses v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); @@ -195,9 +193,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, - (const uint8_t **)&v_blocka[0], in_what_stride, - (uint32_t*)&v_sad_d); + fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { @@ -226,11 +223,10 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, // Now add in the joint cost { - const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w, - _mm_setzero_si128()); - const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d, - v_joint_cost_0_d, - v_sel_d); + const __m128i v_sel_d = + _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); + const __m128i v_joint_cost_d = + _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); } diff --git a/vp9/encoder/x86/vp9_error_intrin_avx2.c b/vp9/encoder/x86/vp9_error_intrin_avx2.c index dfebaab0a..453af2a40 100644 --- a/vp9/encoder/x86/vp9_error_intrin_avx2.c +++ b/vp9/encoder/x86/vp9_error_intrin_avx2.c @@ -13,10 +13,8 @@ #include "./vp9_rtcd.h" #include "vpx/vpx_integer.h" -int64_t vp9_block_error_avx2(const int16_t *coeff, - const int16_t *dqcoeff, - intptr_t block_size, - int64_t *ssz) { +int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; __m256i sse_reg_64hi, ssz_reg_64hi; @@ -29,7 +27,7 @@ int64_t vp9_block_error_avx2(const int16_t *coeff, sse_reg = _mm256_set1_epi16(0); ssz_reg = _mm256_set1_epi16(0); - for (i = 0 ; i < block_size ; i+= 16) { + for (i = 0; i < block_size; i += 16) { // load 32 bytes from coeff and dqcoeff coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i)); dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i)); @@ -66,8 +64,8 @@ int64_t vp9_block_error_avx2(const int16_t *coeff, _mm256_extractf128_si256(ssz_reg, 1)); // store the results - _mm_storel_epi64((__m128i*)(&sse), sse_reg128); + _mm_storel_epi64((__m128i *)(&sse), sse_reg128); - _mm_storel_epi64((__m128i*)(ssz), ssz_reg128); + _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); return sse; } diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c index 23325d63b..fa2a6449b 100644 --- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -19,23 +19,22 @@ extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - int w, int h) { + uint8_t *dst, ptrdiff_t dst_stride, int w, + int h) { const __m128i mask = _mm_set1_epi16(0x00FF); const int max_width = w & ~15; int y; for (y = 0; y < h; ++y) { int x; for (x = 0; x < max_width; x += 16) { - const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0)); + const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0)); const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16)); const __m128i a_and = _mm_and_si128(a, mask); const __m128i b_and = _mm_and_si128(b, mask); const __m128i c = _mm_packus_epi16(a_and, b_and); _mm_storeu_si128((__m128i *)(dst + x), c); } - for (; x < w; ++x) - dst[x] = src[x * 2]; + for (; x < w; ++x) dst[x] = src[x * 2]; src += src_stride * 2; dst += dst_stride; } @@ -47,9 +46,8 @@ static INLINE __m128i filter(const __m128i *const a, const __m128i *const b, const __m128i *const g, const __m128i *const h) { const __m128i coeffs_ab = _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1); - const __m128i coeffs_cd = - _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, -19, 78, -19, - 78, -19, 78, -19); + const __m128i coeffs_cd = _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, + -19, 78, -19, 78, -19, 78, -19); const __m128i const64_x16 = _mm_set1_epi16(64); const __m128i ab = _mm_unpacklo_epi8(*a, *b); const __m128i cd = _mm_unpacklo_epi8(*c, *d); @@ -88,8 +86,8 @@ static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) { } static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - int dst_w, int dst_h) { + uint8_t *dst, ptrdiff_t dst_stride, int dst_w, + int dst_h) { dst_w /= 2; dst_h /= 2; { @@ -116,7 +114,7 @@ static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride, int x; eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w); for (x = 0; x < max_width; x += 8) { - const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x)); const __m128i B = _mm_loadl_epi64((const __m128i *)(tmp3 + x)); const __m128i AB = _mm_unpacklo_epi8(A, B); __m128i C, D, CD; @@ -179,23 +177,23 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, const int dst_uv_h = dst_h / 2; if (dst_w * 2 == src_w && dst_h * 2 == src_h) { - downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, - dst->y_buffer, dst->y_stride, dst_w, dst_h); - downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, - dst->u_buffer, dst->uv_stride, dst_uv_w, dst_uv_h); - downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, - dst->v_buffer, dst->uv_stride, dst_uv_w, dst_uv_h); + downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); vpx_extend_frame_borders(dst); } else if (dst_w == src_w * 2 && dst_h == src_h * 2) { // The upsample() supports widths up to 1920 * 2. If greater, fall back // to vp9_scale_and_extend_frame_c(). - if (dst_w/2 <= 1920) { - upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, - dst->y_buffer, dst->y_stride, dst_w, dst_h); - upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, - dst->u_buffer, dst->uv_stride, dst_uv_w, dst_uv_h); - upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, - dst->v_buffer, dst->uv_stride, dst_uv_w, dst_uv_h); + if (dst_w / 2 <= 1920) { + upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); vpx_extend_frame_borders(dst); } else { vp9_scale_and_extend_frame_c(src, dst); diff --git a/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c index c245ccafa..91f627c34 100644 --- a/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c @@ -23,41 +23,41 @@ int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff, const int shift = 2 * (bps - 8); const int rounding = shift > 0 ? 1 << (shift - 1) : 0; - for (i = 0; i < block_size; i+=8) { + for (i = 0; i < block_size; i += 8) { // Load the data into xmm registers - __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i)); - __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4)); - __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i)); - __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4)); + __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); // Check if any values require more than 15 bit max = _mm_set1_epi32(0x3fff); min = _mm_set1_epi32(0xffffc000); cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), - _mm_cmplt_epi32(mm_coeff, min)); + _mm_cmplt_epi32(mm_coeff, min)); cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), - _mm_cmplt_epi32(mm_coeff2, min)); + _mm_cmplt_epi32(mm_coeff2, min)); cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), - _mm_cmplt_epi32(mm_dqcoeff, min)); + _mm_cmplt_epi32(mm_dqcoeff, min)); cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), - _mm_cmplt_epi32(mm_dqcoeff2, min)); - test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1), - _mm_or_si128(cmp2, cmp3))); + _mm_cmplt_epi32(mm_dqcoeff2, min)); + test = _mm_movemask_epi8( + _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); if (!test) { - __m128i mm_diff, error_sse2, sqcoeff_sse2;; + __m128i mm_diff, error_sse2, sqcoeff_sse2; mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); - _mm_storeu_si128((__m128i*)temp, error_sse2); + _mm_storeu_si128((__m128i *)temp, error_sse2); error = error + temp[0] + temp[1] + temp[2] + temp[3]; - _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2); + _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; } else { for (j = 0; j < 8; j++) { const int64_t diff = coeff[i + j] - dqcoeff[i + j]; - error += diff * diff; + error += diff * diff; sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; } } diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 2071dfe3c..3f8ee5f24 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -14,14 +14,13 @@ #include "./vp9_rtcd.h" #include "vpx/vpx_integer.h" -void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t* zbin_ptr, - const int16_t* round_ptr, const int16_t* quant_ptr, - const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, - int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, - uint16_t* eob_ptr, - const int16_t* scan_ptr, - const int16_t* iscan_ptr) { +void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { __m128i zero; __m128i thr; int16_t nzflag; @@ -44,9 +43,9 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, // Setup global values { - round = _mm_load_si128((const __m128i*)round_ptr); - quant = _mm_load_si128((const __m128i*)quant_ptr); - dequant = _mm_load_si128((const __m128i*)dequant_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); } { @@ -54,8 +53,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; // Do DC and first 15 AC - coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); + coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); + coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -78,15 +77,15 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); } { @@ -99,8 +98,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -121,8 +120,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; - coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); + coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); + coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -133,7 +132,7 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, round); @@ -147,20 +146,20 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); } else { - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); } } @@ -174,8 +173,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -200,10 +199,10 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, } } else { do { - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; |