path: root/vp9/encoder/x86
author     clang-format <noreply@google.com>  2016-07-26 20:43:23 -0700
committer  James Zern <jzern@google.com>      2016-08-02 16:47:11 -0700
commit     e0cc52db3fc9b09c99d7bbee35153cf82964a860 (patch)
tree       4988f1d3a21056339e2ffbd7a3b3d52fab54cb6b /vp9/encoder/x86
parent     3a04c9c9c4c4935925f4c00dcc70610100c5e9dd (diff)
vp9/encoder: apply clang-format
Change-Id: I45d9fb4013f50766b24363a86365e8063e8954c2
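
The reformatting in this change is mechanical: the pointer '*' binds to the variable name, casts are spelled "(const __m128i *)" rather than "(const __m128i*)", over-long parameter lists are rewrapped, and a case whose body is a single statement collapses onto one line. A minimal, hypothetical C sketch of those rules (illustrative only, not code taken from the commit):

/* Hypothetical helpers, shown only to illustrate the clang-format rules
 * applied by this commit; they are not part of the change itself. */
#include <assert.h>
#include <emmintrin.h>
#include <stdint.h>

/* '*' binds to the name and the cast reads "(const __m128i *)"; before
 * formatting this was written "(const __m128i*)(input + row * stride)". */
static __m128i load_row(const int16_t *input, int stride, int row) {
  return _mm_load_si128((const __m128i *)(input + row * stride));
}

/* A case whose body is one statement plus break/return fits on a single
 * line; before formatting each case spanned three lines. */
static __m128i load_first_row(const int16_t *input, int stride, int tx_type) {
  switch (tx_type) {
    case 0: return load_row(input, stride, 0);
    default: assert(0); return _mm_setzero_si128();
  }
}
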
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r--  vp9/encoder/x86/vp9_dct_intrin_sse2.c                 153
-rw-r--r--  vp9/encoder/x86/vp9_dct_ssse3.c                        80
-rw-r--r--  vp9/encoder/x86/vp9_denoiser_sse2.c                   119
-rw-r--r--  vp9/encoder/x86/vp9_diamond_search_sad_avx.c           66
-rw-r--r--  vp9/encoder/x86/vp9_error_intrin_avx2.c                12
-rw-r--r--  vp9/encoder/x86/vp9_frame_scale_ssse3.c                46
-rw-r--r--  vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c   30
-rw-r--r--  vp9/encoder/x86/vp9_quantize_sse2.c                    71
8 files changed, 269 insertions, 308 deletions
diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index fa37b6fed..0712779b7 100644
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -78,8 +78,8 @@ static void fdct4_sse2(__m128i *in) {
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[4], v[4];
- u[0]=_mm_unpacklo_epi16(in[0], in[1]);
- u[1]=_mm_unpacklo_epi16(in[3], in[2]);
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[3], in[2]);
v[0] = _mm_add_epi16(u[0], u[1]);
v[1] = _mm_sub_epi16(u[0], u[1]);
@@ -151,14 +151,12 @@ static void fadst4_sse2(__m128i *in) {
transpose_4x4(in);
}
-void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output,
- int stride, int tx_type) {
+void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
__m128i in[4];
switch (tx_type) {
- case DCT_DCT:
- vpx_fdct4x4_sse2(input, output, stride);
- break;
+ case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break;
case ADST_DCT:
load_buffer_4x4(input, in, stride);
fadst4_sse2(in);
@@ -177,21 +175,18 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output,
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
- default:
- assert(0);
- break;
+ default: assert(0); break;
}
}
void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
- int16_t* coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t* zbin_ptr,
- const int16_t* round_ptr, const int16_t* quant_ptr,
- const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
- int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- uint16_t* eob_ptr,
- const int16_t* scan_ptr,
- const int16_t* iscan_ptr) {
+ int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
__m128i zero;
int pass;
// Constants
@@ -208,14 +203,14 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
// Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
__m128i *in[8];
int index = 0;
@@ -469,9 +464,9 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
// Setup global values
{
- round = _mm_load_si128((const __m128i*)round_ptr);
- quant = _mm_load_si128((const __m128i*)quant_ptr);
- dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
}
{
@@ -503,15 +498,15 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
}
{
@@ -524,8 +519,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
// Add one to convert from indices to counts
iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
@@ -568,14 +563,14 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
}
{
@@ -588,8 +583,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
// Add one to convert from indices to counts
iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
@@ -615,10 +610,10 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
}
} else {
do {
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
n_coeffs += 8 * 2;
} while (n_coeffs < 0);
*eob_ptr = 0;
@@ -628,14 +623,14 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
int stride) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
in[0] = _mm_slli_epi16(in[0], 2);
in[1] = _mm_slli_epi16(in[1], 2);
@@ -930,14 +925,14 @@ static void fadst8_sse2(__m128i *in) {
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
// properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
// column transformation
// stage 1
@@ -1135,14 +1130,12 @@ static void fadst8_sse2(__m128i *in) {
array_transpose_8x8(in, in);
}
-void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output,
- int stride, int tx_type) {
+void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
__m128i in[8];
switch (tx_type) {
- case DCT_DCT:
- vpx_fdct8x8_sse2(input, output, stride);
- break;
+ case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break;
case ADST_DCT:
load_buffer_8x8(input, in, stride);
fadst8_sse2(in);
@@ -1164,13 +1157,11 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output,
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
- default:
- assert(0);
- break;
+ default: assert(0); break;
}
}
-static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
__m128i *in1, int stride) {
// load first 8 columns
load_buffer_8x8(input, in0, stride);
@@ -1530,13 +1521,13 @@ static void fdct16_8col(__m128i *in) {
v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
- in[1] = _mm_packs_epi32(v[0], v[1]);
- in[9] = _mm_packs_epi32(v[2], v[3]);
- in[5] = _mm_packs_epi32(v[4], v[5]);
+ in[1] = _mm_packs_epi32(v[0], v[1]);
+ in[9] = _mm_packs_epi32(v[2], v[3]);
+ in[5] = _mm_packs_epi32(v[4], v[5]);
in[13] = _mm_packs_epi32(v[6], v[7]);
- in[3] = _mm_packs_epi32(v[8], v[9]);
+ in[3] = _mm_packs_epi32(v[8], v[9]);
in[11] = _mm_packs_epi32(v[10], v[11]);
- in[7] = _mm_packs_epi32(v[12], v[13]);
+ in[7] = _mm_packs_epi32(v[12], v[13]);
in[15] = _mm_packs_epi32(v[14], v[15]);
}
@@ -2022,14 +2013,12 @@ static void fadst16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
}
-void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,
- int stride, int tx_type) {
+void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
__m128i in0[16], in1[16];
switch (tx_type) {
- case DCT_DCT:
- vpx_fdct16x16_sse2(input, output, stride);
- break;
+ case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break;
case ADST_DCT:
load_buffer_16x16(input, in0, in1, stride);
fadst16_sse2(in0, in1);
@@ -2051,8 +2040,6 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
- default:
- assert(0);
- break;
+ default: assert(0); break;
}
}
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index 1a1d4eabc..fb2a92541 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -15,16 +15,12 @@
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
- int16_t* coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t* zbin_ptr,
- const int16_t* round_ptr, const int16_t* quant_ptr,
- const int16_t* quant_shift_ptr,
- int16_t* qcoeff_ptr,
- int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- uint16_t* eob_ptr,
- const int16_t* scan_ptr,
- const int16_t* iscan_ptr) {
+void vp9_fdct8x8_quant_ssse3(
+ const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
__m128i zero;
int pass;
// Constants
@@ -42,14 +38,14 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
// Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
__m128i *in[8];
int index = 0;
@@ -298,9 +294,9 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
// Setup global values
{
- round = _mm_load_si128((const __m128i*)round_ptr);
- quant = _mm_load_si128((const __m128i*)quant_ptr);
- dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
}
{
@@ -332,15 +328,15 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
}
{
@@ -353,8 +349,8 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
// Add one to convert from indices to counts
iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
@@ -388,7 +384,7 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
- _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
if (nzflag) {
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
@@ -402,20 +398,20 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
} else {
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
}
}
@@ -429,8 +425,8 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
// Add one to convert from indices to counts
iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
@@ -456,10 +452,10 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
}
} else {
do {
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
n_coeffs += 8 * 2;
} while (n_coeffs < 0);
*eob_ptr = 0;
diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c
index 883507af3..91d0602f9 100644
--- a/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -37,17 +37,11 @@ static INLINE int sum_diff_16x1(__m128i acc_diff) {
}
// Denoise a 16x1 vector.
-static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
- const uint8_t *mc_running_avg_y,
- uint8_t *running_avg_y,
- const __m128i *k_0,
- const __m128i *k_4,
- const __m128i *k_8,
- const __m128i *k_16,
- const __m128i *l3,
- const __m128i *l32,
- const __m128i *l21,
- __m128i acc_diff) {
+static INLINE __m128i vp9_denoiser_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
+ const __m128i *k_16, const __m128i *l3, const __m128i *l32,
+ const __m128i *l21, __m128i acc_diff) {
// Calculate differences
const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
const __m128i v_mc_running_avg_y =
@@ -69,7 +63,7 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
__m128i adj2 = _mm_and_si128(mask2, *l32);
const __m128i adj1 = _mm_and_si128(mask1, *l21);
const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
- __m128i adj, padj, nadj;
+ __m128i adj, padj, nadj;
// Combine the adjustments and get absolute adjustments.
adj2 = _mm_add_epi8(adj2, adj1);
@@ -95,9 +89,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
// Denoise a 16x1 vector with a weaker filter.
static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
- const uint8_t *sig, const uint8_t *mc_running_avg_y,
- uint8_t *running_avg_y, const __m128i k_0,
- const __m128i k_delta, __m128i acc_diff) {
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
__m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
// Calculate differences.
const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
@@ -108,8 +101,7 @@ static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
// Obtain the sign. FF if diff is negative.
const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
// Clamp absolute difference to delta to get the adjustment.
- const __m128i adj =
- _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+ const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
// Restore the sign and get positive and negative adjustments.
__m128i padj, nadj;
padj = _mm_andnot_si128(diff_sign, adj);
@@ -126,14 +118,17 @@ static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
}
// Denoise 8x8 and 8x16 blocks.
-static int vp9_denoiser_NxM_sse2_small(
- const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
- int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
- int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) {
+static int vp9_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude, int width) {
int sum_diff_thresh, r, sum_diff = 0;
- const int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 1 : 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
__m128i acc_diff = _mm_setzero_si128();
const __m128i k_0 = _mm_setzero_si128();
@@ -153,15 +148,13 @@ static int vp9_denoiser_NxM_sse2_small(
memcpy(sig_buffer[r], sig, width);
memcpy(sig_buffer[r] + width, sig + sig_stride, width);
memcpy(mc_running_buffer[r], mc_running_avg_y, width);
- memcpy(mc_running_buffer[r] + width,
- mc_running_avg_y + mc_avg_y_stride, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
memcpy(running_buffer[r], running_avg_y, width);
memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
- acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
- mc_running_buffer[r],
- running_buffer[r],
- &k_0, &k_4, &k_8, &k_16,
- &l3, &l32, &l21, acc_diff);
+ acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r],
+ running_buffer[r], &k_0, &k_4, &k_8,
+ &k_16, &l3, &l32, &l21, acc_diff);
memcpy(running_avg_y, running_buffer[r], width);
memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
// Update pointers for next iteration.
@@ -184,19 +177,19 @@ static int vp9_denoiser_NxM_sse2_small(
// The delta is set by the excess of absolute pixel diff over the
// threshold.
- const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
- num_pels_log2_lookup[bs]) + 1;
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
const __m128i k_delta = _mm_set1_epi8(delta);
running_avg_y -= avg_y_stride * (b_height << 1);
for (r = 0; r < b_height; ++r) {
acc_diff = vp9_denoiser_adj_16x1_sse2(
- sig_buffer[r], mc_running_buffer[r], running_buffer[r],
- k_0, k_delta, acc_diff);
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0,
+ k_delta, acc_diff);
memcpy(running_avg_y, running_buffer[r], width);
- memcpy(running_avg_y + avg_y_stride,
- running_buffer[r] + width, width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width,
+ width);
// Update pointers for next iteration.
running_avg_y += (avg_y_stride << 1);
}
@@ -216,14 +209,14 @@ static int vp9_denoiser_NxM_sse2_small(
static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
const uint8_t *mc_running_avg_y,
int mc_avg_y_stride,
- uint8_t *running_avg_y,
- int avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
int increase_denoising, BLOCK_SIZE bs,
int motion_magnitude) {
int sum_diff_thresh, r, c, sum_diff = 0;
- const int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 1 : 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
__m128i acc_diff[4][4];
const __m128i k_0 = _mm_setzero_si128();
const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
@@ -248,9 +241,9 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
for (r = 0; r < b_height; ++r) {
for (c = 0; c < b_width_shift4; ++c) {
- acc_diff[c][r>>4] = vp9_denoiser_16x1_sse2(
- sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
- &k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r>>4]);
+ acc_diff[c][r >> 4] = vp9_denoiser_16x1_sse2(
+ sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3,
+ &l32, &l21, acc_diff[c][r >> 4]);
// Update pointers for next iteration.
sig += 16;
mc_running_avg_y += 16;
@@ -259,7 +252,7 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
for (c = 0; c < b_width_shift4; ++c) {
- sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
}
}
@@ -272,8 +265,8 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
{
sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
if (abs(sum_diff) > sum_diff_thresh) {
- const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
- num_pels_log2_lookup[bs]) + 1;
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
@@ -284,9 +277,9 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
sum_diff = 0;
for (r = 0; r < b_height; ++r) {
for (c = 0; c < b_width_shift4; ++c) {
- acc_diff[c][r>>4] = vp9_denoiser_adj_16x1_sse2(
- sig, mc_running_avg_y, running_avg_y, k_0,
- k_delta, acc_diff[c][r>>4]);
+ acc_diff[c][r >> 4] =
+ vp9_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
+ k_0, k_delta, acc_diff[c][r >> 4]);
// Update pointers for next iteration.
sig += 16;
mc_running_avg_y += 16;
@@ -295,7 +288,7 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
for (c = 0; c < b_width_shift4; ++c) {
- sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
}
}
sig = sig - b_width + sig_stride;
@@ -314,27 +307,21 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
}
int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
- const uint8_t *mc_avg,
- int mc_avg_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
uint8_t *avg, int avg_stride,
- int increase_denoising,
- BLOCK_SIZE bs,
+ int increase_denoising, BLOCK_SIZE bs,
int motion_magnitude) {
// Rank by frequency of the block type to have an early termination.
if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
bs == BLOCK_32X64 || bs == BLOCK_64X32) {
- return vp9_denoiser_NxM_sse2_big(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude);
+ return vp9_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+ avg, avg_stride, increase_denoising, bs,
+ motion_magnitude);
} else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
- return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude, 8);
+ return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+ avg, avg_stride, increase_denoising, bs,
+ motion_magnitude, 8);
} else {
return COPY_BLOCK;
}
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
index cd3e87ec8..36bcf1536 100644
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -9,7 +9,7 @@
*/
#if defined(_MSC_VER)
-# include <intrin.h>
+#include <intrin.h>
#endif
#include <emmintrin.h>
#include <smmintrin.h>
@@ -19,11 +19,11 @@
#include "vpx_ports/mem.h"
#ifdef __GNUC__
-# define LIKELY(v) __builtin_expect(v, 1)
-# define UNLIKELY(v) __builtin_expect(v, 0)
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
#else
-# define LIKELY(v) (v)
-# define UNLIKELY(v) (v)
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
#endif
static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
@@ -40,19 +40,19 @@ static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
return mv.as_int == 0 ? 0 : 1;
}
-static INLINE int mv_cost(const int_mv mv,
- const int *joint_cost, int *const comp_cost[2]) {
- return joint_cost[get_mv_joint(mv)] +
- comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col];
+static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
+ int *const comp_cost[2]) {
+ return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
+ comp_cost[1][mv.as_mv.col];
}
static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
int sad_per_bit) {
- const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row,
- mv.as_mv.col - ref->col);
- return ROUND_POWER_OF_TWO((unsigned)mv_cost(diff, x->nmvjointsadcost,
- x->nmvsadcost) *
- sad_per_bit, VP9_PROB_COST_SHIFT);
+ const int_mv diff =
+ pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
+ return ROUND_POWER_OF_TWO(
+ (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
+ VP9_PROB_COST_SHIFT);
}
/*****************************************************************************
@@ -71,10 +71,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
* which does not rely on these properties. *
*****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
- const search_site_config *cfg,
- MV *ref_mv, MV *best_mv, int search_param,
- int sad_per_bit, int *num00,
- const vp9_variance_fn_ptr_t *fn_ptr,
+ const search_site_config *cfg, MV *ref_mv,
+ MV *best_mv, int search_param, int sad_per_bit,
+ int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
const MV *center_mv) {
const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
@@ -91,12 +90,12 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
// 0 = initial step (MAX_FIRST_STEP) pel
// 1 = (MAX_FIRST_STEP/2) pel,
// 2 = (MAX_FIRST_STEP/4) pel...
- const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+ const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
const int tot_steps = cfg->total_steps - search_param;
- const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
- center_mv->col >> 3);
+ const int_mv fcenter_mv =
+ pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
@@ -109,8 +108,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
const int what_stride = x->plane[0].src.stride;
const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
const uint8_t *const what = x->plane[0].src.buf;
- const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
- ref_row * in_what_stride + ref_col;
+ const uint8_t *const in_what =
+ x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
// Work out the start point for the search
const uint8_t *best_address = in_what;
@@ -181,10 +180,9 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
__m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]);
__m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]);
// Set the ones falling outside to zero
- v_bo10_q = _mm_and_si128(v_bo10_q,
- _mm_cvtepi32_epi64(v_inside_d));
- v_bo32_q = _mm_and_si128(v_bo32_q,
- _mm_unpackhi_epi32(v_inside_d, v_inside_d));
+ v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
+ v_bo32_q =
+ _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d));
// Compute the candidate addresses
v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
@@ -195,9 +193,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
#endif
}
- fn_ptr->sdx4df(what, what_stride,
- (const uint8_t **)&v_blocka[0], in_what_stride,
- (uint32_t*)&v_sad_d);
+ fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+ in_what_stride, (uint32_t *)&v_sad_d);
// Look up the component cost of the residual motion vector
{
@@ -226,11 +223,10 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
// Now add in the joint cost
{
- const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
- _mm_setzero_si128());
- const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
- v_joint_cost_0_d,
- v_sel_d);
+ const __m128i v_sel_d =
+ _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128());
+ const __m128i v_joint_cost_d =
+ _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d);
v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
}
diff --git a/vp9/encoder/x86/vp9_error_intrin_avx2.c b/vp9/encoder/x86/vp9_error_intrin_avx2.c
index dfebaab0a..453af2a40 100644
--- a/vp9/encoder/x86/vp9_error_intrin_avx2.c
+++ b/vp9/encoder/x86/vp9_error_intrin_avx2.c
@@ -13,10 +13,8 @@
#include "./vp9_rtcd.h"
#include "vpx/vpx_integer.h"
-int64_t vp9_block_error_avx2(const int16_t *coeff,
- const int16_t *dqcoeff,
- intptr_t block_size,
- int64_t *ssz) {
+int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
__m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
__m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
__m256i sse_reg_64hi, ssz_reg_64hi;
@@ -29,7 +27,7 @@ int64_t vp9_block_error_avx2(const int16_t *coeff,
sse_reg = _mm256_set1_epi16(0);
ssz_reg = _mm256_set1_epi16(0);
- for (i = 0 ; i < block_size ; i+= 16) {
+ for (i = 0; i < block_size; i += 16) {
// load 32 bytes from coeff and dqcoeff
coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
@@ -66,8 +64,8 @@ int64_t vp9_block_error_avx2(const int16_t *coeff,
_mm256_extractf128_si256(ssz_reg, 1));
// store the results
- _mm_storel_epi64((__m128i*)(&sse), sse_reg128);
+ _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
- _mm_storel_epi64((__m128i*)(ssz), ssz_reg128);
+ _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
return sse;
}
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 23325d63b..fa2a6449b 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -19,23 +19,22 @@ extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- int w, int h) {
+ uint8_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
const __m128i mask = _mm_set1_epi16(0x00FF);
const int max_width = w & ~15;
int y;
for (y = 0; y < h; ++y) {
int x;
for (x = 0; x < max_width; x += 16) {
- const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0));
+ const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0));
const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16));
const __m128i a_and = _mm_and_si128(a, mask);
const __m128i b_and = _mm_and_si128(b, mask);
const __m128i c = _mm_packus_epi16(a_and, b_and);
_mm_storeu_si128((__m128i *)(dst + x), c);
}
- for (; x < w; ++x)
- dst[x] = src[x * 2];
+ for (; x < w; ++x) dst[x] = src[x * 2];
src += src_stride * 2;
dst += dst_stride;
}
@@ -47,9 +46,8 @@ static INLINE __m128i filter(const __m128i *const a, const __m128i *const b,
const __m128i *const g, const __m128i *const h) {
const __m128i coeffs_ab =
_mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1);
- const __m128i coeffs_cd =
- _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, -19, 78, -19,
- 78, -19, 78, -19);
+ const __m128i coeffs_cd = _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78,
+ -19, 78, -19, 78, -19, 78, -19);
const __m128i const64_x16 = _mm_set1_epi16(64);
const __m128i ab = _mm_unpacklo_epi8(*a, *b);
const __m128i cd = _mm_unpacklo_epi8(*c, *d);
@@ -88,8 +86,8 @@ static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) {
}
static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- int dst_w, int dst_h) {
+ uint8_t *dst, ptrdiff_t dst_stride, int dst_w,
+ int dst_h) {
dst_w /= 2;
dst_h /= 2;
{
@@ -116,7 +114,7 @@ static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
int x;
eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w);
for (x = 0; x < max_width; x += 8) {
- const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x));
+ const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x));
const __m128i B = _mm_loadl_epi64((const __m128i *)(tmp3 + x));
const __m128i AB = _mm_unpacklo_epi8(A, B);
__m128i C, D, CD;
@@ -179,23 +177,23 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
const int dst_uv_h = dst_h / 2;
if (dst_w * 2 == src_w && dst_h * 2 == src_h) {
- downsample_2_to_1_ssse3(src->y_buffer, src->y_stride,
- dst->y_buffer, dst->y_stride, dst_w, dst_h);
- downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride,
- dst->u_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
- downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride,
- dst->v_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+ downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
vpx_extend_frame_borders(dst);
} else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
// The upsample() supports widths up to 1920 * 2. If greater, fall back
// to vp9_scale_and_extend_frame_c().
- if (dst_w/2 <= 1920) {
- upsample_1_to_2_ssse3(src->y_buffer, src->y_stride,
- dst->y_buffer, dst->y_stride, dst_w, dst_h);
- upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride,
- dst->u_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
- upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride,
- dst->v_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+ if (dst_w / 2 <= 1920) {
+ upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
vpx_extend_frame_borders(dst);
} else {
vp9_scale_and_extend_frame_c(src, dst);
diff --git a/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
index c245ccafa..91f627c34 100644
--- a/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -23,41 +23,41 @@ int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
const int shift = 2 * (bps - 8);
const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
- for (i = 0; i < block_size; i+=8) {
+ for (i = 0; i < block_size; i += 8) {
// Load the data into xmm registers
- __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
- __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
- __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
- __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
+ __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
+ __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
+ __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
+ __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
// Check if any values require more than 15 bit
max = _mm_set1_epi32(0x3fff);
min = _mm_set1_epi32(0xffffc000);
cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
- _mm_cmplt_epi32(mm_coeff, min));
+ _mm_cmplt_epi32(mm_coeff, min));
cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
- _mm_cmplt_epi32(mm_coeff2, min));
+ _mm_cmplt_epi32(mm_coeff2, min));
cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
- _mm_cmplt_epi32(mm_dqcoeff, min));
+ _mm_cmplt_epi32(mm_dqcoeff, min));
cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
- _mm_cmplt_epi32(mm_dqcoeff2, min));
- test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
- _mm_or_si128(cmp2, cmp3)));
+ _mm_cmplt_epi32(mm_dqcoeff2, min));
+ test = _mm_movemask_epi8(
+ _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));
if (!test) {
- __m128i mm_diff, error_sse2, sqcoeff_sse2;;
+ __m128i mm_diff, error_sse2, sqcoeff_sse2;
mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
- _mm_storeu_si128((__m128i*)temp, error_sse2);
+ _mm_storeu_si128((__m128i *)temp, error_sse2);
error = error + temp[0] + temp[1] + temp[2] + temp[3];
- _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
+ _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
} else {
for (j = 0; j < 8; j++) {
const int64_t diff = coeff[i + j] - dqcoeff[i + j];
- error += diff * diff;
+ error += diff * diff;
sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
}
}
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index 2071dfe3c..3f8ee5f24 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -14,14 +14,13 @@
#include "./vp9_rtcd.h"
#include "vpx/vpx_integer.h"
-void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t* zbin_ptr,
- const int16_t* round_ptr, const int16_t* quant_ptr,
- const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
- int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- uint16_t* eob_ptr,
- const int16_t* scan_ptr,
- const int16_t* iscan_ptr) {
+void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
__m128i zero;
__m128i thr;
int16_t nzflag;
@@ -44,9 +43,9 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
// Setup global values
{
- round = _mm_load_si128((const __m128i*)round_ptr);
- quant = _mm_load_si128((const __m128i*)quant_ptr);
- dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
}
{
@@ -54,8 +53,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
__m128i qcoeff0, qcoeff1;
__m128i qtmp0, qtmp1;
// Do DC and first 15 AC
- coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
- coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+ coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -78,15 +77,15 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
}
{
@@ -99,8 +98,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
// Add one to convert from indices to counts
iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
@@ -121,8 +120,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
__m128i qcoeff0, qcoeff1;
__m128i qtmp0, qtmp1;
- coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
- coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+ coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -133,7 +132,7 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
- _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
if (nzflag) {
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
@@ -147,20 +146,20 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
} else {
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
}
}
@@ -174,8 +173,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
// Add one to convert from indices to counts
iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
@@ -200,10 +199,10 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
}
} else {
do {
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
n_coeffs += 8 * 2;
} while (n_coeffs < 0);
*eob_ptr = 0;