From 7b13d99b981057199d9def0c7a61554609aa4ad9 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 21 Aug 2017 11:14:39 -0700 Subject: quantize fp: ignore skip_block in x86 Change-Id: I1272917c49cf6e6710e52c36535b2fc8c8dced78 --- vp9/encoder/x86/vp9_quantize_sse2.c | 277 +++++++++++++------------- vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 23 --- 2 files changed, 134 insertions(+), 166 deletions(-) (limited to 'vp9') diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 4a2581a34..ca0ad4407 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include @@ -25,8 +26,12 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i zero; __m128i thr; int16_t nzflag; + __m128i eob; + __m128i round, quant, dequant; (void)scan_ptr; + (void)skip_block; + assert(!skip_block); coeff_ptr += n_coeffs; iscan_ptr += n_coeffs; @@ -35,40 +40,106 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; + { + __m128i coeff0, coeff1; + + // Setup global values { - __m128i coeff0, coeff1; + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + } - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + // Do DC and first 15 AC + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); + } - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); // Reinsert signs @@ -81,131 +152,51 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } - n_coeffs += 8 * 2; } - thr = _mm_srai_epi16(dequant, 1); - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; + if (nzflag) { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); } + n_coeffs += 8 * 2; + } - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); } } diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 1b88863f6..1d421f6ec 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -22,8 +22,6 @@ SECTION .text cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ qcoeff, dqcoeff, dequant, \ eob, scan, iscan - cmp dword skipm, 0 - jne .blank ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp @@ -173,27 +171,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ pextrw r6, m8, 0 mov [r2], r6 RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - - lea r0q, [r0q+ncoeffq*2] - lea r2q, [r2q+ncoeffq*2] - neg ncoeffq - pxor m7, m7 -.blank_loop: - STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq - STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8 - STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq - STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8 - add ncoeffq, mmsize - jl .blank_loop - mov word [r3q], 0 - RET %endmacro INIT_XMM ssse3 -- cgit v1.2.3