diff options
author | Yunqing Wang <yunqingwang@google.com> | 2015-07-20 16:20:06 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2015-07-20 16:20:07 +0000 |
commit | f65473c036a1270f360e1c1d323786257fae9f0e (patch) | |
tree | 551dbab897e8f5a20eaf413895d4851a0596446a /vp9/encoder | |
parent | b0e6811ace3425a5ca7951448eb939c357acfb85 (diff) | |
parent | 38f1fbbb759518356c6d8ef6f27bab4e6e263a1a (diff) | |
download | libvpx-f65473c036a1270f360e1c1d323786257fae9f0e.tar libvpx-f65473c036a1270f360e1c1d323786257fae9f0e.tar.gz libvpx-f65473c036a1270f360e1c1d323786257fae9f0e.tar.bz2 libvpx-f65473c036a1270f360e1c1d323786257fae9f0e.zip |
Merge "Migrate quantization functions from vp9/ to vpx_dsp/"
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/vp9_encodemb.c | 2 |
-rw-r--r-- | vp9/encoder/vp9_quantize.c | 327 |
-rw-r--r-- | vp9/encoder/vp9_quantize.h | 25 |
-rw-r--r-- | vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c | 179 |
-rw-r--r-- | vp9/encoder/x86/vp9_quantize_sse2.c | 208 |
-rw-r--r-- | vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 200 |
6 files changed, 2 insertions, 939 deletions
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index f74cdd8aa..776934047 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/quantize.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -23,7 +24,6 @@ #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_encodemb.h" -#include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_tokenize.h" diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 32c1f7697..d53d95d29 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -9,7 +9,7 @@ */ #include <math.h> - +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -20,113 +20,6 @@ #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" -void vp9_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp, eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; - if (tmp) - eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { - int 
eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[0]; - const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; - if (abs_qcoeff) - eob = 0; - } - *eob_ptr = eob + 1; -} -#endif - -void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { - const int n_coeffs = 1024; - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp, eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), - INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; - if (tmp) - eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, - int skip_block, - const int16_t *round_ptr, - const int16_t quant, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr) { - const int n_coeffs = 1024; - int eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ 
coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); - const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; - if (abs_qcoeff) - eob = 0; - } - *eob_ptr = eob + 1; -} -#endif - void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -298,224 +191,6 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, } #endif -void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } - - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. 
- for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - - if (tmp) - eob = i; - } - } - } - *eob_ptr = eob + 1; -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } - - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. 
- for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) - eob = i; - } - } - } - *eob_ptr = eob + 1; -} -#endif - -void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1)}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; - - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. 
- for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> 15; - - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - - if (tmp) - eob = idx_arr[i]; - } - } - *eob_ptr = eob + 1; -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1)}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; - - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. 
- for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff - + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) - eob = idx_arr[i]; - } - } - *eob_ptr = eob + 1; -} -#endif - void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan) { MACROBLOCKD *const xd = &x->e_mbd; diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 55e546944..61320361b 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -37,34 +37,9 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); } QUANTS; -void vp9_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan); -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, - int skip_block, - 
const int16_t *round_ptr, - const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr); -#endif - struct VP9_COMP; struct VP9Common; diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c deleted file mode 100644 index 0174cfeca..000000000 --- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <emmintrin.h> - -#include "vpx_ports/mem.h" -#include "vp9/common/vp9_common.h" - -#if CONFIG_VP9_HIGHBITDEPTH -// from vp9_idct.h: typedef int32_t tran_low_t; -void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, - intptr_t count, - int skip_block, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, - const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; - __m128i zbins[2]; - __m128i nzbins[2]; - - zbins[0] = _mm_set_epi32((int)zbin_ptr[1], - (int)zbin_ptr[1], - (int)zbin_ptr[1], - (int)zbin_ptr[0]); - zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); - - nzbins[0] = _mm_setzero_si128(); - nzbins[1] = _mm_setzero_si128(); - nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); - nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - - (void)scan; - - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan 
pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } - - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32( - _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i*)abs_coeff, coeffs); - _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) - eob_i = iscan[k] > eob_i ? 
iscan[k] : eob_i; - } - } - } - } - *eob_ptr = eob_i + 1; -} - - -void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, - int skip_block, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, - const int16_t *iscan) { - __m128i zbins[2]; - __m128i nzbins[2]; - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); - (void)scan; - zbins[0] = _mm_set_epi32(zbin1_tmp, - zbin1_tmp, - zbin1_tmp, - zbin0_tmp); - zbins[1] = _mm_set1_epi32(zbin1_tmp); - - nzbins[0] = _mm_setzero_si128(); - nzbins[1] = _mm_setzero_si128(); - nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); - nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) - idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) - idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) - idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) - idx_arr[idx++] = i * 4 + 3; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. 
- for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff - + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) - eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } - } - *eob_ptr = eob + 1; -} -#endif diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 71fdfd716..2071dfe3c 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -14,214 +14,6 @@ #include "./vp9_rtcd.h" #include "vpx/vpx_integer.h" -void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t* zbin_ptr, - const int16_t* round_ptr, const int16_t* quant_ptr, - const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, - int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, - uint16_t* eob_ptr, - const int16_t* scan_ptr, - const int16_t* iscan_ptr) { - __m128i zero; - (void)scan_ptr; - - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - { - __m128i coeff0, coeff1; - - // Setup global values - { - __m128i pw_1; - zbin = _mm_load_si128((const __m128i*)zbin_ptr); - round = _mm_load_si128((const __m128i*)round_ptr); - quant = _mm_load_si128((const __m128i*)quant_ptr); - pw_1 = _mm_set1_epi16(1); - zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i*)dequant_ptr); - shift = _mm_load_si128((const __m128i*)quant_shift_ptr); 
- } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - // Do DC and first 15 AC - coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } - 
- { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - - coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = 
_mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); - 
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; - } -} - void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t* zbin_ptr, const int16_t* round_ptr, const int16_t* quant_ptr, diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 449d52b22..ec2e87cb1 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -15,206 +15,6 @@ pw_1: times 8 dw 1 SECTION .text -; TODO(yunqingwang)fix quantize_b code for skip=1 case. -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - cmp dword skipm, 0 - jne .blank - - ; actual quantize loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant -%ifidn %1, b_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m0, m5 - paddw m1, m5 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - psubw m0, [pw_1] - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob - lea coeffq, [ coeffq+ncoeffq*2] - lea iscanq, [ iscanq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; 
m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin -%ifidn %1, b_32x32 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - or r6, r2 - jz .skip_iter -%endif - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, 
m12 - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - -%ifidn %1, b_32x32 - jmp .accumulate_eob -.skip_iter: - mova [qcoeffq+ncoeffq*2+ 0], m5 - mova [qcoeffq+ncoeffq*2+16], m5 - mova [dqcoeffq+ncoeffq*2+ 0], m5 - mova [dqcoeffq+ncoeffq*2+16], m5 - add ncoeffq, mmsize - jl .ac_only_loop -%endif - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - neg ncoeffq - pxor m7, m7 -.blank_loop: - mova [dqcoeffq+ncoeffq*2+ 0], m7 - mova [dqcoeffq+ncoeffq*2+16], m7 - mova [qcoeffq+ncoeffq*2+ 0], m7 - mova [qcoeffq+ncoeffq*2+16], m7 - add ncoeffq, mmsize - jl .blank_loop - mov word [eobq], 0 - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 - %macro QUANTIZE_FP 2 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ |