diff options
author | yuanhecai <yuanhecai@loongson.cn> | 2022-05-06 12:17:39 +0800 |
---|---|---|
committer | yuanhecai <yuanhecai@loongson.cn> | 2022-05-18 16:19:48 +0800 |
commit | 17959f9c94be43de57972052ebfdc40870170b0e (patch) | |
tree | 0cd212e949e60c8ab089840b0c10b47bdb07567f /vpx_dsp | |
parent | 1c39c625264fa64db1c573cbac1f3a4f24c660d3 (diff) | |
download | libvpx-17959f9c94be43de57972052ebfdc40870170b0e.tar libvpx-17959f9c94be43de57972052ebfdc40870170b0e.tar.gz libvpx-17959f9c94be43de57972052ebfdc40870170b0e.tar.bz2 libvpx-17959f9c94be43de57972052ebfdc40870170b0e.zip |
vp9[loongarch]: Optimize vpx_quantize_b/b_32x32
1. vpx_quantize_b_lsx
2. vpx_quantize_b_32x32_lsx
Bug: webm:1755
Change-Id: I476c8677a2c2aed7248e088e62c3777c9bed2adb
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/loongarch/quantize_lsx.c | 249 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 |
3 files changed, 252 insertions, 2 deletions
diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c
new file mode 100644
index 000000000..e3fbb9e9e
--- /dev/null
+++ b/vpx_dsp/loongarch/quantize_lsx.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#define CALCULATE_QCOEFF(coeff, coeff_abs, round, quant, shift, cmp_mask) \
+  ({ \
+    __m128i rounded, qcoeff; \
+ \
+    rounded = __lsx_vsadd_h(coeff_abs, round); /* saturating |coeff|+round */ \
+    qcoeff = __lsx_vmuh_h(rounded, quant); \
+    qcoeff = __lsx_vadd_h(rounded, qcoeff); \
+    qcoeff = __lsx_vmuh_h(qcoeff, shift); \
+    qcoeff = __lsx_vsigncov_h(coeff, qcoeff); /* reapply sign of coeff */ \
+    qcoeff = __lsx_vand_v(qcoeff, cmp_mask); /* zero lanes below zbin */ \
+ \
+    qcoeff; \
+  })
+
+#define CALCULATE_DQCOEFF_AND_STORE(qcoeff, dequant, dqcoeff) \
+  { \
+    __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); \
+    __lsx_vst(dqcoeff16, dqcoeff, 0); \
+  }
+
+#define CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff, dequant, dqcoeff) \
+  { \
+    __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; \
+    __m128i zero = __lsx_vldi(0); \
+    __m128i coeff = __lsx_vabsd_h(qcoeff, zero); \
+ \
+    __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); \
+    __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); \
+ \
+    low = __lsx_vmul_h(coeff, dequant); \
+    high = __lsx_vmuh_h(coeff, dequant); \
+    dqcoeff32_0 = __lsx_vilvl_h(high, low); /* widen products to 32 bit */ \
+    dqcoeff32_1 = __lsx_vilvh_h(high, low); \
+ \
+    dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); /* 32x32 halves dequant */ \
+    dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); \
+    dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); \
+    dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); \
+    res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); /* narrow to 16 bit */ \
+    __lsx_vst(res, dqcoeff, 0); \
+  }
+
+#define SCAN_FOR_EOB(coeff0, coeff1, zbin_mask0, zbin_mask1, scan, index, \
+                     zero) \
+  ({ \
+    __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); \
+    __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); \
+    __m128i scan0 = __lsx_vld(scan + index, 0); \
+    __m128i scan1 = __lsx_vld(scan + index + 8, 0); \
+    __m128i eob0, eob1, eob_max; \
+ \
+    scan0 = __lsx_vsub_h(scan0, zbin_mask0); \
+    scan1 = __lsx_vsub_h(scan1, zbin_mask1); \
+    eob0 = __lsx_vandn_v(zero_coeff0, scan0); /* keep scan of nonzero lanes */ \
+    eob1 = __lsx_vandn_v(zero_coeff1, scan1); \
+    eob_max = __lsx_vmax_h(eob0, eob1); \
+    eob_max; \
+  })
+
+#define ACCUMULATE_EOB(eob) \
+  ({ \
+    __m128i eob_shuffled; \
+    int16_t res_m; \
+ \
+    eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); /* horizontal max reduction */ \
+    eob = __lsx_vmax_h(eob, eob_shuffled); \
+    eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); \
+    eob = __lsx_vmax_h(eob, eob_shuffled); \
+    eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); \
+    eob = __lsx_vmax_h(eob, eob_shuffled); \
+    res_m = __lsx_vpickve2gr_h(eob, 1); \
+    res_m; \
+  })
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                        const int16_t *zbin_ptr, const int16_t *round_ptr,
+                        const int16_t *quant_ptr,
+                        const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+                        int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan) {
+  __m128i zero = __lsx_vldi(0);
+  int index = 16; /* first 16 coefficients handled before the loop */
+
+  __m128i zbin, round, quant, dequant, quant_shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob, eob0;
+
+  (void)scan;
+
+  zbin = __lsx_vld(zbin_ptr, 0);
+  round = __lsx_vld(round_ptr, 0);
+  quant = __lsx_vld(quant_ptr, 0);
+  dequant = __lsx_vld(dequant_ptr, 0);
+  quant_shift = __lsx_vld(quant_shift_ptr, 0);
+  // Handle one DC and first 15 AC.
+  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+  qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+  zbin = __lsx_vilvh_d(zbin, zbin); /* drop the DC lane: broadcast AC zbin */
+  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+  qcoeff0 =
+      CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+  round = __lsx_vilvh_d(round, round); /* likewise drop DC lanes below */
+  quant = __lsx_vilvh_d(quant, quant);
+  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+  qcoeff1 =
+      CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+  __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+  CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = __lsx_vilvh_d(dequant, dequant);
+  CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+  eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  // AC only loop.
+  while (index < n_coeffs) { /* NOTE(review): assumes n_coeffs % 16 == 0 */
+    coeff0 = __lsx_vld(coeff_ptr + index, 0);
+    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+    qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+    qcoeff0 =
+        CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+    qcoeff1 =
+        CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+    CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr + index);
+    CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+    eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = __lsx_vmax_h(eob, eob0);
+
+    index += 16;
+  }
+
+  *eob_ptr = ACCUMULATE_EOB(eob);
+}
+
+void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan) {
+  __m128i zero = __lsx_vldi(0);
+  int index;
+
+  __m128i zbin, round, quant, dequant, quant_shift;
+  __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1;
+  __m128i eob = zero, eob0;
+
+  (void)scan;
+  (void)n_coeffs; /* block size is fixed at 32x32 */
+
+  zbin = __lsx_vld(zbin_ptr, 0);
+  zbin = __lsx_vsrari_h(zbin, 1); /* 32x32 uses rounded-halved zbin/round */
+  round = __lsx_vld(round_ptr, 0);
+  round = __lsx_vsrari_h(round, 1);
+
+  quant = __lsx_vld(quant_ptr, 0);
+  dequant = __lsx_vld(dequant_ptr, 0);
+  quant_shift = __lsx_vld(quant_shift_ptr, 0);
+  quant_shift = __lsx_vslli_h(quant_shift, 1); /* and a doubled quant_shift */
+  // Handle one DC and first 15 AC.
+  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+  qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+  // remove DC from zbin
+  zbin = __lsx_vilvh_d(zbin, zbin);
+  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+  qcoeff0 =
+      CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+  // remove DC in quant_shift, quant, quant_shift
+  round = __lsx_vilvh_d(round, round);
+  quant = __lsx_vilvh_d(quant, quant);
+  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+  qcoeff1 =
+      CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+  __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+  CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = __lsx_vilvh_d(dequant, dequant);
+  CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
+  eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  // AC only loop.
+  for (index = 16; index < 32 * 32; index += 16) {
+    coeff0 = __lsx_vld(coeff_ptr + index, 0);
+    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+    qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+    qcoeff0 =
+        CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+    qcoeff1 =
+        CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+    CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
+    CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant,
+                                      dqcoeff_ptr + 8 + index);
+    eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = __lsx_vmax_h(eob, eob0);
+  }
+
+  *eob_ptr = ACCUMULATE_EOB(eob);
+}
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 4f5a7a190..13999af04 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -328,6 +328,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h
 DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c
 DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c
 DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
 endif
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index e82b487f1..d3c668f9a 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -711,10 +711,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/;
+  specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/;
 add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/;
+  specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/;
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; |