-rw-r--r--  test/quantize_test.cc                  |   7
-rw-r--r--  test/vp8_fdct4x4_test.cc               |   5
-rw-r--r--  test/vp9_subtract_test.cc              |   5
-rw-r--r--  vp8/common/rtcd_defs.pl                |   8
-rw-r--r--  vp8/encoder/loongarch/dct_lsx.c        |  99
-rw-r--r--  vp8/encoder/loongarch/encodeopt_lsx.c  |  82
-rw-r--r--  vp8/encoder/loongarch/quantize_lsx.c   | 145
-rw-r--r--  vp8/vp8cx.mk                           |   5
-rw-r--r--  vpx_dsp/loongarch/subtract_lsx.c       | 371
-rw-r--r--  vpx_dsp/vpx_dsp.mk                     |   2
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl           |   2
11 files changed, 726 insertions, 5 deletions
diff --git a/test/quantize_test.cc b/test/quantize_test.cc
index 792b21432..57309e810 100644
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -224,4 +224,11 @@ INSTANTIATE_TEST_SUITE_P(
     make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c),
     make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c)));
 #endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, QuantizeTest,
+    ::testing::Values(make_tuple(&vp8_regular_quantize_b_lsx,
+                                 &vp8_regular_quantize_b_c)));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc
index 3e4305be7..1b73a72a0 100644
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -203,4 +203,9 @@ INSTANTIATE_TEST_SUITE_P(MSA, FdctTest,
 INSTANTIATE_TEST_SUITE_P(MMI, FdctTest,
                          ::testing::Values(vp8_short_fdct4x4_mmi));
 #endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, FdctTest,
+                         ::testing::Values(vp8_short_fdct4x4_lsx));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index ef8cc207d..211cc6c7a 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -152,4 +152,9 @@ INSTANTIATE_TEST_SUITE_P(VSX, VP9SubtractBlockTest,
                          ::testing::Values(vpx_subtract_block_vsx));
 #endif
 
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_lsx));
+#endif
+
 }  // namespace vp9
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index e4b40fa9e..4f45d2ab9 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -184,7 +184,7 @@ specialize qw/vp8_copy32xn sse2 sse3/;
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/;
+specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
 specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/;
@@ -196,7 +196,7 @@ specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi lsx/;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/;
@@ -205,10 +205,10 @@ specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/;
 # Block subtraction
 #
 add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
-specialize qw/vp8_block_error sse2 msa/;
+specialize qw/vp8_block_error sse2 msa lsx/;
 
 add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
-specialize qw/vp8_mbblock_error sse2 msa/;
+specialize qw/vp8_mbblock_error sse2 msa lsx/;
 
 add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
 specialize qw/vp8_mbuverror sse2 msa/;
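Note: adding lsx to a specialize line is all the RTCD (run-time CPU detection) generator needs to route calls to the new kernels. The sketch below paraphrases the kind of dispatch code the generator emits into vp8_rtcd.h; HAS_LSX and loongarch_cpu_caps() are assumed names used for illustration, not part of this change.

/* Rough sketch of the generated RTCD dispatch (paraphrased, not from
 * this commit). The generator declares every specialization plus a
 * function pointer, then selects the best candidate at init time. */
void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
void vp8_short_fdct4x4_lsx(short *input, short *output, int pitch);
RTCD_EXTERN void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);

static void setup_rtcd_internal(void) {
  int flags = loongarch_cpu_caps(); /* assumed CPU-caps helper */
  vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
  if (flags & HAS_LSX) vp8_short_fdct4x4 = vp8_short_fdct4x4_lsx;
}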
diff --git a/vp8/encoder/loongarch/dct_lsx.c b/vp8/encoder/loongarch/dct_lsx.c
new file mode 100644
index 000000000..e090d2360
--- /dev/null
+++ b/vp8/encoder/loongarch/dct_lsx.c
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
+                                                                               \
+    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
+    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
+    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
+    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
+    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
+    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
+  }
+
+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)           \
+  {                                                                        \
+    __m128i tmp0_m, tmp1_m, tmp2_m;                                        \
+                                                                           \
+    tmp0_m = __lsx_vreplvei_h(coeff, val0);                                \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \
+    DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1,     \
+              const2);                                                     \
+  }
+
+#define RET_1_IF_NZERO_H(_in)           \
+  ({                                    \
+    __m128i tmp_m;                      \
+    __m128i one_m = __lsx_vldi(0x401);  \
+    __m128i max_m = __lsx_vldi(0xFF);   \
+                                        \
+    tmp_m = __lsx_vseqi_h(_in, 0);      \
+    tmp_m = __lsx_vxor_v(tmp_m, max_m); \
+    tmp_m = __lsx_vand_v(tmp_m, one_m); \
+                                        \
+    tmp_m;                              \
+  })
+
+void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
+  __m128i in0, in1, in2, in3;
+  __m128i tmp0, tmp1, tmp2, tmp3, const0, const1;
+  __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
+  __m128i out0, out1, out2, out3;
+  __m128i zero = __lsx_vldi(0);
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
+  in3 = __lsx_vldx(input, pitch3);
+
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
+  DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1,
+            in3);
+  in0 = __lsx_vadd_h(tmp0, tmp1);
+  in2 = __lsx_vsub_h(tmp0, tmp1);
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
+  tmp0 = __lsx_vilvl_h(in3, in1);
+  in1 = __lsx_vreplvei_h(coeff, 3);
+  out0 = __lsx_vpackev_h(zero, in1);
+  coeff = __lsx_vilvl_h(zero, coeff);
+  out1 = __lsx_vreplvei_w(coeff, 0);
+  DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0,
+            out1);
+  DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
+  tmp2 = __lsx_vadd_h(tmp0, tmp1);
+  tmp3 = __lsx_vsub_h(tmp0, tmp1);
+  DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2);
+  DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2);
+  DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2);
+  tmp1 = RET_1_IF_NZERO_H(in3);
+  DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0);
+  DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1);
+  out3 = __lsx_vadd_w(out3, out1);
+  out1 = __lsx_vreplvei_w(coeff, 1);
+  DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1,
+            out3);
+  DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3);
+  out1 = __lsx_vadd_w(out1, tmp1);
+  DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2);
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in2, output, 16);
+}
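Note: the RET_1_IF_NZERO_H macro above exists because VP8's forward DCT adds 1 to one second-pass output whenever the corresponding difference term is nonzero. A scalar model of one output column, paraphrased from memory of vp8/encoder/dct.c (check the tree for the authoritative constants), shows the term the macro vectorizes:

/* Scalar model of the fDCT's second (vertical) pass for one column,
 * paraphrased to show what the vector code above computes.
 * ip/op point into the 4x4 coefficient block, stride 4. */
static void fdct4_second_pass_col(const short *ip, short *op) {
  int a1 = ip[0] + ip[12];
  int b1 = ip[4] + ip[8];
  int c1 = ip[4] - ip[8];
  int d1 = ip[0] - ip[12];

  op[0] = (short)((a1 + b1 + 7) >> 4);
  op[8] = (short)((a1 - b1 + 7) >> 4);
  /* The "+ (d1 != 0)" term is the 0/1 mask RET_1_IF_NZERO_H builds
   * for all eight lanes at once. */
  op[4] = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
  op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
}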
diff --git a/vp8/encoder/loongarch/encodeopt_lsx.c b/vp8/encoder/loongarch/encodeopt_lsx.c
new file mode 100644
index 000000000..4ad4caba6
--- /dev/null
+++ b/vp8/encoder/loongarch/encodeopt_lsx.c
@@ -0,0 +1,82 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp8/encoder/block.h"
+
+int32_t vp8_block_error_lsx(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) {
+  int32_t err = 0;
+  __m128i dq_coeff0, dq_coeff1, coeff0, coeff1;
+  __m128i reg0, reg1, reg2, reg3, error;
+
+  DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, dq_coeff_ptr, 0,
+            dq_coeff_ptr, 16, coeff0, coeff1, dq_coeff0, dq_coeff1);
+  DUP2_ARG2(__lsx_vsubwev_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg0,
+            reg2);
+  DUP2_ARG2(__lsx_vsubwod_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg1,
+            reg3);
+  error = __lsx_vmul_w(reg0, reg0);
+  DUP2_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, error);
+  error = __lsx_vmadd_w(error, reg3, reg3);
+  error = __lsx_vhaddw_d_w(error, error);
+  err = __lsx_vpickve2gr_w(error, 0);
+  err += __lsx_vpickve2gr_w(error, 2);
+  return err;
+}
+
+int32_t vp8_mbblock_error_lsx(MACROBLOCK *mb, int32_t dc) {
+  BLOCK *be;
+  BLOCKD *bd;
+  int16_t *coeff, *dq_coeff;
+  int32_t err = 0;
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, error;
+  __m128i mask0 = __lsx_vldi(0xFF);
+  __m128i zero = __lsx_vldi(0);
+
+  if (dc == 1) {
+    mask0 = __lsx_vinsgr2vr_w(mask0, 0, 0);
+  }
+
+  for (loop_cnt = 0; loop_cnt < 8; loop_cnt++) {
+    int32_t loop_tmp = loop_cnt << 1;
+    be = &mb->block[loop_tmp];
+    bd = &mb->e_mbd.block[loop_tmp];
+    coeff = be->coeff;
+    dq_coeff = bd->dqcoeff;
+    DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src0,
+              src1, tmp0, tmp1);
+    be = &mb->block[loop_tmp + 1];
+    bd = &mb->e_mbd.block[loop_tmp + 1];
+    coeff = be->coeff;
+    dq_coeff = bd->dqcoeff;
+    DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src2,
+              src3, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vsubwev_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vsubwod_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3,
+              reg1, reg3, reg5, reg7);
+    DUP2_ARG3(__lsx_vbitsel_v, zero, reg0, mask0, zero, reg4, mask0, reg0,
+              reg4);
+    error = __lsx_vmul_w(reg0, reg0);
+    DUP4_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, reg3,
+              reg3, error, reg4, reg4, error, error, error, error);
+    DUP2_ARG3(__lsx_vmadd_w, error, reg5, reg5, error, reg6, reg6, error,
+              error);
+    error = __lsx_vmadd_w(error, reg7, reg7);
+    error = __lsx_vhaddw_d_w(error, error);
+    error = __lsx_vhaddw_q_d(error, error);
+    err += __lsx_vpickve2gr_w(error, 0);
+  }
+  return err;
+}
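Note: both kernels compute a sum of squared differences between the original and dequantized coefficients; vp8_mbblock_error_lsx additionally masks out each block's DC lane (via __lsx_vbitsel_v with mask0) when dc == 1, since the DC terms are coded by the second-order transform. The scalar reference for the first, quoted from memory of vp8/encoder/encodemb.c, is simply:

/* Scalar reference for vp8_block_error (from memory; see
 * vp8/encoder/encodemb.c in the tree for the authoritative version). */
int vp8_block_error_c(short *coeff, short *dqcoeff) {
  int i;
  int error = 0;
  for (i = 0; i < 16; i++) {
    int this_diff = coeff[i] - dqcoeff[i];
    error += this_diff * this_diff;
  }
  return error;
}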
diff --git a/vp8/encoder/loongarch/quantize_lsx.c b/vp8/encoder/loongarch/quantize_lsx.c
new file mode 100644
index 000000000..75889192a
--- /dev/null
+++ b/vp8/encoder/loongarch/quantize_lsx.c
@@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp8/encoder/block.h"
+
+#define BOOST_QUANT1(_in0, _in1, _in2, _ui)               \
+  {                                                       \
+    if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \
+      if (__lsx_vpickve2gr_h(_in1, _ui)) {                \
+        eob = _ui;                                        \
+        boost_temp = zbin_boost;                          \
+      } else {                                            \
+        boost_temp++;                                     \
+      }                                                   \
+    } else {                                              \
+      _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui);             \
+      boost_temp++;                                       \
+    }                                                     \
+  }
+
+#define BOOST_QUANT2(_in0, _in1, _in2, _ui)               \
+  {                                                       \
+    if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \
+      if (__lsx_vpickve2gr_h(_in1, _ui)) {                \
+        eob = _ui + 8;                                    \
+        boost_temp = zbin_boost;                          \
+      } else {                                            \
+        boost_temp++;                                     \
+      }                                                   \
+    } else {                                              \
+      _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui);             \
+      boost_temp++;                                       \
+    }                                                     \
+  }
+
+static int8_t exact_regular_quantize_b_lsx(
+    int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round,
+    int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in,
+    int16_t *q_coeff, int16_t *dq_coeff) {
+  int32_t eob;
+  int16_t *boost_temp = zbin_boost;
+  __m128i inv_zig_zag = { 0x0C07040206050100, 0x0F0E0A090D0B0803 };
+  __m128i sign_z0, sign_z1, q_coeff0, q_coeff1;
+  __m128i z_bin0, z_bin1, zbin_o_q, x0, x1, sign_x0, sign_x1, de_quant0,
+      de_quant1;
+  __m128i z0, z1, round0, round1, quant0, quant2;
+  __m128i inv_zig_zag0, inv_zig_zag1;
+  __m128i zigzag_mask0 = { 0x0008000400010000, 0x0006000300020005 };
+  __m128i zigzag_mask1 = { 0x000A000D000C0009, 0X000F000E000B0007 };
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i zero = __lsx_vldi(0);
+
+  zbin_o_q = __lsx_vreplgr2vr_h(zbin_oq_in);
+  inv_zig_zag0 = __lsx_vilvl_b(zero, inv_zig_zag);
+  inv_zig_zag1 = __lsx_vilvh_b(zero, inv_zig_zag);
+  eob = -1;
+  DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, round, 0, round, 16, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0,
+            zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, z0, z1, round0,
+            round1);
+  DUP4_ARG2(__lsx_vld, quant, 0, quant, 16, zbin, 0, zbin, 16, tmp0, tmp1, tmp2,
+            tmp3);
+  DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0,
+            zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, quant0, quant2,
+            z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vsrai_h, z0, 15, z1, 15, sign_z0, sign_z1);
+  DUP2_ARG2(__lsx_vadda_h, z0, zero, z1, zero, x0, x1);
+  DUP2_ARG2(__lsx_vsub_h, x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vsub_h, z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vmulwev_w_h, quant0, round0, quant2, round1, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vmulwod_w_h, quant0, round0, quant2, round1, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2);
+  DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3);
+  DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, q_coeff0, q_coeff1);
+
+  DUP2_ARG2(__lsx_vld, quant_shift, 0, quant_shift, 16, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp3, tmp1, zigzag_mask1, tmp3, tmp1,
+            quant0, quant2);
+  DUP2_ARG2(__lsx_vadd_h, x0, round0, x1, round1, x0, x1);
+  DUP2_ARG2(__lsx_vmulwev_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vmulwod_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2);
+  DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3);
+  DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, x0, x1);
+  DUP2_ARG2(__lsx_vxor_v, x0, sign_z0, x1, sign_z1, sign_x0, sign_x1);
+  DUP2_ARG2(__lsx_vsub_h, sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
+
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 0);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 1);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 2);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 3);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 4);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 5);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 6);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 7);
+
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 0);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 1);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 2);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 3);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 4);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 5);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 6);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 7);
+
+  DUP2_ARG2(__lsx_vld, de_quant, 0, de_quant, 16, de_quant0, de_quant1);
+  DUP2_ARG3(__lsx_vshuf_h, inv_zig_zag0, sign_x1, sign_x0, inv_zig_zag1,
+            sign_x1, sign_x0, q_coeff0, q_coeff1);
+  DUP2_ARG2(__lsx_vmul_h, de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0,
+            de_quant1);
+  __lsx_vst(q_coeff0, q_coeff, 0);
+  __lsx_vst(q_coeff1, q_coeff, 16);
+  __lsx_vst(de_quant0, dq_coeff, 0);
+  __lsx_vst(de_quant1, dq_coeff, 16);
+
+  return (int8_t)(eob + 1);
+}
+
+void vp8_regular_quantize_b_lsx(BLOCK *b, BLOCKD *d) {
+  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+  int16_t *coeff_ptr = b->coeff;
+  int16_t *zbin_ptr = b->zbin;
+  int16_t *round_ptr = b->round;
+  int16_t *quant_ptr = b->quant;
+  int16_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  int16_t *dequant_ptr = d->dequant;
+  int16_t zbin_oq_value = b->zbin_extra;
+
+  *d->eob = exact_regular_quantize_b_lsx(
+      zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr,
+      quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr);
+}
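Note: BOOST_QUANT1/BOOST_QUANT2 walk the 16 coefficients in zig-zag order and replicate VP8's zbin/zero-run-boost rule: a coefficient survives only if its magnitude clears the zero bin plus a running boost that grows with the length of the preceding zero run and resets at each kept nonzero coefficient. A simplified scalar model of that decision loop (names are illustrative; the upstream scalar quantizer in vp8/encoder/quantize.c differs in detail):

#include <stdlib.h> /* abs() */

/* Simplified model of the decision that BOOST_QUANT1/2 vectorize.
 * z/x are in zig-zag order and x[] is assumed to already hold the
 * rounded quantized values, as in the LSX code above. */
static int regular_quantize_eob_model(const short *z, const short *zbin,
                                      const short *zbin_boost,
                                      short zbin_extra, short *x) {
  int rc, eob = -1;
  const short *boost = zbin_boost;
  for (rc = 0; rc < 16; rc++) {
    if (abs(z[rc]) >= zbin[rc] + *boost + zbin_extra) {
      if (x[rc]) {          /* kept a nonzero coefficient: */
        eob = rc;           /* extend the end-of-block marker */
        boost = zbin_boost; /* and reset the zero-run boost */
      } else {
        boost++;
      }
    } else {
      x[rc] = 0; /* inside the boosted dead zone: drop it */
      boost++;
    }
  }
  return eob + 1;
}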
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 3a8f8ea45..5744cbabc 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -124,4 +124,9 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
 endif
 
+# common (loongarch LSX intrinsics)
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/quantize_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c
+
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/vpx_dsp/loongarch/subtract_lsx.c b/vpx_dsp/loongarch/subtract_lsx.c
new file mode 100644
index 000000000..943a5c5a9
--- /dev/null
+++ b/vpx_dsp/loongarch/subtract_lsx.c
@@ -0,0 +1,371 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3;
+  __m128i pred0, pred1, pred2, pred3;
+  __m128i diff0, diff1;
+  __m128i reg0, reg1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t diff_stride2 = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t diff_stride3 = diff_stride2 + diff_stride;
+
+  DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+            src2, src3);
+  DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+            pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2,
+            src0, src2, pred0, pred2);
+  DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0);
+  reg0 = __lsx_vilvl_b(src0, pred0);
+  reg1 = __lsx_vilvh_b(src0, pred0);
+  DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1);
+  __lsx_vstelm_d(diff0, diff_ptr, 0, 0);
+  __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1);
+  __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0);
+  __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1);
+}
+
+static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t dst_stride = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+            src2, src3);
+  DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+            pred1, pred2, pred3);
+  src_ptr += src_stride4;
+  pred_ptr += pred_stride4;
+
+  DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5,
+            src6, src7);
+  DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4,
+            pred5, pred6, pred7);
+
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  __lsx_vst(src0, diff_ptr, 0);
+  __lsx_vstx(src1, diff_ptr, dst_stride);
+  __lsx_vstx(src2, diff_ptr, dst_stride2);
+  __lsx_vstx(src3, diff_ptr, dst_stride3);
+  diff_ptr += dst_stride2;
+  __lsx_vst(src4, diff_ptr, 0);
+  __lsx_vstx(src5, diff_ptr, dst_stride);
+  __lsx_vstx(src6, diff_ptr, dst_stride2);
+  __lsx_vstx(src7, diff_ptr, dst_stride3);
+}
+
+static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t dst_stride = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+  int16_t *diff_tmp = diff + 8;
+
+  DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+            pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            pred, pred_stride, src5, src6, src7, pred5);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg2, reg4, reg6);
+  DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg1, reg3, reg5, reg7);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp0, tmp2, tmp4, tmp6);
+  DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp1, tmp3, tmp5, tmp7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+            pred0, pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+            pred4, pred5, pred6, pred7);
+  __lsx_vst(src0, diff, 0);
+  __lsx_vstx(src2, diff, dst_stride);
+  __lsx_vstx(src4, diff, dst_stride2);
+  __lsx_vstx(src6, diff, dst_stride3);
+  __lsx_vst(src1, diff_tmp, 0);
+  __lsx_vstx(src3, diff_tmp, dst_stride);
+  __lsx_vstx(src5, diff_tmp, dst_stride2);
+  __lsx_vstx(src7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  __lsx_vst(pred0, diff, 0);
+  __lsx_vstx(pred2, diff, dst_stride);
+  __lsx_vstx(pred4, diff, dst_stride2);
+  __lsx_vstx(pred6, diff, dst_stride3);
+  __lsx_vst(pred1, diff_tmp, 0);
+  __lsx_vstx(pred3, diff_tmp, dst_stride);
+  __lsx_vstx(pred5, diff_tmp, dst_stride2);
+  __lsx_vstx(pred7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+            pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            pred, pred_stride, src5, src6, src7, pred5);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg2, reg4, reg6);
+  DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg1, reg3, reg5, reg7);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp0, tmp2, tmp4, tmp6);
+  DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp1, tmp3, tmp5, tmp7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+            pred0, pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+            pred4, pred5, pred6, pred7);
+  __lsx_vst(src0, diff, 0);
+  __lsx_vstx(src2, diff, dst_stride);
+  __lsx_vstx(src4, diff, dst_stride2);
+  __lsx_vstx(src6, diff, dst_stride3);
+  __lsx_vst(src1, diff_tmp, 0);
+  __lsx_vstx(src3, diff_tmp, dst_stride);
+  __lsx_vstx(src5, diff_tmp, dst_stride2);
+  __lsx_vstx(src7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  __lsx_vst(pred0, diff, 0);
+  __lsx_vstx(pred2, diff, dst_stride);
+  __lsx_vstx(pred4, diff, dst_stride2);
+  __lsx_vstx(pred6, diff, dst_stride3);
+  __lsx_vst(pred1, diff_tmp, 0);
+  __lsx_vstx(pred3, diff_tmp, dst_stride);
+  __lsx_vstx(pred5, diff_tmp, dst_stride2);
+  __lsx_vstx(pred7, diff_tmp, dst_stride3);
+}
+
+static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  uint32_t loop_cnt;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+
+  for (loop_cnt = 8; loop_cnt--;) {
+    const uint8_t *src_tmp = src + 16;
+    const uint8_t *pred_tmp = pred + 16;
+    DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1,
+              pred0, pred1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+    DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred,
+              pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3);
+    DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred,
+              pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6,
+              pred7);
+    DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, reg1, reg3, reg5, reg7);
+    DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7,
+              pred7, tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7,
+              pred7, tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+              reg3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+              reg7, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+              tmp3, pred0, pred1, pred2, pred3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+              tmp7, pred4, pred5, pred6, pred7);
+    src += src_stride4;
+    pred += pred_stride4;
+    __lsx_vst(src0, diff, 0);
+    __lsx_vst(src1, diff, 16);
+    __lsx_vst(src2, diff, 32);
+    __lsx_vst(src3, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(src4, diff, 0);
+    __lsx_vst(src5, diff, 16);
+    __lsx_vst(src6, diff, 32);
+    __lsx_vst(src7, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(pred0, diff, 0);
+    __lsx_vst(pred1, diff, 16);
+    __lsx_vst(pred2, diff, 32);
+    __lsx_vst(pred3, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(pred4, diff, 0);
+    __lsx_vst(pred5, diff, 16);
+    __lsx_vst(pred6, diff, 32);
+    __lsx_vst(pred7, diff, 48);
+    diff += diff_stride;
+  }
+}
+
+static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  uint32_t loop_cnt;
+
+  for (loop_cnt = 32; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1,
+              pred2, pred3);
+    src += src_stride;
+    pred += pred_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+              src7);
+    DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5,
+              pred6, pred7);
+    src += src_stride;
+    pred += pred_stride;
+
+    DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, reg1, reg3, reg5, reg7);
+    DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7,
+              pred7, tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7,
+              pred7, tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+              reg3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+              reg7, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+              tmp3, pred0, pred1, pred2, pred3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+              tmp7, pred4, pred5, pred6, pred7);
+    __lsx_vst(src0, diff, 0);
+    __lsx_vst(src1, diff, 16);
+    __lsx_vst(src2, diff, 32);
+    __lsx_vst(src3, diff, 48);
+    __lsx_vst(src4, diff, 64);
+    __lsx_vst(src5, diff, 80);
+    __lsx_vst(src6, diff, 96);
+    __lsx_vst(src7, diff, 112);
+    diff += diff_stride;
+    __lsx_vst(pred0, diff, 0);
+    __lsx_vst(pred1, diff, 16);
+    __lsx_vst(pred2, diff, 32);
+    __lsx_vst(pred3, diff, 48);
+    __lsx_vst(pred4, diff, 64);
+    __lsx_vst(pred5, diff, 80);
+    __lsx_vst(pred6, diff, 96);
+    __lsx_vst(pred7, diff, 112);
+    diff += diff_stride;
+  }
+}
+
+void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr,
+                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                            ptrdiff_t pred_stride) {
+  if (rows == cols) {
+    switch (rows) {
+      case 4:
+        sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 8:
+        sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 16:
+        sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 32:
+        sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 64:
+        sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      default:
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+                             src_stride, pred_ptr, pred_stride);
+        break;
+    }
+  } else {
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+                         src_stride, pred_ptr, pred_stride);
+  }
+}
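Note: every block size above computes the same thing as the C fallback that the default/else branches fall through to: a per-pixel int16 difference between source and prediction. For reference, vpx_subtract_block_c (quoted from memory of vpx_dsp/subtract.c) is:

/* Scalar reference for vpx_subtract_block (from memory; see
 * vpx_dsp/subtract.c in the tree for the authoritative version). */
void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
                          ptrdiff_t diff_stride, const uint8_t *src_ptr,
                          ptrdiff_t src_stride, const uint8_t *pred_ptr,
                          ptrdiff_t pred_stride) {
  int r, c;
  for (r = 0; r < rows; r++) {
    for (c = 0; c < cols; c++) {
      diff_ptr[c] = (int16_t)(src_ptr[c] - pred_ptr[c]);
    }
    diff_ptr += diff_stride;
    pred_ptr += pred_stride;
    src_ptr += src_stride;
  }
}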
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 7de8b0205..9d8c94545 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -382,6 +382,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm
 DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
 DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c
 
+DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
+
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 706af97e5..1ef99e641 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -730,7 +730,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 # Block subtraction
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/;
 
 #
 # Single block SAD
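Note: the test instantiations added above (QuantizeTest, FdctTest, VP9SubtractBlockTest) verify each LSX kernel against its C reference. A minimal standalone check in the same spirit is sketched below; it assumes an LSX-enabled build in which both symbols are declared by the generated vpx_dsp_rtcd.h and linked in, and is not upstream test code:

/* Sketch of the optimized-vs-C equivalence check the new tests perform. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "./vpx_dsp_rtcd.h"

int main(void) {
  uint8_t src[64 * 64], pred[64 * 64];
  int16_t diff_c[64 * 64], diff_lsx[64 * 64];
  int i;
  /* Fill source and prediction with pseudo-random pixels. */
  for (i = 0; i < 64 * 64; i++) {
    src[i] = (uint8_t)rand();
    pred[i] = (uint8_t)rand();
  }
  vpx_subtract_block_c(64, 64, diff_c, 64, src, 64, pred, 64);
  vpx_subtract_block_lsx(64, 64, diff_lsx, 64, src, 64, pred, 64);
  if (memcmp(diff_c, diff_lsx, sizeof(diff_c)) != 0) {
    fprintf(stderr, "LSX/C mismatch\n");
    return 1;
  }
  return 0;
}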