author    yuanhecai <yuanhecai@loongson.cn>  2022-04-05 18:17:19 +0800
committer yuanhecai <yuanhecai@loongson.cn>  2022-04-26 20:54:41 +0800
commit    f6de5b51b8338ebd743a465e84d2c4b73cc29082 (patch)
tree      5dbd956a2fe7311e51523e42f48beff9732c450d /vpx_dsp/loongarch
parent    8a29e27e17c8e4277ba15ca993a8912464c2400a (diff)
download  libvpx-f6de5b51b8338ebd743a465e84d2c4b73cc29082.tar
          libvpx-f6de5b51b8338ebd743a465e84d2c4b73cc29082.tar.gz
          libvpx-f6de5b51b8338ebd743a465e84d2c4b73cc29082.tar.bz2
          libvpx-f6de5b51b8338ebd743a465e84d2c4b73cc29082.zip
vp9[loongarch]: Optimize fdct/get/variance16x16

1. vpx_fdct16x16_lsx
2. vpx_get16x16var_lsx
3. vpx_variance16x16_lsx

Bug: webm:1755
Change-Id: I27090406dc28cfdca64760fea4bc16ae11b74628
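All three variance-side functions follow libvpx's generic definition: accumulate the sum and sum of squared source/reference differences over the block, then form variance = sse - sum^2 / N. A scalar model of the 16x16 case for orientation (illustrative names, not code from this patch):

#include <stdint.h>

/* Scalar model of get16x16var: per-block sum and sum of squared
 * differences between source and reference pixels. */
static void get16x16var_model(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) {
      const int diff = src[j] - ref[j];
      *sum += diff;
      *sse += (uint32_t)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
}

/* variance = sse - sum^2 / 256 for a 16x16 block (shift of 8). */
static uint32_t variance16x16_model(uint32_t sse, int sum) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}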
Diffstat (limited to 'vpx_dsp/loongarch')
-rw-r--r--  vpx_dsp/loongarch/fwd_txfm_lsx.c     258
-rw-r--r--  vpx_dsp/loongarch/fwd_txfm_lsx.h     166
-rw-r--r--  vpx_dsp/loongarch/txfm_macros_lsx.h    8
-rw-r--r--  vpx_dsp/loongarch/variance_lsx.c      54
4 files changed, 484 insertions, 2 deletions
diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c
new file mode 100644
index 000000000..03f194b43
--- /dev/null
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride) {
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+ __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+ __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };
+ __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };
+ __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 };
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+ int32_t src_stride8 = src_stride4 << 1;
+ int16_t *input_tmp = (int16_t *)input;
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+ input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4);
+ input_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+ input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8);
+ input_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+ input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11,
+ in12);
+ input_tmp += src_stride4;
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13,
+ in14);
+ input_tmp += src_stride2;
+ in15 = __lsx_vldx(input_tmp, src_stride2);
+
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+ in11);
+ DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+ in15);
+ DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0,
+ tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5,
+ tmp6, tmp7);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ __lsx_vst(tmp0, tmp_ptr, 0);
+ __lsx_vst(tmp1, tmp_ptr, 64);
+ __lsx_vst(tmp2, tmp_ptr, 128);
+ __lsx_vst(tmp3, tmp_ptr, 192);
+ __lsx_vst(tmp4, tmp_ptr, 256);
+ __lsx_vst(tmp5, tmp_ptr, 320);
+ __lsx_vst(tmp6, tmp_ptr, 384);
+ __lsx_vst(tmp7, tmp_ptr, 448);
+ DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15,
+ in14, in13, in12);
+ DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10,
+ in9, in8);
+
+ tmp_ptr += 16;
+
+ /* stp 1 */
+ DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4);
+ DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5);
+
+ cnst4 = __lsx_vreplvei_h(coeff, 0);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25);
+
+ cnst5 = __lsx_vreplvei_h(coeff, 1);
+ cnst5 = __lsx_vpackev_h(cnst5, cnst4);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23);
+
+ /* stp2 */
+ LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+ LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+ DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4);
+ DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26);
+
+ cnst0 = __lsx_vreplvei_h(coeff, 4);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21);
+
+ LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+ vec1 = __lsx_vilvl_h(in15, in8);
+ vec0 = __lsx_vilvh_h(in15, in8);
+
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 0);
+
+ cnst0 = __lsx_vreplvei_h(coeff2, 0);
+ cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 448);
+
+ vec1 = __lsx_vilvl_h(in14, in9);
+ vec0 = __lsx_vilvh_h(in14, in9);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+ __lsx_vst(in8, tmp_ptr, 256);
+
+ cnst1 = __lsx_vreplvei_h(coeff2, 2);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 192);
+
+ DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25);
+
+ cnst1 = __lsx_vreplvei_h(coeff, 3);
+ cnst1 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22);
+
+ /* stp4 */
+ DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10);
+
+ vec1 = __lsx_vilvl_h(in13, in10);
+ vec0 = __lsx_vilvh_h(in13, in10);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 128);
+
+ cnst0 = __lsx_vreplvei_h(coeff2, 1);
+ cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 320);
+
+ DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11);
+ vec1 = __lsx_vilvl_h(in12, in11);
+ vec0 = __lsx_vilvh_h(in12, in11);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+ __lsx_vst(in8, tmp_ptr, 384);
+
+ cnst1 = __lsx_vreplvei_h(coeff2, 3);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 64);
+}
+
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ int16_t *input_tmp = input;
+
+ DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5,
+ in6, in7);
+ DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp,
+ 112, in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208,
+ input_tmp, 240, in12, in13, in14, in15);
+
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10,
+ in11);
+ DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13,
+ in14, in15);
+
+ DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+ in11);
+ DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+ in15);
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4,
+ tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14,
+ in15);
+ __lsx_vst(in8, input, 0);
+ __lsx_vst(in9, input, 32);
+ __lsx_vst(in10, input, 64);
+ __lsx_vst(in11, input, 96);
+ __lsx_vst(in12, input, 128);
+ __lsx_vst(in13, input, 160);
+ __lsx_vst(in14, input, 192);
+ __lsx_vst(in15, input, 224);
+
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9,
+ in10, in11);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12,
+ in13, in14, in15);
+ FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+ tmp1, in1, tmp2, in2, tmp3, in3);
+ __lsx_vst(tmp0, output, 0);
+ __lsx_vst(in0, output, 32);
+ __lsx_vst(tmp1, output, 64);
+ __lsx_vst(in1, output, 96);
+ __lsx_vst(tmp2, output, 128);
+ __lsx_vst(in2, output, 160);
+ __lsx_vst(tmp3, output, 192);
+ __lsx_vst(in3, output, 224);
+
+ LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+ tmp5, in5, tmp6, in6, tmp7, in7);
+ __lsx_vst(tmp4, output, 16);
+ __lsx_vst(in4, output, 48);
+ __lsx_vst(tmp5, output, 80);
+ __lsx_vst(in5, output, 112);
+ __lsx_vst(tmp6, output, 144);
+ __lsx_vst(in6, output, 176);
+ __lsx_vst(tmp7, output, 208);
+ __lsx_vst(in7, output, 240);
+}
+
+void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+ }
+}
+#endif // !CONFIG_VP9_HIGHBITDEPTH
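vpx_fdct16x16_lsx splits the 2-D transform into two 8-column passes of fdct8x16_1d_column into tmp_buf, then two 8-row passes of fdct16x8_1d_row. The column pass pre-scales inputs with __lsx_vslli_h(x, 2) (multiply by 4); the row pass compensates with the __lsx_vaddi_hu / __lsx_vsrai_h pair. Per int16_t lane that pair computes (a sketch of the lane arithmetic, not patch code):

#include <stdint.h>

/* Per-lane effect of __lsx_vaddi_hu(x, 1) followed by
 * __lsx_vsrai_h(x, 2) in fdct16x8_1d_row: add 1, then
 * arithmetic shift right by 2. */
static int16_t row_pass_round(int16_t x) {
  return (int16_t)((x + 1) >> 2);
}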
diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h
index a6f62dbc8..9ed810226 100644
--- a/vpx_dsp/loongarch/fwd_txfm_lsx.h
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h
@@ -113,4 +113,170 @@
__lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \
}
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ __m128i x0_m, x1_m, x2_m, x3_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
+ \
+ /* FDCT stage1 */ \
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+ s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
+ LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
+ x2_m = __lsx_vneg_h(x2_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
+ \
+ /* stage2 */ \
+ s1_m = __lsx_vilvl_h(s5_m, s6_m); \
+ s0_m = __lsx_vilvh_h(s5_m, s6_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
+ \
+ /* stage3 */ \
+ LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x0_m, x1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
+ \
+ x1_m = __lsx_vreplvei_h(coeff_m, 5); \
+ x0_m = __lsx_vneg_h(x0_m); \
+ x0_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
+ \
+ x2_m = __lsx_vreplvei_h(coeff_m, 6); \
+ x3_m = __lsx_vneg_h(x3_m); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
+ }
+
+#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
+ input7, out1, out3, out5, out7, out9, out11, out13, \
+ out15) \
+ { \
+ __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
+ __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
+ __m128i stp36_m, stp37_m, vec0_m, vec1_m; \
+ __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
+ __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; \
+ __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; \
+ __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 }; \
+ \
+ /* stp 1 */ \
+ DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \
+ DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \
+ \
+ cnst4_m = __lsx_vreplvei_h(coeff_m, 0); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m); \
+ \
+ cnst5_m = __lsx_vreplvei_h(coeff_m, 1); \
+ cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m); \
+ \
+ /* stp2 */ \
+ LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, \
+ stp32_m, stp33_m); \
+ LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, \
+ stp35_m, stp34_m); \
+ \
+ DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, \
+ vec4_m); \
+ DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, \
+ vec5_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff_m, 4); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff_m, 3); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m); \
+ \
+ /* stp4 */ \
+ LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, \
+ vec4_m, vec5_m); \
+ LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, \
+ stp24_m, stp31_m); \
+ \
+ vec1_m = __lsx_vilvl_h(vec2_m, vec6_m); \
+ vec0_m = __lsx_vilvh_h(vec2_m, vec6_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff2_m, 0); \
+ cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15); \
+ \
+ vec1_m = __lsx_vilvl_h(vec4_m, vec5_m); \
+ vec0_m = __lsx_vilvh_h(vec4_m, vec5_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9); \
+ \
+ cnst1_m = __lsx_vreplvei_h(coeff2_m, 2); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7); \
+ \
+ vec1_m = __lsx_vilvl_h(stp23_m, stp21_m); \
+ vec0_m = __lsx_vilvh_h(stp23_m, stp21_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff2_m, 1); \
+ cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11); \
+ \
+ vec1_m = __lsx_vilvl_h(stp24_m, stp31_m); \
+ vec0_m = __lsx_vilvh_h(stp24_m, stp31_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13); \
+ \
+ cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \
+ }
+
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride);
+void fdct16x8_1d_row(int16_t *input, int16_t *output);
#endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
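The 64-bit initializers in FDCT8x16_EVEN/FDCT8x16_ODD pack four int16_t DCT cosine constants into each half of the __m128i, which the code indexes with __lsx_vreplvei_h. Decoding the first literal, assuming little-endian lane order (constant names from vpx_dsp/txfm_common.h):

/* coeff_m = { 0x187e3b21d2bf2d41, ... } unpacks, lane 0 first, to:
 *   lane 0: 0x2d41 =  11585  (cospi_16_64)
 *   lane 1: 0xd2bf = -11585  (-cospi_16_64)
 *   lane 2: 0x3b21 =  15137  (cospi_8_64)
 *   lane 3: 0x187e =   6270  (cospi_24_64)
 * __lsx_vreplvei_h(coeff_m, n) broadcasts lane n across the vector, and
 * __lsx_vpackev_h interleaves two such splats into (c0, c1) coefficient
 * pairs for the dot-product step below. */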
diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h
index bc6f7dacc..977f1c2dd 100644
--- a/vpx_dsp/loongarch/txfm_macros_lsx.h
+++ b/vpx_dsp/loongarch/txfm_macros_lsx.h
@@ -44,4 +44,12 @@
out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \
}
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \
+ do { \
+ __m128i tp0_m, tp1_m; \
+ \
+ DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \
+ in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS); \
+ } while (0)
+
#endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
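Per output lane, DOT_SHIFT_RIGHT_PCK_H computes a 2-tap dot product (__lsx_vdp2_w_h multiplies adjacent int16 pairs and accumulates into 32 bits), then __lsx_vssrarni_h_w rounds the result back to int16 by DCT_CONST_BITS with signed saturation. A scalar model of one lane, assuming DCT_CONST_BITS = 14 as defined in vpx_dsp/txfm_common.h:

#include <stdint.h>

#define DCT_CONST_BITS 14

/* One lane of DOT_SHIFT_RIGHT_PCK_H: dot product of an (even, odd)
 * int16 pair with a packed (c0, c1) coefficient pair, rounded and
 * saturated back to int16. */
static int16_t dot_shift_lane(int16_t e, int16_t o, int16_t c0, int16_t c1) {
  int32_t acc = (int32_t)e * c0 + (int32_t)o * c1;
  acc = (acc + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS; /* round */
  if (acc > INT16_MAX) acc = INT16_MAX;                        /* saturate */
  if (acc < INT16_MIN) acc = INT16_MIN;
  return (int16_t)acc;
}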
diff --git a/vpx_dsp/loongarch/variance_lsx.c b/vpx_dsp/loongarch/variance_lsx.c
index 8164e9818..8f2ec0563 100644
--- a/vpx_dsp/loongarch/variance_lsx.c
+++ b/vpx_dsp/loongarch/variance_lsx.c
@@ -37,9 +37,50 @@
sub = __lsx_vadd_h(sub, res_l1_m); \
}
+#define VARIANCE_WxH(sse, diff, shift) \
+ (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
#define VARIANCE_LARGE_WxH(sse, diff, shift) \
(sse) - (((int64_t)(diff) * (diff)) >> (shift))
+static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt = (height >> 2);
+ __m128i src, ref, vec;
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ for (; ht_cnt--;) {
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride,
const uint8_t *ref_ptr, int32_t ref_stride,
int32_t height, int32_t *diff) {
@@ -133,8 +174,10 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride,
return HADD_SW_S32(var);
}
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
#define VPX_VARIANCE_WDXHT_LSX(wd, ht) \
uint32_t vpx_variance##wd##x##ht##_lsx( \
@@ -148,6 +191,7 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride,
return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
}
+VPX_VARIANCE_WDXHT_LSX(16, 16)
VPX_VARIANCE_WDXHT_LSX(32, 32)
uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride,
@@ -159,3 +203,9 @@ uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride,
return VARIANCE_64Wx64H(*sse, diff);
}
+
+void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
+ *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum);
+}
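For reference, the new VPX_VARIANCE_WDXHT_LSX(16, 16) instantiation expands to roughly the following (a sketch of the macro expansion; parameter names are assumed from the surrounding code, since the diff elides the macro's full parameter list):

uint32_t vpx_variance16x16_lsx(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, &diff);
  /* VARIANCE_16Wx16H(*sse, diff) == *sse - ((uint32_t)diff * diff >> 8) */
  return VARIANCE_16Wx16H(*sse, diff);
}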