summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- vpx_dsp/loongarch/sad_lsx.c 2
-rw-r--r-- vpx_dsp/loongarch/txfm_macros_lsx.h 53
-rw-r--r-- vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c 49
-rw-r--r-- vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c 161
-rw-r--r-- vpx_dsp/loongarch/vpx_convolve8_lsx.c 110
-rw-r--r-- vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c 243
-rw-r--r-- vpx_dsp/loongarch/vpx_convolve_copy_lsx.c 1
-rw-r--r-- vpx_dsp/loongarch/vpx_convolve_lsx.h 25
8 files changed, 271 insertions, 373 deletions
diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c
index 46ee557df..5eaebfb51 100644
--- a/vpx_dsp/loongarch/sad_lsx.c
+++ b/vpx_dsp/loongarch/sad_lsx.c
@@ -198,7 +198,7 @@ static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
int32_t ref_stride, int32_t height,
uint32_t *sad_array) {
int32_t ht_cnt = (height >> 2);
- uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
__m128i src0, src1, src2, src3, sad_tmp;
__m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
__m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h
index 977f1c2dd..bd514831b 100644
--- a/vpx_dsp/loongarch/txfm_macros_lsx.h
+++ b/vpx_dsp/loongarch/txfm_macros_lsx.h
@@ -13,36 +13,29 @@
#include "vpx_util/loongson_intrinsics.h"
-#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
- { \
- __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \
- __m128i k0_m, k1_m, k2_m, k3_m; \
- __m128i zero = __lsx_vldi(0); \
- \
- k0_m = __lsx_vreplgr2vr_h(cnst0); \
- k1_m = __lsx_vreplgr2vr_h(cnst1); \
- k2_m = __lsx_vpackev_h(k1_m, k0_m); \
- k0_m = __lsx_vpackev_h(zero, k0_m); \
- k1_m = __lsx_vpackev_h(k1_m, zero); \
- \
- s5_m = __lsx_vilvl_h(reg1, reg0); \
- s4_m = __lsx_vilvh_h(reg1, reg0); \
- s3_m = __lsx_vilvl_h(reg0, reg1); \
- s2_m = __lsx_vilvh_h(reg0, reg1); \
- \
- s1_m = __lsx_vdp2_w_h(s5_m, k0_m); \
- s0_m = __lsx_vdp2_w_h(s4_m, k0_m); \
- k3_m = __lsx_vdp2_w_h(s5_m, k1_m); \
- s1_m = __lsx_vsub_w(s1_m, k3_m); \
- k3_m = __lsx_vdp2_w_h(s4_m, k1_m); \
- s0_m = __lsx_vsub_w(s0_m, k3_m); \
- \
- out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \
- \
- s1_m = __lsx_vdp2_w_h(s3_m, k2_m); \
- s0_m = __lsx_vdp2_w_h(s2_m, k2_m); \
- out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \
- }
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
+ do { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \
+ __m128i k0_m, k1_m, k2_m, k3_m; \
+ \
+ k0_m = __lsx_vreplgr2vr_h(cnst0); \
+ k1_m = __lsx_vreplgr2vr_h(cnst1); \
+ k2_m = __lsx_vpackev_h(k1_m, k0_m); \
+ \
+ DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m); \
+ \
+ DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \
+ k3_m = __lsx_vmulwod_w_h(s5_m, k1_m); \
+ s1_m = __lsx_vsub_w(s1_m, k3_m); \
+ k3_m = __lsx_vmulwod_w_h(s4_m, k1_m); \
+ s0_m = __lsx_vsub_w(s0_m, k3_m); \
+ \
+ out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \
+ \
+ DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m); \
+ out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \
+ } while (0)
#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \
do { \
diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
index 2b983552b..54fcd6c57 100644
--- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
+++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
@@ -278,7 +278,7 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx(
__m128i src0, src1, src2, src3, src4, mask;
__m128i filt_hz, filt_vt, vec0, vec1;
__m128i dst0, dst1, dst2, dst3;
- __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, out;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
__m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
int32_t src_stride2 = src_stride << 1;
@@ -311,13 +311,12 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx(
dst1 = __lsx_vilvl_w(dst3, dst2);
dst0 = __lsx_vilvl_d(dst1, dst0);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- out = __lsx_vpickev_b(tmp1, tmp0);
- out = __lsx_vavgr_bu(out, dst0);
- __lsx_vstelm_w(out, dst, 0, 0);
- __lsx_vstelm_w(out, dst + dst_stride, 0, 1);
- __lsx_vstelm_w(out, dst + dst_stride2, 0, 2);
- __lsx_vstelm_w(out, dst + dst_stride3, 0, 3);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+ __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
}
static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx(
@@ -386,9 +385,8 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx(
hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
filt_vt, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2,
- FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, res0, res1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, res0, res1);
DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1);
__lsx_vstelm_w(res0, dst, 0, 0);
@@ -467,10 +465,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx(
hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
-
- DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2,
- FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3);
- PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1);
+ AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
}
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
@@ -513,8 +510,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
-
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
@@ -522,8 +517,8 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
-
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1);
dst0 = __lsx_vldrepl_d(dst_tmp, 0);
dst_tmp += dst_stride;
@@ -534,7 +529,7 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
dst3 = __lsx_vldrepl_d(dst_tmp, 0);
dst_tmp += dst_stride;
DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
- PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+ AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
dst += dst_stride;
}
}
@@ -597,8 +592,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp3 = __lsx_vpickev_b(tmp1, tmp0);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
tmp3 = __lsx_vavgr_bu(tmp3, dst0);
__lsx_vst(tmp3, dst, 0);
@@ -606,8 +600,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp3 = __lsx_vpickev_b(tmp1, tmp0);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
tmp3 = __lsx_vavgr_bu(tmp3, dst1);
__lsx_vstx(tmp3, dst, dst_stride);
@@ -615,8 +608,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp3 = __lsx_vpickev_b(tmp1, tmp0);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
tmp3 = __lsx_vavgr_bu(tmp3, dst2);
__lsx_vstx(tmp3, dst, dst_stride2);
@@ -624,8 +616,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp3 = __lsx_vpickev_b(tmp1, tmp0);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
tmp3 = __lsx_vavgr_bu(tmp3, dst3);
__lsx_vstx(tmp3, dst, dst_stride3);
dst += dst_stride4;
@@ -642,8 +633,6 @@ static void common_hv_2ht_2vt_and_aver_dst_32w_lsx(
common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
filter_horiz, filter_vert, height);
- src += 16;
- dst += 16;
}
static void common_hv_2ht_2vt_and_aver_dst_64w_lsx(
diff --git a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
index 5d67d6527..2c6459a97 100644
--- a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
+++ b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
@@ -338,8 +338,7 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int8_t *filter) {
__m128i src0, src1, src2, src3, mask;
- __m128i filt0, vec0, vec1, res0, res1;
- __m128i vec2, vec3;
+ __m128i filt0, vec0, vec1, vec2, vec3, res0, res1;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride + src_stride2;
@@ -355,8 +354,8 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
src3 = __lsx_vldx(src, src_stride3);
DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3);
- DUP2_ARG2(__lsx_vsrari_h, vec2, FILTER_BITS, vec3, FILTER_BITS, vec2, vec3);
- DUP2_ARG2(__lsx_vpickev_b, vec2, vec2, vec3, vec3, res0, res1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3,
+ FILTER_BITS, res0, res1);
__lsx_vstelm_w(res0, dst, 0, 0);
__lsx_vstelm_w(res0, dst + dst_stride, 0, 1);
@@ -367,10 +366,9 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int8_t *filter) {
- __m128i vec0, vec1, vec2, vec3, filt0;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
__m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
- __m128i res0, res1, res2, res3;
- __m128i vec4, vec5, vec6, vec7;
+ __m128i res0, res1, res2, res3, filt0;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride + src_stride2;
int32_t src_stride4 = src_stride2 << 1;
@@ -396,10 +394,10 @@ static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
src7, src6, mask, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
vec4, vec5, vec6, vec7);
- DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6,
- FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7);
- DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
- res0, res1, res2, res3);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+ FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+ res1, res2, res3);
+
__lsx_vstelm_w(res0, dst, 0, 0);
dst += dst_stride;
__lsx_vstelm_w(res0, dst, 0, 1);
@@ -451,14 +449,13 @@ static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
src3, src3, mask, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
vec0, vec1, vec2, vec3);
- DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2,
- FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3);
- DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, src0, src1);
-
- __lsx_vstelm_d(src0, dst, 0, 0);
- __lsx_vstelm_d(src0, dst + dst_stride, 0, 1);
- __lsx_vstelm_d(src1, dst + dst_stride2, 0, 0);
- __lsx_vstelm_d(src1, dst + dst_stride3, 0, 1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, vec0, vec1);
+
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1);
}
static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
@@ -490,15 +487,9 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
src3, src3, mask, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
vec0, vec1, vec2, vec3);
- DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2,
- FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3);
-
- src0 = __lsx_vld(src, 0);
- DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
- src3 = __lsx_vldx(src, src_stride3);
- src += src_stride4;
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
- DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1);
__lsx_vstelm_d(out0, dst, 0, 0);
dst += dst_stride;
__lsx_vstelm_d(out0, dst, 0, 1);
@@ -508,13 +499,17 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
__lsx_vstelm_d(out1, dst, 0, 1);
dst += dst_stride;
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
src3, src3, mask, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
vec0, vec1, vec2, vec3);
- DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2,
- FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3);
- DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
__lsx_vstelm_d(out0, dst, 0, 0);
dst += dst_stride;
@@ -537,27 +532,25 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
mask, src3, src3, mask, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
filt0, vec0, vec1, vec2, vec3);
- DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2,
- FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3);
-
- src0 = __lsx_vld(src, 0);
- DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
- src3 = __lsx_vldx(src, src_stride3);
- src += src_stride4;
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
- DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1);
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
__lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
__lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
mask, src3, src3, mask, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
filt0, vec0, vec1, vec2, vec3);
- DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2,
- FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3);
- DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
__lsx_vstelm_d(out0, dst_tmp1, 0, 0);
__lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1);
@@ -582,7 +575,7 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt = (height >> 2) - 1;
__m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
__m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride2 + src_stride;
@@ -609,22 +602,17 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
out0, out1, out2, out3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
out4, out5, out6, out7);
- DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2,
- FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3);
- DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6,
- FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0,
+ out1, out2, out3);
- tmp = __lsx_vpickev_b(out1, out0);
- __lsx_vst(tmp, dst, 0);
+ __lsx_vst(out0, dst, 0);
dst += dst_stride;
- tmp = __lsx_vpickev_b(out3, out2);
- __lsx_vst(tmp, dst, 0);
+ __lsx_vst(out1, dst, 0);
dst += dst_stride;
- tmp = __lsx_vpickev_b(out5, out4);
- __lsx_vst(tmp, dst, 0);
+ __lsx_vst(out2, dst, 0);
dst += dst_stride;
- tmp = __lsx_vpickev_b(out7, out6);
- __lsx_vst(tmp, dst, 0);
+ __lsx_vst(out3, dst, 0);
dst += dst_stride;
for (; loop_cnt--;) {
@@ -648,22 +636,17 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
filt0, out0, out1, out2, out3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
filt0, out4, out5, out6, out7);
- DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2,
- FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3);
- DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6,
- FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out1, out2, out3);
- tmp = __lsx_vpickev_b(out1, out0);
- __lsx_vst(tmp, dst, 0);
+ __lsx_vst(out0, dst, 0);
dst += dst_stride;
- tmp = __lsx_vpickev_b(out3, out2);
- __lsx_vst(tmp, dst, 0);
+ __lsx_vst(out1, dst, 0);
dst += dst_stride;
- tmp = __lsx_vpickev_b(out5, out4);
- __lsx_vst(tmp, dst, 0);
+ __lsx_vst(out2, dst, 0);
dst += dst_stride;
- tmp = __lsx_vpickev_b(out7, out6);
- __lsx_vst(tmp, dst, 0);
+ __lsx_vst(out3, dst, 0);
dst += dst_stride;
}
}
@@ -674,7 +657,7 @@ static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt = (height >> 1);
__m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
__m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
__m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
mask = __lsx_vld(mc_filt_mask_arr, 0);
@@ -699,21 +682,16 @@ static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
filt0, out0, out1, out2, out3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
filt0, out4, out5, out6, out7);
- DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2,
- FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3);
- DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6,
- FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7);
-
- tmp = __lsx_vpickev_b(out1, out0);
- __lsx_vst(tmp, dst, 0);
- tmp = __lsx_vpickev_b(out3, out2);
- __lsx_vst(tmp, dst, 16);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
dst += dst_stride;
- tmp = __lsx_vpickev_b(out5, out4);
- __lsx_vst(tmp, dst, 0);
- tmp = __lsx_vpickev_b(out7, out6);
- __lsx_vst(tmp, dst, 16);
+ __lsx_vst(out2, dst, 0);
+ __lsx_vst(out3, dst, 16);
dst += dst_stride;
}
}
@@ -724,7 +702,7 @@ static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt = height;
__m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
__m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
__m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
mask = __lsx_vld(mc_filt_mask_arr, 0);
@@ -749,19 +727,14 @@ static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
filt0, out0, out1, out2, out3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
filt0, out4, out5, out6, out7);
- DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2,
- FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3);
- DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6,
- FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7);
-
- tmp = __lsx_vpickev_b(out1, out0);
- __lsx_vst(tmp, dst, 0);
- tmp = __lsx_vpickev_b(out3, out2);
- __lsx_vst(tmp, dst, 16);
- tmp = __lsx_vpickev_b(out5, out4);
- __lsx_vst(tmp, dst, 32);
- tmp = __lsx_vpickev_b(out7, out6);
- __lsx_vst(tmp, dst, 48);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+ __lsx_vst(out2, dst, 32);
+ __lsx_vst(out3, dst, 48);
dst += dst_stride;
}
}
diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c
index 894c13720..73583abb9 100644
--- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c
+++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c
@@ -248,7 +248,7 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
int8_t *filter_horiz,
int8_t *filter_vert) {
__m128i src0, src1, src2, src3, src4, mask;
- __m128i filt_vt, filt_hz, vec0, vec1, res0, res1;
+ __m128i filt_vt, filt_hz, vec0, vec1;
__m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
__m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
@@ -276,13 +276,13 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- DUP2_ARG2(__lsx_vpickev_b, tmp0, tmp0, tmp1, tmp1, res0, res1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1,
+ FILTER_BITS, tmp0, tmp1);
- __lsx_vstelm_w(res0, dst, 0, 0);
- __lsx_vstelm_w(res0, dst + dst_stride, 0, 1);
- __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0);
- __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1);
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1);
}
static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
@@ -290,7 +290,6 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
int8_t *filter_horiz,
int8_t *filter_vert) {
__m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
- __m128i res0, res1, res2, res3;
__m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
__m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
__m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7;
@@ -331,20 +330,19 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
filt_vt, vec4, vec5, vec6, vec7);
- DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6,
- FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7);
- DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
- res0, res1, res2, res3);
-
- __lsx_vstelm_w(res0, dst, 0, 0);
- __lsx_vstelm_w(res0, dst + dst_stride, 0, 1);
- __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0);
- __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+ FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4,
+ vec5, vec6, vec7);
+
+ __lsx_vstelm_w(vec4, dst, 0, 0);
+ __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1);
dst += dst_stride4;
- __lsx_vstelm_w(res2, dst, 0, 0);
- __lsx_vstelm_w(res2, dst + dst_stride, 0, 1);
- __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0);
- __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1);
+ __lsx_vstelm_w(vec6, dst, 0, 0);
+ __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1);
}
static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride,
@@ -364,7 +362,7 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert) {
- __m128i src0, src1, src2, src3, src4, mask, out0, out1;
+ __m128i src0, src1, src2, src3, src4, mask;
__m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
__m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
@@ -401,14 +399,13 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
- DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2,
- FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1);
- __lsx_vstelm_d(out0, dst, 0, 0);
- __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
- __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
- __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1);
}
static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
@@ -417,9 +414,9 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
int8_t *filter_horiz,
int8_t *filter_vert, int32_t height) {
uint32_t loop_cnt = (height >> 3);
- __m128i src0, src1, src2, src3, src4, mask, out0, out1;
+ __m128i src0, src1, src2, src3, src4, mask;
__m128i filt_hz, filt_vt, vec0;
- __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride2 + src_stride;
@@ -449,8 +446,6 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
- DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2);
-
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
@@ -463,43 +458,44 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
- DUP2_ARG2(__lsx_vsrari_h, tmp3, FILTER_BITS, tmp4, FILTER_BITS, tmp3, tmp4);
- DUP2_ARG2(__lsx_vpickev_b, tmp2, tmp1, tmp4, tmp3, out0, out1);
- __lsx_vstelm_d(out0, dst, 0, 0);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+ FILTER_BITS, tmp1, tmp2);
+
+ __lsx_vstelm_d(tmp1, dst, 0, 0);
dst += dst_stride;
- __lsx_vstelm_d(out0, dst, 0, 1);
+ __lsx_vstelm_d(tmp1, dst, 0, 1);
dst += dst_stride;
- __lsx_vstelm_d(out1, dst, 0, 0);
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
dst += dst_stride;
- __lsx_vstelm_d(out1, dst, 0, 1);
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
dst += dst_stride;
hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
- tmp5 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
- tmp6 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
- tmp7 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
- tmp8 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
- DUP4_ARG2(__lsx_vsrari_h, tmp5, FILTER_BITS, tmp6, FILTER_BITS, tmp7,
- FILTER_BITS, tmp8, FILTER_BITS, tmp5, tmp6, tmp7, tmp8);
- DUP2_ARG2(__lsx_vpickev_b, tmp6, tmp5, tmp8, tmp7, out0, out1);
- __lsx_vstelm_d(out0, dst, 0, 0);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+ FILTER_BITS, tmp1, tmp2);
+
+ __lsx_vstelm_d(tmp1, dst, 0, 0);
dst += dst_stride;
- __lsx_vstelm_d(out0, dst, 0, 1);
+ __lsx_vstelm_d(tmp1, dst, 0, 1);
dst += dst_stride;
- __lsx_vstelm_d(out1, dst, 0, 0);
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
dst += dst_stride;
- __lsx_vstelm_d(out1, dst, 0, 1);
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
dst += dst_stride;
}
}
@@ -554,8 +550,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
- DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2);
- tmp = __lsx_vpickev_b(tmp2, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
__lsx_vst(tmp, dst, 0);
dst += dst_stride;
@@ -563,8 +558,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
- DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2);
- tmp = __lsx_vpickev_b(tmp2, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
__lsx_vst(tmp, dst, 0);
dst += dst_stride;
@@ -572,8 +566,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
- DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2);
- tmp = __lsx_vpickev_b(tmp2, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
__lsx_vst(tmp, dst, 0);
dst += dst_stride;
@@ -581,8 +574,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
- DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2);
- tmp = __lsx_vpickev_b(tmp2, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
__lsx_vst(tmp, dst, 0);
dst += dst_stride;
}
@@ -599,8 +591,6 @@ static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride,
common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
filter_vert, height);
- src += 16;
- dst += 16;
}
static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride,
diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
index c0bb10f3b..7e3a95b2f 100644
--- a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
+++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
@@ -361,13 +361,12 @@ static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int8_t *filter) {
__m128i src0, src1, src2, src3, src4;
- __m128i src10_l, src32_l, src21_l, src43_l, src2110, src4332;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
__m128i filt0, tmp0, tmp1;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride2 + src_stride;
int32_t src_stride4 = src_stride2 << 1;
-
int32_t dst_stride2 = dst_stride << 1;
int32_t dst_stride3 = dst_stride2 + dst_stride;
@@ -378,37 +377,33 @@ static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
src, src_stride4, src1, src2, src3, src4);
src += (src_stride4 + src_stride);
- DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
- src10_l, src21_l, src32_l, src43_l);
- DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src2110,
- src4332);
- DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- src2110 = __lsx_vpickev_b(tmp1, tmp0);
-
- __lsx_vstelm_w(src2110, dst, 0, 0);
- __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1);
- __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2);
- __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+ vec1, vec2, vec3);
+ DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+ __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
}
static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int8_t *filter) {
__m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
- __m128i src65_l, src87_l, src2110, src4332, src6554, src8776;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+ __m128i vec6, vec7, vec8, vec9, vec10, vec11;
__m128i tmp0, tmp1, tmp2, tmp3;
__m128i filt0;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride2 + src_stride;
int32_t src_stride4 = src_stride2 << 1;
-
int32_t dst_stride2 = dst_stride << 1;
int32_t dst_stride3 = dst_stride2 + dst_stride;
int32_t dst_stride4 = dst_stride2 << 1;
-
uint8_t *dst_tmp1 = dst + dst_stride4;
filt0 = __lsx_vldrepl_h(filter, 0);
@@ -420,27 +415,27 @@ static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
src, src_stride4, src5, src6, src7, src8);
src += (src_stride4 + src_stride);
- DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
- src10_l, src21_l, src32_l, src43_l);
- DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
- src54_l, src65_l, src76_l, src87_l);
- DUP4_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
- src87_l, src76_l, src2110, src4332, src6554, src8776);
- DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0,
- src8776, filt0, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2,
- FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, src2110, src4332);
-
- __lsx_vstelm_w(src2110, dst, 0, 0);
- __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1);
- __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2);
- __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3);
-
- __lsx_vstelm_w(src4332, dst_tmp1, 0, 0);
- __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride, 0, 1);
- __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride2, 0, 2);
- __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride3, 0, 3);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+ vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4,
+ vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8,
+ vec9, vec10, vec11);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11,
+ filt0, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1);
+
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+ __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+
+ __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0);
+ __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2);
+ __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3);
}
static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
@@ -457,17 +452,14 @@ static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int8_t *filter) {
__m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
- __m128i out0, out1;
- __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride2 + src_stride;
int32_t src_stride4 = src_stride2 << 1;
-
int32_t dst_stride2 = dst_stride << 1;
int32_t dst_stride3 = dst_stride2 + dst_stride;
- /* rearranging filter_y */
filt0 = __lsx_vldrepl_h(filter, 0);
src0 = __lsx_vld(src, 0);
@@ -478,9 +470,8 @@ static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2,
- FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, out0, out1);
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
@@ -494,13 +485,11 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt = (height >> 3);
__m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
__m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- __m128i out0, out1;
- __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride2 + src_stride;
int32_t src_stride4 = src_stride2 << 1;
-
int32_t dst_stride2 = dst_stride << 1;
int32_t dst_stride3 = dst_stride2 + dst_stride;
int32_t dst_stride4 = dst_stride2 << 1;
@@ -525,9 +514,9 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
vec4, vec5, vec6, vec7);
DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
filt0, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2,
- FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, out0, out1);
+
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
__lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
@@ -536,9 +525,9 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
filt0, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2,
- FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, out0, out1);
+
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
__lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
@@ -559,29 +548,17 @@ static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
}
}
-static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int y0_q4,
- int y_step_q4, int w, int height) {
+static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
uint32_t loop_cnt = (height >> 2);
- __m128i src0, src1, src2, src3, src4;
+ __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1;
__m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride2 + src_stride;
int32_t src_stride4 = src_stride2 << 1;
- const int16_t *const filter_y = filter[y0_q4];
- int8_t cnt, filt_ver[8];
-
- assert(y_step_q4 == 16);
- assert(((const int32_t *)filter_y)[1] != 0x800000);
-
- for (cnt = 8; cnt--;) {
- filt_ver[cnt] = filter_y[cnt];
- }
-
- filt0 = __lsx_vldrepl_h(&filt_ver[3], 0);
+ filt0 = __lsx_vldrepl_h(filter, 0);
src0 = __lsx_vld(src, 0);
src += src_stride;
@@ -595,29 +572,25 @@ static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride,
DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp4 = __lsx_vpickev_b(tmp1, tmp0);
- __lsx_vst(tmp4, dst, 0);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
dst += dst_stride;
DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3);
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
- tmp4 = __lsx_vpickev_b(tmp3, tmp2);
- __lsx_vst(tmp4, dst, 0);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
dst += dst_stride;
DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp4 = __lsx_vpickev_b(tmp1, tmp0);
- __lsx_vst(tmp4, dst, 0);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
dst += dst_stride;
- DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3);
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
- tmp4 = __lsx_vpickev_b(tmp3, tmp2);
- __lsx_vst(tmp4, dst, 0);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
dst += dst_stride;
src0 = src4;
@@ -630,20 +603,18 @@ static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt = (height >> 2);
__m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
__m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4;
+ __m128i tmp, tmp0, tmp1;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride2 + src_stride;
int32_t src_stride4 = src_stride2 << 1;
-
int32_t dst_stride2 = dst_stride << 1;
int32_t dst_stride3 = dst_stride2 + dst_stride;
-
uint8_t *src_tmp;
+
filt0 = __lsx_vldrepl_h(filter, 0);
- src0 = __lsx_vld(src, 0);
- src5 = __lsx_vld(src, 16);
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
src += src_stride;
src_tmp = src + 16;
@@ -658,53 +629,45 @@ static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
src_tmp += src_stride4;
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp4 = __lsx_vpickev_b(tmp1, tmp0);
- __lsx_vst(tmp4, dst, 0);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3);
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
- tmp4 = __lsx_vpickev_b(tmp3, tmp2);
- __lsx_vstx(tmp4, dst, dst_stride);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vstx(tmp, dst, dst_stride);
DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp4 = __lsx_vpickev_b(tmp1, tmp0);
- __lsx_vstx(tmp4, dst, dst_stride2);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vstx(tmp, dst, dst_stride2);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3);
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
- tmp4 = __lsx_vpickev_b(tmp3, tmp2);
- __lsx_vstx(tmp4, dst, dst_stride3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vstx(tmp, dst, dst_stride3);
DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp4 = __lsx_vpickev_b(tmp1, tmp0);
- __lsx_vst(tmp4, dst, 16);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 16);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3);
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
dst += dst_stride;
- tmp4 = __lsx_vpickev_b(tmp3, tmp2);
- __lsx_vst(tmp4, dst, 16);
+ __lsx_vst(tmp, dst, 16);
DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
dst += dst_stride;
- tmp4 = __lsx_vpickev_b(tmp1, tmp0);
- __lsx_vst(tmp4, dst, 16);
+ __lsx_vst(tmp, dst, 16);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3);
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
dst += dst_stride;
- tmp4 = __lsx_vpickev_b(tmp3, tmp2);
- __lsx_vst(tmp4, dst, 16);
+ __lsx_vst(tmp, dst, 16);
dst += dst_stride;
@@ -719,7 +682,7 @@ static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt = (height >> 1);
__m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
__m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- __m128i tmp, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp, tmp0, tmp1;
int32_t src_stride2 = src_stride << 1;
int32_t dst_stride2 = dst_stride << 1;
@@ -743,49 +706,41 @@ static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp = __lsx_vpickev_b(tmp1, tmp0);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
__lsx_vst(tmp, dst, 0);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3);
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
- tmp = __lsx_vpickev_b(tmp3, tmp2);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
__lsx_vst(tmp, dst_tmp1, 0);
DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5);
- DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5);
- tmp = __lsx_vpickev_b(tmp5, tmp4);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
__lsx_vst(tmp, dst, 16);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7);
- DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7);
- tmp = __lsx_vpickev_b(tmp7, tmp6);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
__lsx_vst(tmp, dst_tmp1, 16);
DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
- DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
- tmp = __lsx_vpickev_b(tmp1, tmp0);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
__lsx_vst(tmp, dst, 32);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3);
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
- tmp = __lsx_vpickev_b(tmp3, tmp2);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
__lsx_vst(tmp, dst_tmp1, 32);
DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5);
- DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5);
- tmp = __lsx_vpickev_b(tmp5, tmp4);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
__lsx_vst(tmp, dst, 48);
- DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7);
- DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7);
- tmp = __lsx_vpickev_b(tmp7, tmp6);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
__lsx_vst(tmp, dst_tmp1, 48);
dst += dst_stride2;
dst_tmp1 += dst_stride2;
@@ -823,8 +778,8 @@ void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
&filt_ver[3], h);
break;
case 16:
- common_vt_2t_16w_lsx(src, src_stride, dst, dst_stride, filter, y0_q4,
- y_step_q4, w, h);
+ common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
break;
case 32:
common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
diff --git a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
index 398788a43..53dc7097e 100644
--- a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
+++ b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
@@ -15,7 +15,6 @@
static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, int32_t height) {
int32_t cnt;
- uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
int32_t src_stride2 = src_stride << 1;
int32_t src_stride3 = src_stride2 + src_stride;
diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h
index d319bc4f7..2428407f2 100644
--- a/vpx_dsp/loongarch/vpx_convolve_lsx.h
+++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h
@@ -125,19 +125,18 @@
tmp1_m; \
})
-#define PCKEV_AVG_ST4_D(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
- { \
- __m128i tmp0_m, tmp1_m; \
- \
- DUP2_ARG2(__lsx_vpickev_b, in1, in0, in3, in2, tmp0_m, tmp1_m); \
- DUP2_ARG2(__lsx_vavgr_bu, tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
- __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \
- pdst += stride; \
- __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \
- pdst += stride; \
- __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \
- pdst += stride; \
- __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \
+#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \
+ { \
+ __m128i tmp0_m, tmp1_m; \
+ \
+ DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
+ __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \
+ pdst += stride; \
+ __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \
+ pdst += stride; \
+ __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \
+ pdst += stride; \
+ __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \
}
#endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_