diff options
-rw-r--r-- | vp8/common/loongarch/idct_lsx.c | 39 | ||||
-rw-r--r-- | vp8/common/loongarch/loopfilter_filters_lsx.c | 16 | ||||
-rw-r--r-- | vp8/common/loongarch/sixtap_filter_lsx.c | 322 | ||||
-rw-r--r-- | vpx_dsp/loongarch/bitdepth_conversion_lsx.h | 43 | ||||
-rw-r--r-- | vpx_dsp/loongarch/fwd_dct32x32_lsx.c | 4 | ||||
-rw-r--r-- | vpx_dsp/loongarch/fwd_txfm_lsx.c | 4 | ||||
-rw-r--r-- | vpx_dsp/loongarch/fwd_txfm_lsx.h | 40 | ||||
-rw-r--r-- | vpx_dsp/loongarch/idct32x32_lsx.c | 4 | ||||
-rw-r--r-- | vpx_dsp/loongarch/loopfilter_16_lsx.c | 8 | ||||
-rw-r--r-- | vpx_dsp/loongarch/loopfilter_lsx.h | 20 | ||||
-rw-r--r-- | vpx_dsp/loongarch/quantize_lsx.c | 192 | ||||
-rw-r--r-- | vpx_dsp/loongarch/sad_lsx.c | 231 | ||||
-rw-r--r-- | vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 102 | ||||
-rw-r--r-- | vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c | 28 | ||||
-rw-r--r-- | vpx_dsp/loongarch/vpx_convolve8_lsx.c | 110 | ||||
-rw-r--r-- | vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c | 44 | ||||
-rw-r--r-- | vpx_dsp/loongarch/vpx_convolve_lsx.h | 100 |
17 files changed, 654 insertions, 653 deletions
diff --git a/vp8/common/loongarch/idct_lsx.c b/vp8/common/loongarch/idct_lsx.c index 679019ff6..eee871eec 100644 --- a/vp8/common/loongarch/idct_lsx.c +++ b/vp8/common/loongarch/idct_lsx.c @@ -16,47 +16,44 @@ static const int32_t cospi8sqrt2minus1 = 20091; static const int32_t sinpi8sqrt2 = 35468; #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ \ DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m); \ DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m); \ DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ - } + } while (0) #define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i s4_m, s5_m, s6_m, s7_m; \ \ TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \ DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2); \ out1 = __lsx_vilvh_d(s6_m, s4_m); \ out3 = __lsx_vilvh_d(s7_m, s5_m); \ - } + } while (0) -#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in) \ - ({ \ - __m128i out_m; \ +#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1) \ + do { \ __m128i zero_m = __lsx_vldi(0); \ __m128i tmp1_m, tmp2_m; \ __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ \ - tmp1_m = __lsx_vilvl_h(in, zero_m); \ - tmp2_m = __lsx_vilvh_h(in, zero_m); \ + tmp1_m = __lsx_vilvl_h(in0, zero_m); \ + tmp2_m = __lsx_vilvh_h(in0, zero_m); \ tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m); \ tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m); \ tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ - out_m = __lsx_vpickev_h(tmp2_m, tmp1_m); \ - \ - out_m; \ - }) + in1 = __lsx_vpickev_h(tmp2_m, tmp1_m); \ + } while (0) #define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i a1_m, b1_m, c1_m, d1_m; \ __m128i c_tmp1_m, c_tmp2_m; \ __m128i d_tmp1_m, d_tmp2_m; \ @@ -65,7 +62,7 @@ static const int32_t sinpi8sqrt2 = 35468; const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \ a1_m = __lsx_vadd_h(in0, in2); \ b1_m = __lsx_vsub_h(in0, in2); \ - c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1); \ + EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m); \ \ c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m); \ c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1); \ @@ -77,13 +74,13 @@ static const int32_t sinpi8sqrt2 = 35468; d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1); \ d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1); \ d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m); \ - d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3); \ + EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m); \ d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m); \ LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ - } + } while (0) #define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i a1_m, b1_m, c1_m, d1_m; \ __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \ @@ -105,13 +102,13 @@ static const int32_t sinpi8sqrt2 = 35468; d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16); \ d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m); \ LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ - } + } while (0) #define UNPCK_SH_SW(in, out0, out1) \ - { \ + do { \ out0 = __lsx_vsllwil_w_h(in, 0); \ out1 = __lsx_vexth_w_h(in); \ - } + } while (0) static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred, int32_t pred_stride, uint8_t *dest, diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c index a3ac76d25..f743ec0c5 100644 --- a/vp8/common/loongarch/loopfilter_filters_lsx.c +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -14,7 +14,7 @@ #include "vpx_util/loongson_intrinsics.h" #define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ - { \ + do { \ __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ const __m128i cnst4b = __lsx_vldi(4); \ const __m128i cnst3b = __lsx_vldi(3); \ @@ -46,10 +46,10 @@ q1 = __lsx_vxori_b(q1_m, 0x80); \ p1_m = __lsx_vsadd_b(p1_m, filt); \ p1 = __lsx_vxori_b(p1_m, 0x80); \ - } + } while (0) #define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ - { \ + do { \ __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ __m128i u, filt, t1, t2, filt_sign, q0_sub_p0; \ __m128i filt_r, filt_l; \ @@ -113,12 +113,12 @@ p0_m = __lsx_vsadd_b(p0_m, u); \ q0 = __lsx_vxori_b(q0_m, 0x80); \ p0 = __lsx_vxori_b(p0_m, 0x80); \ - } + } while (0) #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ flat_out) \ - { \ + do { \ __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ \ @@ -143,13 +143,13 @@ mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ mask_out = __lsx_vslt_bu(limit_in, mask_out); \ mask_out = __lsx_vxori_b(mask_out, 0xff); \ - } + } while (0) #define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \ - { \ + do { \ __lsx_vstelm_w(in0, pdst, 0, in0_idx); \ __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \ - } + } while (0) static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, const uint8_t *b_limit0_ptr, diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c index a23ed16d2..cd7ba5474 100644 --- a/vp8/common/loongarch/sixtap_filter_lsx.c +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -33,37 +33,61 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 }; -#define DPADD_H3(in0, in1, in2, coeff0, coeff1, coeff2) \ - ({ \ - __m128i out0_m; \ - \ - out0_m = __lsx_vdp2_h_b(in0, coeff0); \ - out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); \ - out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); \ - \ - out0_m; \ - }) - -#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ - filt_h2) \ - ({ \ - __m128i vec0_m, vec1_m, vec2_m; \ - __m128i hz_out_m; \ - \ - DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ - vec1_m); \ - vec2_m = __lsx_vshuf_b(src1, src0, mask2); \ - hz_out_m = DPADD_H3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ - \ - hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ - \ - hz_out_m; \ - }) +static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2, + __m128i coeff0, __m128i coeff1, __m128i coeff2) { + __m128i out0_m; + + out0_m = __lsx_vdp2_h_b(in0, coeff0); + out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); + out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); + + return out0_m; +} + +static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i mask2, + __m128i filt_h0, __m128i filt_h1, + __m128i filt_h2) { + __m128i vec0_m, vec1_m, vec2_m; + __m128i hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + vec2_m = __lsx_vshuf_b(src1, src0, mask2); + hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} + +static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1, + __m128i filt0, __m128i filt1) { + __m128i tmp_m; + + tmp_m = __lsx_vdp2_h_b(vec0, filt0); + tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1); + + return tmp_m; +} + +static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i filt_h0, + __m128i filt_h1) { + __m128i vec0_m, vec1_m, hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1) \ - { \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ @@ -77,12 +101,12 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { vec5_m); \ DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ out0, out1); \ - } + } while (0) #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1, \ out2, out3) \ - ({ \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ @@ -105,35 +129,11 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \ out3); \ - }) - -#define FILT_4TAP_DPADD_H(vec0, vec1, filt0, filt1) \ - ({ \ - __m128i tmp0; \ - \ - tmp0 = __lsx_vdp2_h_b(vec0, filt0); \ - tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1); \ - \ - tmp0; \ - }) - -#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ - ({ \ - __m128i vec0_m, vec1_m; \ - __m128i hz_out_m; \ - \ - DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ - vec1_m); \ - hz_out_m = FILT_4TAP_DPADD_H(vec0_m, vec1_m, filt_h0, filt_h1); \ - hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ - \ - hz_out_m; \ - }) + } while (0) #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ filt0, filt1, out0, out1) \ - { \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ @@ -143,11 +143,11 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { vec3_m); \ DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ out0, out1); \ - } + } while (0) #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ filt0, filt1, out0, out1, out2, out3) \ - ({ \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ @@ -163,7 +163,7 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ out3); \ - }) + } while (0) static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, @@ -424,8 +424,8 @@ static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554, src8776); DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776); - out0 = DPADD_H3(src2110, src4332, src6554, filt0, filt1, filt2); - out1 = DPADD_H3(src4332, src6554, src8776, filt0, filt1, filt2); + out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2); + out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2); out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); out0 = __lsx_vxori_b(out0, 128); @@ -487,10 +487,10 @@ static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); - out0_r = DPADD_H3(src10_r, src32_r, src76_r, filt0, filt1, filt2); - out1_r = DPADD_H3(src21_r, src43_r, src87_r, filt0, filt1, filt2); - out2_r = DPADD_H3(src32_r, src76_r, src98_r, filt0, filt1, filt2); - out3_r = DPADD_H3(src43_r, src87_r, src109_r, filt0, filt1, filt2); + out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2); DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -555,14 +555,14 @@ static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src54_r, src65_r, src76_r, src87_r); DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, src76_l, src87_l); - out0_r = DPADD_H3(src10_r, src32_r, src54_r, filt0, filt1, filt2); - out1_r = DPADD_H3(src21_r, src43_r, src65_r, filt0, filt1, filt2); - out2_r = DPADD_H3(src32_r, src54_r, src76_r, filt0, filt1, filt2); - out3_r = DPADD_H3(src43_r, src65_r, src87_r, filt0, filt1, filt2); - out0_l = DPADD_H3(src10_l, src32_l, src54_l, filt0, filt1, filt2); - out1_l = DPADD_H3(src21_l, src43_l, src65_l, filt0, filt1, filt2); - out2_l = DPADD_H3(src32_l, src54_l, src76_l, filt0, filt1, filt2); - out3_l = DPADD_H3(src43_l, src65_l, src87_l, filt0, filt1, filt2); + out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2); DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); @@ -621,12 +621,12 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); - hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); @@ -636,7 +636,7 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src += src_stride_x2; DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); - hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); @@ -645,15 +645,15 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src += src_stride_x2; DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8); - hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, + hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); out3 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); tmp0 = __lsx_vxori_b(tmp0, 128); @@ -710,15 +710,15 @@ static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); filt = __lsx_vld(filter_vert, 0); DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); @@ -734,25 +734,25 @@ static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, src6, src7, src8); - hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); - hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out5 = __lsx_vpackev_b(hz_out6, hz_out5); - tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); - hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, + hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out7 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp2 = DPADD_H3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); + tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); - hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, + hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out6 = __lsx_vpackev_b(hz_out8, hz_out7); - tmp3 = DPADD_H3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); + tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2, VP8_FILTER_SHIFT, vec0, vec1); @@ -997,14 +997,14 @@ static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); src4332 = __lsx_vilvl_d(src43_r, src32_r); src4332 = __lsx_vxori_b(src4332, 128); - out0 = FILT_4TAP_DPADD_H(src2110, src4332, filt0, filt1); + out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1); src2 = __lsx_vld(src, 0); src += src_stride; DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r); src2110 = __lsx_vilvl_d(src65_r, src54_r); src2110 = __lsx_vxori_b(src2110, 128); - out1 = FILT_4TAP_DPADD_H(src4332, src2110, filt0, filt1); + out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1); out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); out0 = __lsx_vxori_b(out0, 128); @@ -1055,10 +1055,10 @@ static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src8, src9, src10); DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9, src72_r, src87_r, src98_r, src109_r); - out0_r = FILT_4TAP_DPADD_H(src10_r, src72_r, filt0, filt1); - out1_r = FILT_4TAP_DPADD_H(src21_r, src87_r, filt0, filt1); - out2_r = FILT_4TAP_DPADD_H(src72_r, src98_r, filt0, filt1); - out3_r = FILT_4TAP_DPADD_H(src87_r, src109_r, filt0, filt1); + out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1); DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -1114,14 +1114,14 @@ static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src32_r, src43_r, src54_r, src65_r); DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5, src32_l, src43_l, src54_l, src65_l); - out0_r = FILT_4TAP_DPADD_H(src10_r, src32_r, filt0, filt1); - out1_r = FILT_4TAP_DPADD_H(src21_r, src43_r, filt0, filt1); - out2_r = FILT_4TAP_DPADD_H(src32_r, src54_r, filt0, filt1); - out3_r = FILT_4TAP_DPADD_H(src43_r, src65_r, filt0, filt1); - out0_l = FILT_4TAP_DPADD_H(src10_l, src32_l, filt0, filt1); - out1_l = FILT_4TAP_DPADD_H(src21_l, src43_l, filt0, filt1); - out2_l = FILT_4TAP_DPADD_H(src32_l, src54_l, filt0, filt1); - out3_l = FILT_4TAP_DPADD_H(src43_l, src65_l, filt0, filt1); + out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1); + out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1); + out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1); + out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1); + out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1); DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); @@ -1168,8 +1168,8 @@ static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, @@ -1182,16 +1182,16 @@ static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src += src_stride_x4; DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4); - hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); - hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); vec2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); tmp0 = __lsx_vxori_b(tmp0, 128); @@ -1239,9 +1239,9 @@ static inline void common_hv_4ht_4vt_8w_lsx( DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); - hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); filt = __lsx_vld(filter_vert, 0); @@ -1254,21 +1254,21 @@ static inline void common_hv_4ht_4vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, src4, src5, src6); - hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); - hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); vec3 = __lsx_vpackev_b(hz_out0, hz_out3); - tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); - hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); vec4 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp2 = FILT_4TAP_DPADD_H(vec1, vec4, filt_vt0, filt_vt1); + tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1); - hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1); - tmp3 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -1324,9 +1324,9 @@ static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); @@ -1341,17 +1341,17 @@ static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, src4, src5, src6); - hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); - hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); vec2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -1402,11 +1402,11 @@ static inline void common_hv_6ht_4vt_8w_lsx( DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); @@ -1420,25 +1420,25 @@ static inline void common_hv_6ht_4vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, src4, src5, src6); - hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); - hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, + hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec3 = __lsx_vpackev_b(hz_out0, hz_out3); - tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); - hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp2 = FILT_4TAP_DPADD_H(vec1, vec0, filt_vt0, filt_vt1); + tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1); - hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2); - tmp3 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -1492,9 +1492,9 @@ static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); - hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); @@ -1510,15 +1510,15 @@ static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src6, src7, src8); src += src_stride_x4; - hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); - hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1); hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); out3 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); tmp0 = __lsx_vxori_b(tmp0, 128); @@ -1571,11 +1571,11 @@ static inline void common_hv_4ht_6vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); - hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); - hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); - hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4); @@ -1590,21 +1590,21 @@ static inline void common_hv_4ht_6vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, src6, src7, src8); - hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); - hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); out5 = __lsx_vpackev_b(hz_out6, hz_out5); - tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); - hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); + hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1); out6 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp2 = DPADD_H3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); + tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); - hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1); out7 = __lsx_vpackev_b(hz_out8, hz_out7); - tmp3 = DPADD_H3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); + tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1); DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); __lsx_vstelm_d(vec0, dst, 0, 0); diff --git a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h index 4834f18fc..b0db1e99c 100644 --- a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h +++ b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h @@ -16,33 +16,26 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_util/loongson_intrinsics.h" +static INLINE __m128i load_tran_low(const tran_low_t *s) { #if CONFIG_VP9_HIGHBITDEPTH -#define load_tran_low(s) \ - ({ \ - __m128i res0_m; \ - __m128i v0_m = __lsx_vld(s, 0); \ - __m128i v1_m = __lsx_vld(s + 4, 0); \ - res0_m = __lsx_vsrlni_h_w(v0_m, v1_m, 0); \ - res0_m; \ - }) - -#define store_tran_low(v, s, c) \ - { \ - __m128i v0_m, v1_m; \ - v1_m = __lsx_vexth_w_h(v); \ - v0_m = __lsx_vsllwil_w_h(v, 0); \ - __lsx_vst(v0_m, s + c, 0); \ - __lsx_vst(v1_m, s + c + 4, 0); \ - } + __m128i v0_m = __lsx_vld(s, 0); + __m128i v1_m = __lsx_vld(s + 4, 0); + return __lsx_vsrlni_h_w(v0_m, v1_m, 0); #else -#define load_tran_low(s) \ - ({ \ - __m128i res0_m; \ - res0_m = __lsx_vld(s, 0); \ - res0_m; \ - }) + return __lsx_vld(s, 0); +#endif +} -#define store_tran_low(v, s, c) __lsx_vst(v, s + c, 0) -#endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) { +#if CONFIG_VP9_HIGHBITDEPTH + __m128i v0_m, v1_m; + v1_m = __lsx_vexth_w_h(v); + v0_m = __lsx_vsllwil_w_h(v, 0); + __lsx_vst(v0_m, s + c, 0); + __lsx_vst(v1_m, s + c + 4, 0); +#else + __lsx_vst(v, s + c, 0); +#endif +} #endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ diff --git a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c index e5c301b2c..9bb387721 100644 --- a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c +++ b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c @@ -13,10 +13,10 @@ #include "vpx_dsp/fwd_txfm.h" #define UNPCK_SH_SW(in, out0, out1) \ - { \ + do { \ out0 = __lsx_vsllwil_w_h(in, 0); \ out1 = __lsx_vexth_w_h(in); \ - } + } while (0) static void fdct8x32_1d_column_load_butterfly(const int16_t *input, int32_t src_stride, diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c index 6f2d4d6fe..508532b9d 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.c +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c @@ -12,7 +12,7 @@ #include "vpx_dsp/loongarch/fwd_txfm_lsx.h" #define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ + do { \ __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ \ DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ @@ -23,7 +23,7 @@ _t3 = __lsx_vilvh_h(_s3, _s2); \ DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ - } + } while (0) #if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index d04427a6e..4a9fce9a3 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -15,7 +15,7 @@ #include "vpx_dsp/txfm_common.h" #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ __m128i vec4_m, vec5_m, vec6_m, vec7_m; \ @@ -38,11 +38,11 @@ DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \ vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \ vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \ - } + } while (0) #define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ out3, out4, out5, out6, out7) \ - { \ + do { \ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ @@ -97,10 +97,10 @@ x3_m = __lsx_vneg_h(x3_m); \ x2_m = __lsx_vpackev_h(x2_m, x3_m); \ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ - } + } while (0) #define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ - { \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ \ DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \ @@ -111,10 +111,10 @@ in3, in0, in1, in2, in3); \ DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \ in7, in4, in5, in6, in7); \ - } + } while (0) #define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ - { \ + do { \ __m128i tp0_m, tp1_m; \ __m128i one = __lsx_vreplgr2vr_h(1); \ \ @@ -130,10 +130,10 @@ vec1 = __lsx_vadd_h(vec1, tp1_m); \ vec0 = __lsx_vsrai_h(vec0, 2); \ vec1 = __lsx_vsrai_h(vec1, 2); \ - } + } while (0) #define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ - { \ + do { \ __m128i tp0_m, tp1_m; \ __m128i one_m = __lsx_vldi(0x401); \ \ @@ -147,10 +147,10 @@ vec1 = __lsx_vadd_h(vec1, tp1_m); \ vec0 = __lsx_vsrai_h(vec0, 2); \ vec1 = __lsx_vsrai_h(vec1, 2); \ - } + } while (0) #define FDCT32_POSTPROC_NEG_W(vec) \ - { \ + do { \ __m128i temp_m; \ __m128i one_m = __lsx_vreplgr2vr_w(1); \ \ @@ -159,11 +159,11 @@ temp_m = __lsx_vand_v(one_m, temp_m); \ vec = __lsx_vadd_w(vec, temp_m); \ vec = __lsx_vsrai_w(vec, 2); \ - } + } while (0) #define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ const0, const1, out0, out1, out2, out3) \ - { \ + do { \ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \ __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \ @@ -188,11 +188,11 @@ DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \ DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ DCT_CONST_BITS, out2, out3); \ - } + } while (0) #define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \ in3) \ - { \ + do { \ __m128i dst0_m, dst1_m, dst2_m, dst3_m; \ __m128i tmp0_m, tmp1_m; \ __m128i res0_m, res1_m, res2_m, res3_m; \ @@ -210,11 +210,11 @@ __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \ __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \ __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ - } + } while (0) #define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3, out4, out5, out6, out7) \ - { \ + do { \ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ __m128i x0_m, x1_m, x2_m, x3_m; \ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ @@ -270,12 +270,12 @@ x3_m = __lsx_vneg_h(x3_m); \ x2_m = __lsx_vpackev_h(x2_m, x3_m); \ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ - } + } while (0) #define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ input7, out1, out3, out5, out7, out9, out11, out13, \ out15) \ - { \ + do { \ __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ __m128i stp36_m, stp37_m, vec0_m, vec1_m; \ @@ -373,7 +373,7 @@ cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \ - } + } while (0) void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride); diff --git a/vpx_dsp/loongarch/idct32x32_lsx.c b/vpx_dsp/loongarch/idct32x32_lsx.c index d6890c28e..ec07f57d9 100644 --- a/vpx_dsp/loongarch/idct32x32_lsx.c +++ b/vpx_dsp/loongarch/idct32x32_lsx.c @@ -12,10 +12,10 @@ #include "vpx_dsp/loongarch/fwd_txfm_lsx.h" #define UNPCK_UB_SH(_in, _out0, _out1) \ - { \ + do { \ _out0 = __lsx_vsllwil_hu_bu(_in, 0); \ _out1 = __lsx_vexth_hu_bu(_in); \ - } + } while (0) static void idct32x8_row_transpose_store(const int16_t *input, int16_t *tmp_buf) { diff --git a/vpx_dsp/loongarch/loopfilter_16_lsx.c b/vpx_dsp/loongarch/loopfilter_16_lsx.c index cbaefcd6e..539817777 100644 --- a/vpx_dsp/loongarch/loopfilter_16_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -15,7 +15,7 @@ #define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \ _in2, _in3, _in4, _in5, _in6, _in7) \ - { \ + do { \ _in0 = __lsx_vld(_src, 0); \ _in1 = __lsx_vldx(_src, _stride); \ _in2 = __lsx_vldx(_src, _stride2); \ @@ -25,11 +25,11 @@ _in5 = __lsx_vldx(_src, _stride); \ _in6 = __lsx_vldx(_src, _stride2); \ _in7 = __lsx_vldx(_src, _stride3); \ - } + } while (0) #define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \ _stride, _stride2, _stride3, _stride4) \ - { \ + do { \ __lsx_vst(_dst0, _dst, 0); \ __lsx_vstx(_dst1, _dst, _stride); \ __lsx_vstx(_dst2, _dst, _stride2); \ @@ -39,7 +39,7 @@ __lsx_vstx(_dst5, _dst, _stride); \ __lsx_vstx(_dst6, _dst, _stride2); \ __lsx_vstx(_dst7, _dst, _stride3); \ - } + } while (0) static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, uint8_t *filter48, diff --git a/vpx_dsp/loongarch/loopfilter_lsx.h b/vpx_dsp/loongarch/loopfilter_lsx.h index 53e15fe6d..1c4383650 100644 --- a/vpx_dsp/loongarch/loopfilter_lsx.h +++ b/vpx_dsp/loongarch/loopfilter_lsx.h @@ -16,7 +16,7 @@ #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ flat_out) \ - { \ + do { \ __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ \ @@ -47,10 +47,10 @@ \ mask_out = __lsx_vslt_bu(limit_in, mask_out); \ mask_out = __lsx_vxori_b(mask_out, 0xff); \ - } + } while (0) #define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ + do { \ __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \ __m128i flat4_tmp = __lsx_vldi(1); \ \ @@ -64,11 +64,11 @@ flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \ flat_out = __lsx_vxori_b(flat_out, 0xff); \ flat_out = flat_out & (mask); \ - } + } while (0) #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ q6_in, q7_in, flat_in, flat2_out) \ - { \ + do { \ __m128i flat5_tmp = __lsx_vldi(1); \ __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \ __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \ @@ -87,11 +87,11 @@ flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \ flat2_out = __lsx_vxori_b(flat2_out, 0xff); \ flat2_out = flat2_out & flat_in; \ - } + } while (0) #define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ p0_out, q0_out, q1_out) \ - { \ + do { \ __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ const __m128i cnst4b = __lsx_vldi(4); \ const __m128i cnst3b = __lsx_vldi(3); \ @@ -118,12 +118,12 @@ q1_m = __lsx_vssub_b(q1_m, filt); \ p1_m = __lsx_vsadd_b(p1_m, filt); \ DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \ - } + } while (0) #define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ q1_filt8_out, q2_filt8_out) \ - { \ + do { \ __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ \ tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \ @@ -162,6 +162,6 @@ tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ - } + } while (0) #endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index e3fbb9e9e..2fc33b06b 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -12,79 +12,83 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" -#define CALCULATE_QCOEFF(coeff, coeff_abs, round, quant, shift, cmp_mask) \ - ({ \ - __m128i rounded, qcoeff; \ - \ - rounded = __lsx_vsadd_h(coeff_abs, round); \ - qcoeff = __lsx_vmuh_h(rounded, quant); \ - qcoeff = __lsx_vadd_h(rounded, qcoeff); \ - qcoeff = __lsx_vmuh_h(qcoeff, shift); \ - qcoeff = __lsx_vsigncov_h(coeff, qcoeff); \ - qcoeff = __lsx_vand_v(qcoeff, cmp_mask); \ - \ - qcoeff; \ - }) - -#define CALCULATE_DQCOEFF_AND_STORE(qcoeff, dequant, dqcoeff) \ - { \ - __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); \ - __lsx_vst(dqcoeff16, dqcoeff, 0); \ - } +static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, + __m128i round, __m128i quant, + __m128i shift, __m128i cmp_mask) { + __m128i rounded, qcoeff; + + rounded = __lsx_vsadd_h(coeff_abs, round); + qcoeff = __lsx_vmuh_h(rounded, quant); + qcoeff = __lsx_vadd_h(rounded, qcoeff); + qcoeff = __lsx_vmuh_h(qcoeff, shift); + qcoeff = __lsx_vsigncov_h(coeff, qcoeff); + qcoeff = __lsx_vand_v(qcoeff, cmp_mask); + + return qcoeff; +} -#define CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff, dequant, dqcoeff) \ - { \ - __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; \ - __m128i zero = __lsx_vldi(0); \ - __m128i coeff = __lsx_vabsd_h(qcoeff, zero); \ - \ - __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); \ - __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); \ - \ - low = __lsx_vmul_h(coeff, dequant); \ - high = __lsx_vmuh_h(coeff, dequant); \ - dqcoeff32_0 = __lsx_vilvl_h(high, low); \ - dqcoeff32_1 = __lsx_vilvh_h(high, low); \ - \ - dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); \ - dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); \ - dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); \ - dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); \ - res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); \ - __lsx_vst(res, dqcoeff, 0); \ - } +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + int16_t *dqcoeff) { + __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); + __lsx_vst(dqcoeff16, dqcoeff, 0); +} + +static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, + __m128i dequant, + int16_t *dqcoeff) { + // Un-sign to bias rounding like C. + __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; + __m128i zero = __lsx_vldi(0); + __m128i coeff = __lsx_vabsd_h(qcoeff, zero); + + const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); + const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); + + low = __lsx_vmul_h(coeff, dequant); + high = __lsx_vmuh_h(coeff, dequant); + dqcoeff32_0 = __lsx_vilvl_h(high, low); + dqcoeff32_1 = __lsx_vilvh_h(high, low); + + // "Divide" by 2. + dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); + dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); + dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); + dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); + res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); + __lsx_vst(res, dqcoeff, 0); +} + +static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, + __m128i zbin_mask0, __m128i zbin_mask1, + const int16_t *scan, int index, + __m128i zero) { + const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); + const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); + __m128i scan0 = __lsx_vld(scan + index, 0); + __m128i scan1 = __lsx_vld(scan + index + 8, 0); + __m128i eob0, eob1; + + scan0 = __lsx_vsub_h(scan0, zbin_mask0); + scan1 = __lsx_vsub_h(scan1, zbin_mask1); + eob0 = __lsx_vandn_v(zero_coeff0, scan0); + eob1 = __lsx_vandn_v(zero_coeff1, scan1); + return __lsx_vmax_h(eob0, eob1); +} -#define SCAN_FOR_EOB(coeff0, coeff1, zbin_mask0, zbin_mask1, scan, index, \ - zero) \ - ({ \ - __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); \ - __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); \ - __m128i scan0 = __lsx_vld(scan + index, 0); \ - __m128i scan1 = __lsx_vld(scan + index + 8, 0); \ - __m128i eob0, eob1, eob_max; \ - \ - scan0 = __lsx_vsub_h(scan0, zbin_mask0); \ - scan1 = __lsx_vsub_h(scan1, zbin_mask1); \ - eob0 = __lsx_vandn_v(zero_coeff0, scan0); \ - eob1 = __lsx_vandn_v(zero_coeff1, scan1); \ - eob_max = __lsx_vmax_h(eob0, eob1); \ - eob_max; \ - }) - -#define ACCUMULATE_EOB(eob) \ - ({ \ - __m128i eob_shuffled; \ - int16_t res_m; \ - \ - eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); \ - eob = __lsx_vmax_h(eob, eob_shuffled); \ - eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); \ - eob = __lsx_vmax_h(eob, eob_shuffled); \ - eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); \ - eob = __lsx_vmax_h(eob, eob_shuffled); \ - res_m = __lsx_vpickve2gr_h(eob, 1); \ - res_m; \ - }) +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + int16_t res_m; + + eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); + eob = __lsx_vmax_h(eob, eob_shuffled); + res_m = __lsx_vpickve2gr_h(eob, 1); + + return res_m; +} #if !CONFIG_VP9_HIGHBITDEPTH void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, @@ -120,21 +124,21 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); round = __lsx_vilvh_d(round, round); quant = __lsx_vilvh_d(quant, quant); quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr, 0); __lsx_vst(qcoeff1, qcoeff_ptr, 16); - CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); - CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + 8); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -147,24 +151,24 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); - CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr + index); - CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + index + 8); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); index += 16; } - *eob_ptr = ACCUMULATE_EOB(eob); + *eob_ptr = accumulate_eob(eob); } void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, @@ -204,20 +208,20 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); // remove DC in quant_shift, quant, quant_shift round = __lsx_vilvh_d(round, round); quant = __lsx_vilvh_d(quant, quant); quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr, 0); __lsx_vst(qcoeff1, qcoeff_ptr, 16); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); + eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. for (index = 16; index < 32 * 32; index += 16) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -230,20 +234,20 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr + index); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8 + index); - eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); } - *eob_ptr = ACCUMULATE_EOB(eob); + *eob_ptr = accumulate_eob(eob); } -#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 5eaebfb51..b6fbedb0d 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -8,59 +8,63 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" -#define SAD_UB2_UH(in0, in1, ref0, ref1) \ - ({ \ - __m128i diff0_m, diff1_m, sad_m0; \ - __m128i sad_m = __lsx_vldi(0); \ - \ - diff0_m = __lsx_vabsd_bu(in0, ref0); \ - diff1_m = __lsx_vabsd_bu(in1, ref1); \ - \ - sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); \ - sad_m = __lsx_vadd_h(sad_m, sad_m0); \ - sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); \ - sad_m = __lsx_vadd_h(sad_m, sad_m0); \ - \ - sad_m; \ - }) - -#define HADD_UW_U32(in) \ - ({ \ - __m128i res0_m; \ - uint32_t sum_m; \ - res0_m = __lsx_vhaddw_du_wu(in, in); \ - res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); \ - sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ - sum_m; \ - }) - -#define HADD_UH_U32(in) \ - ({ \ - __m128i res_m; \ - uint32_t sum_m; \ - res_m = __lsx_vhaddw_wu_hu(in, in); \ - sum_m = HADD_UW_U32(res_m); \ - sum_m; \ - }) - -#define HADD_SW_S32(in) \ - ({ \ - __m128i res0_m; \ - int32_t sum_m; \ - \ - res0_m = __lsx_vhaddw_d_w(in, in); \ - res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ - sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ - sum_m; \ - }) +static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0, + __m128i ref1) { + __m128i diff0_m, diff1_m, sad_m0; + __m128i sad_m = __lsx_vldi(0); + + diff0_m = __lsx_vabsd_bu(in0, ref0); + diff1_m = __lsx_vabsd_bu(in1, ref1); + + sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + + return sad_m; +} + +static INLINE uint32_t hadd_uw_u32(__m128i in) { + __m128i res0_m; + uint32_t sum_m; + + res0_m = __lsx_vhaddw_du_wu(in, in); + res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} + +static INLINE uint32_t hadd_uh_u32(__m128i in) { + __m128i res_m; + uint32_t sum_m; + + res_m = __lsx_vhaddw_wu_hu(in, in); + sum_m = hadd_uw_u32(res_m); + + return sum_m; +} + +static INLINE int32_t hadd_sw_s32(__m128i in) { + __m128i res0_m; + int32_t sum_m; + + res0_m = __lsx_vhaddw_d_w(in, in); + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { int32_t ht_cnt; + uint32_t res; __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp; __m128i sad = __lsx_vldi(0); @@ -79,16 +83,18 @@ static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, ref += ref_stride; DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); } - return HADD_UH_U32(sad); + res = hadd_uh_u32(sad); + return res; } static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { int32_t ht_cnt = (height >> 2); + uint32_t res; __m128i src0, src1, ref0, ref1, sad_tmp; __m128i sad = __lsx_vldi(0); int32_t src_stride2 = src_stride << 1; @@ -99,23 +105,26 @@ static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); src += src_stride2; ref += ref_stride2; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); src += src_stride2; ref += ref_stride2; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); } - return HADD_UH_U32(sad); + + res = hadd_uh_u32(sad); + return res; } static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { int32_t ht_cnt = (height >> 2); + uint32_t res; __m128i src0, src1, ref0, ref1; __m128i sad_tmp; __m128i sad = __lsx_vldi(0); @@ -125,31 +134,32 @@ static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); } - return HADD_UH_U32(sad); + res = hadd_uh_u32(sad); + return res; } static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, @@ -170,9 +180,9 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, ref3); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -181,14 +191,14 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, ref3); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad1 = __lsx_vadd_h(sad1, sad_tmp); } - sad = HADD_UH_U32(sad0); - sad += HADD_UH_U32(sad1); + sad = hadd_uh_u32(sad0); + sad += hadd_uh_u32(sad1); return sad; } @@ -247,25 +257,25 @@ static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1); DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad2 = __lsx_vadd_h(sad2, sad_tmp); DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad3 = __lsx_vadd_h(sad3, sad_tmp); } - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); } static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, @@ -334,10 +344,10 @@ static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); sad3 = __lsx_vadd_h(sad3, sad_tmp); } - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); } static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, @@ -363,28 +373,28 @@ static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1); ref0_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1); ref1_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1); ref2_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad2 = __lsx_vadd_h(sad2, sad_tmp); DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1); ref3_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad3 = __lsx_vadd_h(sad3, sad_tmp); } - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); } static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, @@ -419,60 +429,60 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48, ref0, ref1, ref2, ref3); ref0_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp); DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48, ref0, ref1, ref2, ref3); ref1_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp); DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48, ref0, ref1, ref2, ref3); ref2_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp); DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48, ref0, ref1, ref2, ref3); ref3_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp); } sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0); sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[0] = HADD_UW_U32(sad); + sad_array[0] = hadd_uw_u32(sad); sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0); sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[1] = HADD_UW_U32(sad); + sad_array[1] = hadd_uw_u32(sad); sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0); sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[2] = HADD_UW_U32(sad); + sad_array[2] = hadd_uw_u32(sad); sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0); sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[3] = HADD_UW_U32(sad); + sad_array[3] = hadd_uw_u32(sad); } static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; @@ -514,26 +524,26 @@ static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 128; DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1); - sad_tmp = SAD_UB2_UH(src2, src3, comp0, comp1); + sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1); - sad_tmp = SAD_UB2_UH(src4, src5, comp0, comp1); + sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1); - sad_tmp = SAD_UB2_UH(src6, src7, comp0, comp1); + sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); } - - return HADD_UH_U32(sad); + res = hadd_uh_u32(sad); + return res; } static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3; __m128i sad, sad_tmp; @@ -552,9 +562,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -568,9 +578,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -584,9 +594,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -600,16 +610,17 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); } sad = __lsx_vhaddw_wu_hu(sad0, sad0); sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1); sad = __lsx_vadd_w(sad, sad_tmp); - return HADD_SW_S32(sad); + res = hadd_sw_s32(sad); + return res; } #define VPX_SAD_8xHT_LSX(height) \ diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c index 54fcd6c57..d1abf622a 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -57,13 +57,13 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -87,17 +87,17 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( src2 = __lsx_vilvl_d(src3, src2); DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); tmp4 = __lsx_vpackev_b(tmp3, tmp4); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vshuf_b(src1, tmp3, shuff); src0 = __lsx_vpackev_b(src1, src0); - out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS); out0 = __lsx_vxori_b(out0, 128); @@ -152,19 +152,19 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -181,25 +181,25 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp3 = __lsx_vpackev_b(src7, src6); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vpackev_b(src8, src7); - out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src1 = __lsx_vpackev_b(src9, src8); - src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src2 = __lsx_vpackev_b(src10, src9); - src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, filt_vt2, filt_vt3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3, FILTER_BITS, out0, out1); @@ -296,9 +296,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); @@ -348,11 +348,11 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( src, src_stride4, src5, src6, src7, src8); src += src_stride4; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, hz_out1, hz_out3); hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); @@ -449,20 +449,20 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( dst_tmp += dst_stride; dst3 = __lsx_vldrepl_d(dst_tmp, 0); DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec1 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec2 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, @@ -494,7 +494,7 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( src0 = __lsx_vld(src, 0); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); for (; loop_cnt--;) { src1 = __lsx_vld(src, 0); @@ -502,19 +502,19 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( src4 = __lsx_vldx(src, src_stride3); src += src_stride4; - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, @@ -571,8 +571,8 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); for (; loop_cnt--;) { src0 = __lsx_vld(src, 0); @@ -588,32 +588,32 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); dst3 = __lsx_vldx(dst, dst_stride3); - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst0); __lsx_vst(tmp3, dst, 0); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst1); __lsx_vstx(tmp3, dst, dst_stride); - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst2); __lsx_vstx(tmp3, dst, dst_stride2); - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c index 584f24183..5c6413df4 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c @@ -68,9 +68,9 @@ static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src, tmp0, tmp1, tmp2, tmp3); DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, filter2, filter3); out0 = __lsx_vssrarni_b_h(out1, out0, 7); out0 = __lsx_vxori_b(out0, 128); @@ -146,13 +146,13 @@ static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src, src8, src9, src10); DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, tmp0, tmp1, tmp2, tmp3); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, filter2, filter3); - out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, filter2, filter3); - out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -231,13 +231,13 @@ static void common_vt_8t_and_aver_dst_16w_mult_lsx( src0, src1, src2, src3); DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, src4, src5, src7, src8); - tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -246,13 +246,13 @@ static void common_vt_8t_and_aver_dst_16w_mult_lsx( DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); __lsx_vst(tmp0, dst_reg, 0); __lsx_vstx(tmp1, dst_reg, dst_stride); - tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c index 73583abb9..9f5cd6cfe 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -54,13 +54,13 @@ static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -73,17 +73,17 @@ static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, src += src_stride; DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); tmp4 = __lsx_vpackev_b(tmp3, tmp4); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vshuf_b(src1, tmp3, shuff); src0 = __lsx_vpackev_b(src1, src0); - out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); out0 = __lsx_vssrarni_b_h(out1, out0, 7); out0 = __lsx_vxori_b(out0, 128); @@ -135,19 +135,19 @@ static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -161,25 +161,25 @@ static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, src += src_stride; DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp3 = __lsx_vpackev_b(src7, src6); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vpackev_b(src8, src7); - out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src1 = __lsx_vpackev_b(src9, src8); - src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src2 = __lsx_vpackev_b(src10, src9); - src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, filt_vt2, filt_vt3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -267,9 +267,9 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, src0 = __lsx_vld(src, 0); DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); @@ -316,11 +316,11 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, src, src_stride4, src5, src6, src7, src8); src += src_stride4; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, hz_out1, hz_out3); @@ -382,20 +382,20 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec1 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec2 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); @@ -430,7 +430,7 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, src0 = __lsx_vld(src, 0); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); for (; loop_cnt--;) { src1 = __lsx_vld(src, 0); @@ -438,19 +438,19 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, src4 = __lsx_vldx(src, src_stride3); src += src_stride4; - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); src1 = __lsx_vld(src, 0); DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); src4 = __lsx_vldx(src, src_stride3); @@ -470,19 +470,19 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, __lsx_vstelm_d(tmp2, dst, 0, 1); dst += dst_stride; - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); @@ -534,8 +534,8 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); for (; loop_cnt--;) { uint8_t *src_tmp0 = src + 8; @@ -546,32 +546,32 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7); src += src_stride4; - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c index 7e3a95b2f..6022e43c8 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -52,9 +52,9 @@ static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride, tmp0, tmp1, tmp2, tmp3); DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, filter2, filter3); out0 = __lsx_vssrarni_b_h(out1, out0, 7); out0 = __lsx_vxori_b(out0, 128); @@ -116,13 +116,13 @@ static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride, src8, src9, src10); DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, tmp0, tmp1, tmp2, tmp3); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, filter2, filter3); - out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, filter2, filter3); - out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -192,13 +192,13 @@ static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, src0, src1, src2, src3); DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, src4, src5, src7, src8); - tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -206,13 +206,13 @@ static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, dst += dst_stride; __lsx_vst(tmp1, dst, 0); dst += dst_stride; - tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -298,25 +298,25 @@ static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride, src0, src1, src2, src3); DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, src4, src5, src7, src8); - tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); __lsx_vst(tmp0, dst_tmp, 0); __lsx_vstx(tmp1, dst_tmp, dst_stride); - tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index 2428407f2..d886b0019 100644 --- a/vpx_dsp/loongarch/vpx_convolve_lsx.h +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -11,11 +11,50 @@ #ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ #define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ -#include "vpx_util/loongson_intrinsics.h" +#include "./vpx_config.h" #include "vpx_dsp/vpx_filter.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1, + __m128i _reg2, __m128i _reg3, + __m128i _filter0, __m128i _filter1, + __m128i _filter2, __m128i _filter3) { + __m128i _vec0, _vec1; + + _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); + _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); + _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); + _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); + return __lsx_vsadd_h(_vec0, _vec1); +} + +static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1, + __m128i _mask0, __m128i _mask1, + __m128i _mask2, __m128i _mask3, + __m128i _filt_h0, __m128i _filt_h1, + __m128i _filt_h2, __m128i _filt_h3) { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + __m128i _out; + + DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1, + _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3); + _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, + _filt_h2, _filt_h3); + _out = __lsx_vsrari_h(_out, FILTER_BITS); + return __lsx_vsat_h(_out, 7); +} + +static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask, + __m128i coeff) { + __m128i tmp0_m, tmp1_m; + + tmp0_m = __lsx_vshuf_b(in1, in0, mask); + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); + return __lsx_vsrari_h(tmp1_m, FILTER_BITS); +} #define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ - { \ + do { \ _src0 = __lsx_vld(_src, 0); \ _src += _stride; \ _src1 = __lsx_vld(_src, 0); \ @@ -23,43 +62,12 @@ _src2 = __lsx_vld(_src, 0); \ _src += _stride; \ _src3 = __lsx_vld(_src, 0); \ - } - -#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, _filter0, _filter1, \ - _filter2, _filter3) \ - ({ \ - __m128i _vec0, _vec1; \ - \ - _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); \ - _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); \ - _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); \ - _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); \ - _vec0 = __lsx_vsadd_h(_vec0, _vec1); \ - \ - _vec0; \ - }) - -#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3, \ - _filt_h0, _filt_h1, _filt_h2, _filt_h3) \ - ({ \ - __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ - __m128i _out; \ - \ - DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, \ - _src1, _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, \ - _tmp3); \ - _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, \ - _filt_h2, _filt_h3); \ - _out = __lsx_vsrari_h(_out, FILTER_BITS); \ - _out = __lsx_vsat_h(_out, 7); \ - \ - _out; \ - }) + } while (0) #define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \ _mask2, _mask3, _filter0, _filter1, \ _filter2, _filter3, _out0, _out1) \ - { \ + do { \ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ __m128i _reg0, _reg1, _reg2, _reg3; \ \ @@ -78,12 +86,12 @@ DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \ _filter3, _reg2, _reg3); \ DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \ - } + } while (0) #define HORIZ_8TAP_8WID_4VECS_FILT( \ _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \ _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \ - { \ + do { \ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \ \ @@ -111,22 +119,10 @@ _reg5, _reg6, _reg7); \ DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \ _reg7, _out0, _out1, _out2, _out3); \ - } - -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ - ({ \ - __m128i tmp0_m; \ - __m128i tmp1_m; \ - \ - tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ - tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ - tmp1_m = __lsx_vsrari_h(tmp1_m, shift); \ - \ - tmp1_m; \ - }) + } while (0) #define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \ - { \ + do { \ __m128i tmp0_m, tmp1_m; \ \ DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \ @@ -137,6 +133,6 @@ __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \ pdst += stride; \ __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \ - } + } while (0) #endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ |