diff options
Diffstat (limited to 'vp8/encoder/mips/msa/dct_msa.c')
-rw-r--r-- | vp8/encoder/mips/msa/dct_msa.c | 359 |
1 files changed, 178 insertions, 181 deletions
diff --git a/vp8/encoder/mips/msa/dct_msa.c b/vp8/encoder/mips/msa/dct_msa.c
index be61ffa0d..308466755 100644
--- a/vp8/encoder/mips/msa/dct_msa.c
+++ b/vp8/encoder/mips/msa/dct_msa.c
@@ -11,189 +11,186 @@
 #include "./vp8_rtcd.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"
 
-#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
-{ \
-    v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m; \
-    \
-    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
-    ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m); \
-    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
-    ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m); \
-    PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2); \
-    PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3); \
+#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+  { \
+    v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m; \
+    \
+    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+    ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m); \
+    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+    ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m); \
+    PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2); \
+    PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3); \
+  }
+
+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \
+  { \
+    v8i16 tmp0_m; \
+    \
+    SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2); \
+    ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2); \
+  }
+
+#define RET_1_IF_NZERO_H(in0) \
+  ({ \
+    v8i16 tmp0_m; \
+    v8i16 one_m = __msa_ldi_h(1); \
+    \
+    tmp0_m = __msa_ceqi_h(in0, 0); \
+    tmp0_m = tmp0_m ^ 255; \
+    tmp0_m = one_m & tmp0_m; \
+    \
+    tmp0_m; \
+  })
+
+#define RET_1_IF_NZERO_W(in0) \
+  ({ \
+    v4i32 tmp0_m; \
+    v4i32 one_m = __msa_ldi_w(1); \
+    \
+    tmp0_m = __msa_ceqi_w(in0, 0); \
+    tmp0_m = tmp0_m ^ 255; \
+    tmp0_m = one_m & tmp0_m; \
+    \
+    tmp0_m; \
+  })
+
+#define RET_1_IF_NEG_W(in0) \
+  ({ \
+    v4i32 tmp0_m; \
+    \
+    v4i32 one_m = __msa_ldi_w(1); \
+    tmp0_m = __msa_clti_s_w(in0, 0); \
+    tmp0_m = one_m & tmp0_m; \
+    \
+    tmp0_m; \
+  })
+
+void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
+  v8i16 in0, in1, in2, in3;
+  v8i16 temp0, temp1;
+  v8i16 const0, const1;
+  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+  v4i32 out0, out1, out2, out3;
+  v8i16 zero = { 0 };
+
+  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  SLLI_4V(temp0, temp1, in1, in3, 3);
+  in0 = temp0 + temp1;
+  in2 = temp0 - temp1;
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
+  temp0 = __msa_ilvr_h(in3, in1);
+  in1 = __msa_splati_h(coeff, 3);
+  out0 = (v4i32)__msa_ilvev_h(zero, in1);
+  coeff = __msa_ilvl_h(zero, coeff);
+  out1 = __msa_splati_w((v4i32)coeff, 0);
+  DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
+  out0 >>= 12;
+  out1 >>= 12;
+  PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  in0 = temp0 + temp1 + 7;
+  in2 = temp0 - temp1 + 7;
+  in0 >>= 4;
+  in2 >>= 4;
+  ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
+  temp1 = RET_1_IF_NZERO_H(in3);
+  ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
+  SPLATI_W2_SW(coeff, 2, out3, out1);
+  out3 += out1;
+  out1 = __msa_splati_w((v4i32)coeff, 1);
+  DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
+  out1 >>= 16;
+  out3 >>= 16;
+  out1 += (v4i32)temp1;
+  PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
+  ST_SH2(in0, in2, output, 8);
 }
 
-#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \
-{ \
-    v8i16 tmp0_m; \
-    \
-    SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2); \
-    ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2); \
+void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
+  v8i16 in0, in1, in2, in3;
+  v8i16 temp0, temp1, tmp0, tmp1;
+  v8i16 const0, const1, const2;
+  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+  v8i16 zero = { 0 };
+  v4i32 vec0_w, vec1_w, vec2_w, vec3_w;
+
+  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
+  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  SLLI_4V(temp0, temp1, in1, in3, 3);
+  in0 = temp0 + temp1;
+  in2 = temp0 - temp1;
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
+  temp0 = __msa_splati_h(coeff, 3);
+  vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
+  coeff = __msa_ilvl_h(zero, coeff);
+  vec3_w = __msa_splati_w((v4i32)coeff, 0);
+  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+  vec0_w = vec1_w;
+  vec2_w = vec3_w;
+  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
+               vec1_w, vec2_w, vec3_w);
+  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
+  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  in0 = temp0 + temp1 + 7;
+  in2 = temp0 - temp1 + 7;
+  in0 >>= 4;
+  in2 >>= 4;
+  SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
+  vec3_w += vec1_w;
+  vec1_w = __msa_splati_w((v4i32)coeff, 1);
+  const0 = RET_1_IF_NZERO_H(in3);
+  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+  vec0_w = vec1_w;
+  vec2_w = vec3_w;
+  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
+               vec1_w, vec2_w, vec3_w);
+  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
+  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+  in1 += const0;
+  PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
+  ST_SH2(temp0, temp1, output, 8);
+
+  PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
+  ST_SH2(in0, in2, output + 16, 8);
 }
 
-#define RET_1_IF_NZERO_H(in0) \
-({ \
-    v8i16 tmp0_m; \
-    v8i16 one_m = __msa_ldi_h(1); \
-    \
-    tmp0_m = __msa_ceqi_h(in0, 0); \
-    tmp0_m = tmp0_m ^ 255; \
-    tmp0_m = one_m & tmp0_m; \
-    \
-    tmp0_m; \
-})
-
-#define RET_1_IF_NZERO_W(in0) \
-({ \
-    v4i32 tmp0_m; \
-    v4i32 one_m = __msa_ldi_w(1); \
-    \
-    tmp0_m = __msa_ceqi_w(in0, 0); \
-    tmp0_m = tmp0_m ^ 255; \
-    tmp0_m = one_m & tmp0_m; \
-    \
-    tmp0_m; \
-})
-
-#define RET_1_IF_NEG_W(in0) \
-({ \
-    v4i32 tmp0_m; \
-    \
-    v4i32 one_m = __msa_ldi_w(1); \
-    tmp0_m = __msa_clti_s_w(in0, 0); \
-    tmp0_m = one_m & tmp0_m; \
-    \
-    tmp0_m; \
-})
-
-void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
-{
-    v8i16 in0, in1, in2, in3;
-    v8i16 temp0, temp1;
-    v8i16 const0, const1;
-    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
-    v4i32 out0, out1, out2, out3;
-    v8i16 zero = { 0 };
-
-    LD_SH4(input, pitch / 2, in0, in1, in2, in3);
-    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-
-    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
-    SLLI_4V(temp0, temp1, in1, in3, 3);
-    in0 = temp0 + temp1;
-    in2 = temp0 - temp1;
-    SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
-    temp0 = __msa_ilvr_h(in3, in1);
-    in1 = __msa_splati_h(coeff, 3);
-    out0 = (v4i32)__msa_ilvev_h(zero, in1);
-    coeff = __msa_ilvl_h(zero, coeff);
-    out1 = __msa_splati_w((v4i32)coeff, 0);
-    DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
-    out0 >>= 12;
-    out1 >>= 12;
-    PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
-    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-
-    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
-    in0 = temp0 + temp1 + 7;
-    in2 = temp0 - temp1 + 7;
-    in0 >>= 4;
-    in2 >>= 4;
-    ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
-    temp1 = RET_1_IF_NZERO_H(in3);
-    ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
-    SPLATI_W2_SW(coeff, 2, out3, out1);
-    out3 += out1;
-    out1 = __msa_splati_w((v4i32)coeff, 1);
-    DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
-    out1 >>= 16;
-    out3 >>= 16;
-    out1 += (v4i32)temp1;
-    PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
-    ST_SH2(in0, in2, output, 8);
-}
-
-void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch)
-{
-    v8i16 in0, in1, in2, in3;
-    v8i16 temp0, temp1, tmp0, tmp1;
-    v8i16 const0, const1, const2;
-    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
-    v8i16 zero = { 0 };
-    v4i32 vec0_w, vec1_w, vec2_w, vec3_w;
-
-    LD_SH4(input, pitch / 2, in0, in1, in2, in3);
-    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
-
-    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
-    SLLI_4V(temp0, temp1, in1, in3, 3);
-    in0 = temp0 + temp1;
-    in2 = temp0 - temp1;
-    SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
-    temp0 = __msa_splati_h(coeff, 3);
-    vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
-    coeff = __msa_ilvl_h(zero, coeff);
-    vec3_w = __msa_splati_w((v4i32)coeff, 0);
-    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
-    vec0_w = vec1_w;
-    vec2_w = vec3_w;
-    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
-                 vec0_w, vec1_w, vec2_w, vec3_w);
-    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
-    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
-    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
-
-    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
-    in0 = temp0 + temp1 + 7;
-    in2 = temp0 - temp1 + 7;
-    in0 >>= 4;
-    in2 >>= 4;
-    SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
-    vec3_w += vec1_w;
-    vec1_w = __msa_splati_w((v4i32)coeff, 1);
-    const0 = RET_1_IF_NZERO_H(in3);
-    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
-    vec0_w = vec1_w;
-    vec2_w = vec3_w;
-    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
-                 vec0_w, vec1_w, vec2_w, vec3_w);
-    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
-    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
-    in1 += const0;
-    PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
-    ST_SH2(temp0, temp1, output, 8);
-
-    PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
-    ST_SH2(in0, in2, output + 16, 8);
-}
-
-void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
-{
-    v8i16 in0_h, in1_h, in2_h, in3_h;
-    v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;
-
-    LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
-    TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);
-
-    UNPCK_R_SH_SW(in0_h, in0_w);
-    UNPCK_R_SH_SW(in1_h, in1_w);
-    UNPCK_R_SH_SW(in2_h, in2_w);
-    UNPCK_R_SH_SW(in3_h, in3_w);
-    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
-    SLLI_4V(temp0, temp1, temp2, temp3, 2);
-    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
-    temp0 = RET_1_IF_NZERO_W(temp0);
-    in0_w += temp0;
-    TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);
-
-    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
-    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
-    in0_w += RET_1_IF_NEG_W(in0_w);
-    in1_w += RET_1_IF_NEG_W(in1_w);
-    in2_w += RET_1_IF_NEG_W(in2_w);
-    in3_w += RET_1_IF_NEG_W(in3_w);
-    ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
-    SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
-    PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
-    ST_SH2(in0_h, in1_h, output, 8);
+void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
+  v8i16 in0_h, in1_h, in2_h, in3_h;
+  v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;
+
+  LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
+  TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);
+
+  UNPCK_R_SH_SW(in0_h, in0_w);
+  UNPCK_R_SH_SW(in1_h, in1_w);
+  UNPCK_R_SH_SW(in2_h, in2_w);
+  UNPCK_R_SH_SW(in3_h, in3_w);
+  BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+  SLLI_4V(temp0, temp1, temp2, temp3, 2);
+  BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+  temp0 = RET_1_IF_NZERO_W(temp0);
+  in0_w += temp0;
+  TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);
+
+  BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+  BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+  in0_w += RET_1_IF_NEG_W(in0_w);
+  in1_w += RET_1_IF_NEG_W(in1_w);
+  in2_w += RET_1_IF_NEG_W(in2_w);
+  in3_w += RET_1_IF_NEG_W(in3_w);
+  ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
+  SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
+  PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
+  ST_SH2(in0_h, in1_h, output, 8);
 }