diff options
author | Jingning Han <jingning@google.com> | 2015-08-05 19:00:31 -0700 |
---|---|---|
committer | Jingning Han <jingning@google.com> | 2015-08-11 17:05:28 -0700 |
commit | 3ee6db6c8110680c051fe7a4dca97bb27474ca00 (patch) | |
tree | e05e37dfc10ed26a4fdaee5551b13e2d85a7a1b2 /vp10/encoder/mips/msa | |
parent | b04dad328c33874ae1eda72c73079519935a3feb (diff) | |
download | libvpx-3ee6db6c8110680c051fe7a4dca97bb27474ca00.tar libvpx-3ee6db6c8110680c051fe7a4dca97bb27474ca00.tar.gz libvpx-3ee6db6c8110680c051fe7a4dca97bb27474ca00.tar.bz2 libvpx-3ee6db6c8110680c051fe7a4dca97bb27474ca00.zip |
Fork VP9 and VP10 codebase
This commit folks the VP9 and VP10 codebase and makes libvpx
support VP8, VP9, and VP10.
Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356
Diffstat (limited to 'vp10/encoder/mips/msa')
-rw-r--r-- | vp10/encoder/mips/msa/vp9_avg_msa.c | 56 | ||||
-rw-r--r-- | vp10/encoder/mips/msa/vp9_error_msa.c | 114 | ||||
-rw-r--r-- | vp10/encoder/mips/msa/vp9_fdct16x16_msa.c | 507 | ||||
-rw-r--r-- | vp10/encoder/mips/msa/vp9_fdct4x4_msa.c | 99 | ||||
-rw-r--r-- | vp10/encoder/mips/msa/vp9_fdct8x8_msa.c | 66 | ||||
-rw-r--r-- | vp10/encoder/mips/msa/vp9_fdct_msa.h | 117 | ||||
-rw-r--r-- | vp10/encoder/mips/msa/vp9_temporal_filter_msa.c | 289 |
7 files changed, 1248 insertions, 0 deletions
diff --git a/vp10/encoder/mips/msa/vp9_avg_msa.c b/vp10/encoder/mips/msa/vp9_avg_msa.c new file mode 100644 index 000000000..e8cfd53db --- /dev/null +++ b/vp10/encoder/mips/msa/vp9_avg_msa.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp10_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +uint32_t vp10_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; + v4u32 sum = { 0 }; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); + HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); + ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); + ADD2(sum0, sum2, sum4, sum6, sum0, sum4); + sum0 += sum4; + + sum = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); + sum = __msa_hadd_u_w(sum0, sum0); + sum = (v4u32)__msa_srari_w((v4i32)sum, 6); + sum_out = __msa_copy_u_w((v4i32)sum, 0); + + return sum_out; +} + +uint32_t vp10_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + uint32_t src0, src1, src2, src3; + v16u8 vec = { 0 }; + v8u16 sum0; + v4u32 sum1; + v2u64 sum2; + + LW4(src, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, vec); + + sum0 = __msa_hadd_u_h(vec, vec); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum2 = __msa_hadd_u_d(sum1, sum1); + sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); + sum_out = __msa_copy_u_w((v4i32)sum1, 0); + + return sum_out; +} diff --git a/vp10/encoder/mips/msa/vp9_error_msa.c b/vp10/encoder/mips/msa/vp9_error_msa.c new file mode 100644 index 000000000..dacca32c0 --- /dev/null +++ b/vp10/encoder/mips/msa/vp9_error_msa.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp10_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ +static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \ + const int16_t *dq_coeff_ptr, \ + int64_t *ssz) { \ + int64_t err = 0; \ + uint32_t loop_cnt; \ + v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \ + v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \ + v2i64 sq_coeff_r, sq_coeff_l; \ + v2i64 err0, err_dup0, err1, err_dup1; \ + \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, \ + sq_coeff_r, sq_coeff_l); \ + DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + \ + for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + } \ + \ + err_dup0 = __msa_splati_d(sq_coeff_r, 1); \ + err_dup1 = __msa_splati_d(sq_coeff_l, 1); \ + sq_coeff_r += err_dup0; \ + sq_coeff_l += err_dup1; \ + *ssz = __msa_copy_s_d(sq_coeff_r, 0); \ + *ssz += __msa_copy_s_d(sq_coeff_l, 0); \ + \ + err_dup0 = __msa_splati_d(err0, 1); \ + err_dup1 = __msa_splati_d(err1, 1); \ + err0 += err_dup0; \ + err1 += err_dup1; \ + err = __msa_copy_s_d(err0, 0); \ + err += __msa_copy_s_d(err1, 0); \ + \ + return err; \ +} + +BLOCK_ERROR_BLOCKSIZE_MSA(16); +BLOCK_ERROR_BLOCKSIZE_MSA(64); +BLOCK_ERROR_BLOCKSIZE_MSA(256); +BLOCK_ERROR_BLOCKSIZE_MSA(1024); + +int64_t vp10_block_error_msa(const tran_low_t *coeff_ptr, + const tran_low_t *dq_coeff_ptr, + intptr_t blk_size, int64_t *ssz) { + int64_t err; + const int16_t *coeff = (const int16_t *)coeff_ptr; + const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr; + + switch (blk_size) { + case 16: + err = block_error_16size_msa(coeff, dq_coeff, ssz); + break; + case 64: + err = block_error_64size_msa(coeff, dq_coeff, ssz); + break; + case 256: + err = block_error_256size_msa(coeff, dq_coeff, ssz); + break; + case 1024: + err = block_error_1024size_msa(coeff, dq_coeff, ssz); + break; + default: + err = vp10_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz); + break; + } + + return err; +} diff --git a/vp10/encoder/mips/msa/vp9_fdct16x16_msa.c b/vp10/encoder/mips/msa/vp9_fdct16x16_msa.c new file mode 100644 index 000000000..c031ab689 --- /dev/null +++ b/vp10/encoder/mips/msa/vp9_fdct16x16_msa.c @@ -0,0 +1,507 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "vp10/common/vp9_enums.h" +#include "vp10/encoder/mips/msa/vp9_fdct_msa.h" +#include "vpx_dsp/mips/fwd_txfm_msa.h" + +static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride, + const int32_t *const0, int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r15 = LD_SH(input + 15 * stride); + r7 = LD_SH(input + 7 * stride); + r8 = LD_SH(input + 8 * stride); + SLLI_4V(r0, r15, r7, r8, 2); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 8, 4, k2, k3); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * stride); + r4 = LD_SH(input + 4 * stride); + r11 = LD_SH(input + 11 * stride); + r12 = LD_SH(input + 12 * stride); + SLLI_4V(r3, r4, r11, r12, 2); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp2, int_buf, 8); + ST_SH2(tp1, tp3, int_buf + 4 * 8, 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + ST_SH2(h0, h1, int_buf + 8 * 8, 8); + ST_SH2(h3, h2, int_buf + 12 * 8, 8); + + r9 = LD_SH(input + 9 * stride); + r6 = LD_SH(input + 6 * stride); + r1 = LD_SH(input + stride); + r14 = LD_SH(input + 14 * stride); + SLLI_4V(r9, r6, r1, r14, 2); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r13 = LD_SH(input + 13 * stride); + r2 = LD_SH(input + 2 * stride); + r5 = LD_SH(input + 5 * stride); + r10 = LD_SH(input + 10 * stride); + SLLI_4V(r13, r2, r5, r10, 2); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + int16_t *out_ptr = out + 128; + v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v4i32 k0, k1, k2, k3; + + LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15); + LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7); + LD_SW2(const0 + 4 * 19, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 21); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + + tp0 = LD_SH(int_buf + 4 * 8); + tp1 = LD_SH(int_buf + 5 * 8); + tp3 = LD_SH(int_buf + 10 * 8); + tp2 = LD_SH(int_buf + 14 * 8); + LD_SW2(const0 + 4 * 22, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 24); + MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + ST_SH(out4, (out + 3 * 16)); + ST_SH(out5, (out_ptr + 4 * 16)); + + h1 = LD_SH(int_buf + 9 * 8); + h3 = LD_SH(int_buf + 12 * 8); + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + ST_SH(out12, (out + 2 * 16)); + ST_SH(out13, (out_ptr + 5 * 16)); + + tp0 = LD_SH(int_buf); + tp1 = LD_SH(int_buf + 8); + tp2 = LD_SH(int_buf + 2 * 8); + tp3 = LD_SH(int_buf + 6 * 8); + + BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); + out1 = -out1; + ST_SH(out0, (out)); + ST_SH(out1, (out_ptr + 7 * 16)); + + h0 = LD_SH(int_buf + 8 * 8); + h2 = LD_SH(int_buf + 13 * 8); + + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + ST_SH(out8, (out + 16)); + ST_SH(out9, (out_ptr + 6 * 16)); + + /* stage 4 */ + LD_SW2(const0 + 4 * 25, 4, k0, k1); + LD_SW2(const0 + 4 * 27, 4, k2, k3); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + ST_SH(out2, (out + 7 * 16)); + ST_SH(out3, (out_ptr)); + + MADD_SHORT(out6, out7, k0, k3, out6, out7); + ST_SH(out6, (out + 4 * 16)); + ST_SH(out7, (out_ptr + 3 * 16)); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + ST_SH(out10, (out + 6 * 16)); + ST_SH(out11, (out_ptr + 16)); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + ST_SH(out14, (out + 5 * 16)); + ST_SH(out15, (out_ptr + 2 * 16)); +} + +static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, + r0, r1, r2, r3, r4, r5, r6, r7); + FDCT_POSTPROC_2V_NEG_H(r0, r1); + FDCT_POSTPROC_2V_NEG_H(r2, r3); + FDCT_POSTPROC_2V_NEG_H(r4, r5); + FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, + r8, r9, r10, r11, r12, r13, r14, r15); + FDCT_POSTPROC_2V_NEG_H(r8, r9); + FDCT_POSTPROC_2V_NEG_H(r10, r11); + FDCT_POSTPROC_2V_NEG_H(r12, r13); + FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); + out += 64; + + /* load input data */ + input += 128; + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, + r0, r1, r2, r3, r4, r5, r6, r7); + FDCT_POSTPROC_2V_NEG_H(r0, r1); + FDCT_POSTPROC_2V_NEG_H(r2, r3); + FDCT_POSTPROC_2V_NEG_H(r4, r5); + FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, + r8, r9, r10, r11, r12, r13, r14, r15); + FDCT_POSTPROC_2V_NEG_H(r8, r9); + FDCT_POSTPROC_2V_NEG_H(r10, r11); + FDCT_POSTPROC_2V_NEG_H(r12, r13); + FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); +} + +static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0, + int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r7 = LD_SH(input + 7 * 8); + r8 = LD_SH(input + 8 * 8); + r15 = LD_SH(input + 15 * 8); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 4 * 2, 4, k2, k3); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * 8); + r4 = LD_SH(input + 4 * 8); + r11 = LD_SH(input + 11 * 8); + r12 = LD_SH(input + 12 * 8); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp1, int_buf, 4 * 8); + ST_SH2(tp2, tp3, int_buf + 8, 4 * 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8); + ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8); + + r1 = LD_SH(input + 8); + r6 = LD_SH(input + 6 * 8); + r9 = LD_SH(input + 9 * 8); + r14 = LD_SH(input + 14 * 8); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r2 = LD_SH(input + 2 * 8); + r5 = LD_SH(input + 5 * 8); + r10 = LD_SH(input + 10 * 8); + r13 = LD_SH(input + 13 * 8); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + int16_t *out_ptr = out + 8; + v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v4i32 k0, k1, k2, k3; + + g13 = LD_SH(int_buf + 3 * 8); + g15 = LD_SH(int_buf + 7 * 8); + g5 = LD_SH(int_buf + 11 * 8); + g7 = LD_SH(int_buf + 15 * 8); + + LD_SW2(const0 + 4 * 19, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 21); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + + tp0 = LD_SH(int_buf + 4 * 8); + tp1 = LD_SH(int_buf + 5 * 8); + tp3 = LD_SH(int_buf + 10 * 8); + tp2 = LD_SH(int_buf + 14 * 8); + + LD_SW2(const0 + 4 * 22, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 24); + MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + ST_SH(out4, (out + 3 * 16)); + ST_SH(out5, (out_ptr + 4 * 16)); + + h1 = LD_SH(int_buf + 9 * 8); + h3 = LD_SH(int_buf + 12 * 8); + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + ST_SH(out12, (out + 2 * 16)); + ST_SH(out13, (out_ptr + 5 * 16)); + + tp0 = LD_SH(int_buf); + tp1 = LD_SH(int_buf + 8); + tp2 = LD_SH(int_buf + 2 * 8); + tp3 = LD_SH(int_buf + 6 * 8); + + BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); + out1 = -out1; + ST_SH(out0, (out)); + ST_SH(out1, (out_ptr + 7 * 16)); + + h0 = LD_SH(int_buf + 8 * 8); + h2 = LD_SH(int_buf + 13 * 8); + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + ST_SH(out8, (out + 16)); + ST_SH(out9, (out_ptr + 6 * 16)); + + /* stage 4 */ + LD_SW2(const0 + 4 * 25, 4, k0, k1); + LD_SW2(const0 + 4 * 27, 4, k2, k3); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + ST_SH(out2, (out + 7 * 16)); + ST_SH(out3, (out_ptr)); + + MADD_SHORT(out6, out7, k0, k3, out6, out7); + ST_SH(out6, (out + 4 * 16)); + ST_SH(out7, (out_ptr + 3 * 16)); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + ST_SH(out10, (out + 6 * 16)); + ST_SH(out11, (out_ptr + 16)); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + ST_SH(out14, (out + 5 * 16)); + ST_SH(out15, (out_ptr + 2 * 16)); +} + +static void fadst16_transpose_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, + l4, l12, l5, l13, l6, l14, l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, + r0, r1, r2, r3, r4, r5, r6, r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, + r8, r9, r10, r11, r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); + out += 16 * 8; + + /* load input data */ + input += 128; + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, + l4, l12, l5, l13, l6, l14, l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, + r0, r1, r2, r3, r4, r5, r6, r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, + r8, r9, r10, r11, r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); +} + +static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) { + int16_t *temp = intermediate; + int16_t *out = output; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11; + v8i16 in12, in13, in14, in15; + + LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7); + temp = intermediate + 8; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, + in8, in9, in10, in11, in12, in13, in14, in15); + FDCT_POSTPROC_2V_NEG_H(in0, in1); + FDCT_POSTPROC_2V_NEG_H(in2, in3); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + FDCT_POSTPROC_2V_NEG_H(in6, in7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + in8, in9, in10, in11, in12, in13, in14, in15); + temp = intermediate; + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + temp = intermediate; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, + tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, + tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); + out = output + 8; + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16); +} + +void vp10_fht16x16_msa(const int16_t *input, int16_t *output, + int32_t stride, int32_t tx_type) { + DECLARE_ALIGNED(32, int16_t, tmp[256]); + DECLARE_ALIGNED(32, int16_t, trans_buf[256]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[128]); + int32_t i; + int16_t *ptmpbuf = &tmp_buf[0]; + int16_t *trans = &trans_buf[0]; + const int32_t const_arr[29 * 4] = { + 52707308, 52707308, 52707308, 52707308, + -1072430300, -1072430300, -1072430300, -1072430300, + 795618043, 795618043, 795618043, 795618043, + -721080468, -721080468, -721080468, -721080468, + 459094491, 459094491, 459094491, 459094491, + -970646691, -970646691, -970646691, -970646691, + 1010963856, 1010963856, 1010963856, 1010963856, + -361743294, -361743294, -361743294, -361743294, + 209469125, 209469125, 209469125, 209469125, + -1053094788, -1053094788, -1053094788, -1053094788, + 1053160324, 1053160324, 1053160324, 1053160324, + 639644520, 639644520, 639644520, 639644520, + -862444000, -862444000, -862444000, -862444000, + 1062144356, 1062144356, 1062144356, 1062144356, + -157532337, -157532337, -157532337, -157532337, + 260914709, 260914709, 260914709, 260914709, + -1041559667, -1041559667, -1041559667, -1041559667, + 920985831, 920985831, 920985831, 920985831, + -551995675, -551995675, -551995675, -551995675, + 596522295, 596522295, 596522295, 596522295, + 892853362, 892853362, 892853362, 892853362, + -892787826, -892787826, -892787826, -892787826, + 410925857, 410925857, 410925857, 410925857, + -992012162, -992012162, -992012162, -992012162, + 992077698, 992077698, 992077698, 992077698, + 759246145, 759246145, 759246145, 759246145, + -759180609, -759180609, -759180609, -759180609, + -759222975, -759222975, -759222975, -759222975, + 759288511, 759288511, 759288511, 759288511 }; + + switch (tx_type) { + case DCT_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case ADST_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case DCT_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + case ADST_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + default: + assert(0); + break; + } +} diff --git a/vp10/encoder/mips/msa/vp9_fdct4x4_msa.c b/vp10/encoder/mips/msa/vp9_fdct4x4_msa.c new file mode 100644 index 000000000..e06bf724e --- /dev/null +++ b/vp10/encoder/mips/msa/vp9_fdct4x4_msa.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "vp10/common/vp9_enums.h" +#include "vp10/encoder/mips/msa/vp9_fdct_msa.h" + +void vp10_fwht4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + in0 += in1; + in3 -= in2; + in4 = (in0 - in3) >> 1; + SUB2(in4, in1, in4, in2, in1, in2); + in0 -= in2; + in3 += in1; + + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + + in0 += in2; + in1 -= in3; + in4 = (in0 - in1) >> 1; + SUB2(in4, in2, in4, in3, in2, in3); + in0 -= in3; + in1 += in2; + + SLLI_4V(in0, in1, in2, in3, 2); + + TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2); + + ST4x2_UB(in0, output, 4); + ST4x2_UB(in3, output + 4, 4); + ST4x2_UB(in1, output + 8, 4); + ST4x2_UB(in2, output + 12, 4); +} + +void vp10_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 temp, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + temp = __msa_ceqi_h(in0, 0); + temp = (v8i16)__msa_xori_b((v16u8)temp, 255); + temp = mask & temp; + in0 += temp; + } + + switch (tx_type) { + case DCT_DCT: + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_DCT: + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case DCT_ADST: + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_ADST: + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + default: + assert(0); + break; + } + + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} diff --git a/vp10/encoder/mips/msa/vp9_fdct8x8_msa.c b/vp10/encoder/mips/msa/vp9_fdct8x8_msa.c new file mode 100644 index 000000000..cf89daabb --- /dev/null +++ b/vp10/encoder/mips/msa/vp9_fdct8x8_msa.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "vp10/common/vp9_enums.h" +#include "vp10/encoder/mips/msa/vp9_fdct_msa.h" + +void vp10_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + + switch (tx_type) { + case DCT_DCT: + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + case ADST_DCT: + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + case DCT_ADST: + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + case ADST_ADST: + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + default: + assert(0); + break; + } + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} diff --git a/vp10/encoder/mips/msa/vp9_fdct_msa.h b/vp10/encoder/mips/msa/vp9_fdct_msa.h new file mode 100644 index 000000000..d7d40cb72 --- /dev/null +++ b/vp10/encoder/mips/msa/vp9_fdct_msa.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ +#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ + +#include "vpx_dsp/mips/fwd_txfm_msa.h" +#include "vpx_dsp/mips/txfm_macros_msa.h" +#include "vpx_ports/mem.h" + +#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst1_m, cnst2_m, cnst3_m, in7, in0, \ + in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst1_m, cnst2_m, cnst3_m, in5, in2, \ + in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst2_m, cnst3_m, cnst1_m, out1, out6, \ + s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ +} + +#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \ + v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \ + \ + UNPCK_R_SH_SW(in0, in0_r_m); \ + UNPCK_R_SH_SW(in1, in1_r_m); \ + UNPCK_R_SH_SW(in2, in2_r_m); \ + UNPCK_R_SH_SW(in3, in3_r_m); \ + \ + constant_m = __msa_fill_w(sinpi_4_9); \ + MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \ + \ + constant_m = __msa_fill_w(sinpi_1_9); \ + s0_m += in0_r_m * constant_m; \ + s1_m -= in1_r_m * constant_m; \ + \ + constant_m = __msa_fill_w(sinpi_2_9); \ + s0_m += in1_r_m * constant_m; \ + s1_m += in3_r_m * constant_m; \ + \ + s2_m = in0_r_m + in1_r_m - in3_r_m; \ + \ + constant_m = __msa_fill_w(sinpi_3_9); \ + MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \ + \ + in0_r_m = s0_m + s3_m; \ + s2_m = s1_m - s3_m; \ + s3_m = s1_m - s0_m + s3_m; \ + \ + SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, \ + s3_m, s3_m, out0, out1, out2, out3); \ +} +#endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */ diff --git a/vp10/encoder/mips/msa/vp9_temporal_filter_msa.c b/vp10/encoder/mips/msa/vp9_temporal_filter_msa.c new file mode 100644 index 000000000..5d4558b94 --- /dev/null +++ b/vp10/encoder/mips/msa/vp9_temporal_filter_msa.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp10_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, + uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, + int32_t filt_wgt, + uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + uint64_t f0, f1, f2, f3; + v16i8 frm2, frm1 = { 0 }; + v16i8 frm4, frm3 = { 0 }; + v16u8 frm_r, frm_l; + v8i16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 2; row--;) { + LD4(frm1_ptr, stride, f0, f1, f2, f3); + frm1_ptr += (4 * stride); + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 32; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + INSERT_D2_SB(f0, f1, frm1); + INSERT_D2_SB(f2, f3, frm3); + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + UNPCK_UB_SH(frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + } +} + +static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, + uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, + int32_t filt_wgt, + uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + v16i8 frm1, frm2, frm3, frm4; + v16u8 frm_r, frm_l; + v16i8 zero = { 0 }; + v8u16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 8; row--;) { + LD_SB2(frm1_ptr, stride, frm1, frm3); + frm1_ptr += stride; + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + frm1_ptr += stride; + frm2_ptr += 16; + } +} + +void vp10_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, + uint8_t *frame2_ptr, uint32_t blk_w, + uint32_t blk_h, int32_t strength, + int32_t filt_wgt, uint32_t *accu, + uint16_t *cnt) { + if (8 == (blk_w * blk_h)) { + temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, + strength, filt_wgt, accu, cnt); + } else if (16 == (blk_w * blk_h)) { + temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, + strength, filt_wgt, accu, cnt); + } else { + vp10_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, + strength, filt_wgt, accu, cnt); + } +} |