From af6733aec65f49c6dd5306d4f5bca60f7af4824b Mon Sep 17 00:00:00 2001
From: Parag Salasakar
Date: Sat, 25 Jul 2015 12:32:26 +0530
Subject: mips msa vp8 recon intra optimization

average improvement ~3x-5x

Change-Id: I73306863e9bf172d5adc06b8dd54e43985d1e063
---
 vp8/common/mips/msa/reconintra_msa.c | 342 +++++++++++++++++++++++++++++++++++
 vp8/common/rtcd_defs.pl              |   4 +-
 2 files changed, 344 insertions(+), 2 deletions(-)
 create mode 100644 vp8/common/mips/msa/reconintra_msa.c

diff --git a/vp8/common/mips/msa/reconintra_msa.c b/vp8/common/mips/msa/reconintra_msa.c
new file mode 100644
index 000000000..57f705d25
--- /dev/null
+++ b/vp8/common/mips/msa/reconintra_msa.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride)
+{
+    uint64_t out = LD(src);
+
+    SD4(out, out, out, out, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    v16u8 out = LD_UB(src);
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
+                                        uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+    out0 = src[0 * src_stride] * 0x0101010101010101ull;
+    out1 = src[1 * src_stride] * 0x0101010101010101ull;
+    out2 = src[2 * src_stride] * 0x0101010101010101ull;
+    out3 = src[3 * src_stride] * 0x0101010101010101ull;
+    out4 = src[4 * src_stride] * 0x0101010101010101ull;
+    out5 = src[5 * src_stride] * 0x0101010101010101ull;
+    out6 = src[6 * src_stride] * 0x0101010101010101ull;
+    out7 = src[7 * src_stride] * 0x0101010101010101ull;
+
+    SD4(out0, out1, out2, out3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    uint32_t row;
+    uint8_t inp0, inp1, inp2, inp3;
+    v16u8 src0, src1, src2, src3;
+
+    for (row = 4; row--;)
+    {
+        inp0 = src[0];
+        src += src_stride;
+        inp1 = src[0];
+        src += src_stride;
+        inp2 = src[0];
+        src += src_stride;
+        inp3 = src[0];
+        src += src_stride;
+
+        src0 = (v16u8)__msa_fill_b(inp0);
+        src1 = (v16u8)__msa_fill_b(inp1);
+        src2 = (v16u8)__msa_fill_b(inp2);
+        src3 = (v16u8)__msa_fill_b(inp3);
+
+        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
+                                     int32_t src_stride_left,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     uint8_t is_above, uint8_t is_left)
+{
+    uint32_t row, addition = 0;
+    uint64_t out;
+    v16u8 src_above, store;
+    v8u16 sum_above;
+    v4u32 sum_top;
+    v2u64 sum;
+
+    if (is_left && is_above)
+    {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        addition = __msa_copy_u_w((v4i32)sum, 0);
+
+        for (row = 0; row < 8; ++row)
+        {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 8) >> 4;
+        store = (v16u8)__msa_fill_b(addition);
+    }
+    else if (is_left)
+    {
+        for (row = 0; row < 8; ++row)
+        {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 4) >> 3;
+        store = (v16u8)__msa_fill_b(addition);
+    }
+    else if (is_above)
+    {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum = (v2u64)__msa_srari_d((v2i64)sum, 3);
+        store = (v16u8)__msa_splati_b((v16i8)sum, 0);
+    }
+    else
+    {
+        store = (v16u8)__msa_ldi_b(128);
+    }
+
+    out = __msa_copy_u_d((v2i64)store, 0);
+
+    SD4(out, out, out, out, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_16x16_msa(uint8_t *src_top, uint8_t *src_left,
+                                       int32_t src_stride_left,
+                                       uint8_t *dst, int32_t dst_stride,
+                                       uint8_t is_above, uint8_t is_left)
+{
+    uint32_t row;
+    uint32_t addition = 0;
+    v16u8 src_above, out;
+    v8u16 sum_above;
+    v4u32 sum_top;
+    v2u64 sum;
+
+    if (is_left && is_above)
+    {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum_top = (v4u32)__msa_pckev_w((v4i32)sum, (v4i32)sum);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        addition = __msa_copy_u_w((v4i32)sum, 0);
+
+        for (row = 0; row < 16; ++row)
+        {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 16) >> 5;
+        out = (v16u8)__msa_fill_b(addition);
+    }
+    else if (is_left)
+    {
+        for (row = 0; row < 16; ++row)
+        {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 8) >> 4;
+        out = (v16u8)__msa_fill_b(addition);
+    }
+    else if (is_above)
+    {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum_top = (v4u32)__msa_pckev_w((v4i32)sum, (v4i32)sum);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum = (v2u64)__msa_srari_d((v2i64)sum, 4);
+        out = (v16u8)__msa_splati_b((v16i8)sum, 0);
+    }
+    else
+    {
+        out = (v16u8)__msa_ldi_b(128);
+    }
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+void vp8_build_intra_predictors_mby_s_msa(struct macroblockd *x,
+                                          unsigned char *yabove_row,
+                                          unsigned char *yleft,
+                                          int left_stride,
+                                          unsigned char *ypred_ptr,
+                                          int y_stride)
+{
+    uint32_t row, col;
+    uint8_t ytop_left = yabove_row[-1];
+
+    switch (x->mode_info_context->mbmi.mode)
+    {
+        case DC_PRED:
+            intra_predict_dc_16x16_msa(yabove_row, yleft, left_stride,
+                                       ypred_ptr, y_stride,
+                                       x->up_available, x->left_available);
+            break;
+
+        case V_PRED:
+            intra_predict_vert_16x16_msa(yabove_row, ypred_ptr, y_stride);
+            break;
+
+        case H_PRED:
+            intra_predict_horiz_16x16_msa(yleft, left_stride, ypred_ptr,
+                                          y_stride);
+            break;
+
+        case TM_PRED:
+            for (row = 0; row < 16; ++row)
+            {
+                for (col = 0; col < 16; ++col)
+                {
+                    int pred = yleft[row * left_stride] + yabove_row[col] -
+                               ytop_left;
+
+                    if (pred < 0)
+                        pred = 0;
+
+                    if (pred > 255)
+                        pred = 255;
+
+                    ypred_ptr[col] = pred;
+                }
+
+                ypred_ptr += y_stride;
+            }
+            break;
+
+        case B_PRED:
+        case NEARESTMV:
+        case NEARMV:
+        case ZEROMV:
+        case NEWMV:
+        case SPLITMV:
+        case MB_MODE_COUNT:
+            break;
+    }
+}
+
+void vp8_build_intra_predictors_mbuv_s_msa(struct macroblockd *x,
+                                           unsigned char *uabove_row,
+                                           unsigned char *vabove_row,
+                                           unsigned char *uleft,
+                                           unsigned char *vleft,
+                                           int left_stride,
+                                           unsigned char *upred_ptr,
+                                           unsigned char *vpred_ptr,
+                                           int pred_stride)
+{
+    uint32_t row, col;
+    uint8_t utop_left = uabove_row[-1];
+    uint8_t vtop_left = vabove_row[-1];
+
+    switch (x->mode_info_context->mbmi.uv_mode)
+    {
+        case DC_PRED:
+            intra_predict_dc_8x8_msa(uabove_row, uleft, left_stride,
+                                     upred_ptr, pred_stride,
+                                     x->up_available, x->left_available);
+            intra_predict_dc_8x8_msa(vabove_row, vleft, left_stride,
+                                     vpred_ptr, pred_stride,
+                                     x->up_available, x->left_available);
+            break;
+
+        case V_PRED:
+            intra_predict_vert_8x8_msa(uabove_row, upred_ptr, pred_stride);
+            intra_predict_vert_8x8_msa(vabove_row, vpred_ptr, pred_stride);
+            break;
+
+        case H_PRED:
+            intra_predict_horiz_8x8_msa(uleft, left_stride, upred_ptr,
+                                        pred_stride);
+            intra_predict_horiz_8x8_msa(vleft, left_stride, vpred_ptr,
+                                        pred_stride);
+            break;
+
+        case TM_PRED:
+            for (row = 0; row < 8; ++row)
+            {
+                for (col = 0; col < 8; ++col)
+                {
+                    int predu = uleft[row * left_stride] + uabove_row[col] -
+                                utop_left;
+                    int predv = vleft[row * left_stride] + vabove_row[col] -
+                                vtop_left;
+
+                    if (predu < 0)
+                        predu = 0;
+
+                    if (predu > 255)
+                        predu = 255;
+
+                    if (predv < 0)
+                        predv = 0;
+
+                    if (predv > 255)
+                        predv = 255;
+
+                    upred_ptr[col] = predu;
+                    vpred_ptr[col] = predv;
+                }
+
+                upred_ptr += pred_stride;
+                vpred_ptr += pred_stride;
+            }
+            break;
+
+        case B_PRED:
+        case NEARESTMV:
+        case NEARMV:
+        case ZEROMV:
+        case NEWMV:
+        case SPLITMV:
+        case MB_MODE_COUNT:
+            break;
+    }
+}
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 1ec93e301..f7f137915 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -153,10 +153,10 @@ $vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
 $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
 
 add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
-specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon/;
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon msa/;
 
 add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
-specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon/;
+specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon msa/;
 
 add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
 specialize qw/vp8_intra4x4_predict media/;
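
Note (illustration, not part of the patch): for readers unfamiliar with the MSA intrinsics, the plain-C sketch below shows the DC averaging rule that intra_predict_dc_16x16_msa() vectorizes; the 8x8 chroma path applies the same rule to 8 samples per edge with correspondingly smaller rounding terms. The helper name dc_predict_16x16_ref() and its signature are hypothetical and exist only for this explanation.

#include <stdint.h>
#include <string.h>

/* Hypothetical scalar reference for the 16x16 DC predictor. */
static void dc_predict_16x16_ref(const uint8_t *top, const uint8_t *left,
                                 int left_stride, uint8_t *dst,
                                 int dst_stride, int has_top, int has_left)
{
    int i, sum = 0;
    uint8_t dc;

    if (has_top && has_left)
    {
        for (i = 0; i < 16; ++i) sum += top[i];                /* 16 above  */
        for (i = 0; i < 16; ++i) sum += left[i * left_stride]; /* 16 left   */
        dc = (uint8_t)((sum + 16) >> 5);   /* rounded average of 32 pixels  */
    }
    else if (has_left)
    {
        for (i = 0; i < 16; ++i) sum += left[i * left_stride];
        dc = (uint8_t)((sum + 8) >> 4);    /* rounded average of 16 pixels  */
    }
    else if (has_top)
    {
        for (i = 0; i < 16; ++i) sum += top[i];
        dc = (uint8_t)((sum + 8) >> 4);
    }
    else
    {
        dc = 128;                          /* no neighbours: mid-grey       */
    }

    for (i = 0; i < 16; ++i)
    {
        memset(dst, dc, 16);               /* fill one 16-pixel row         */
        dst += dst_stride;
    }
}

The MSA path computes the same sums with __msa_hadd_u_* horizontal-add reductions over the top row and broadcasts the result with __msa_fill_b / __msa_splati_b before storing whole rows at once, which is what replaces the scalar per-pixel loops above.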