diff options
author | Kaustubh Raste <kaustubh.raste@imgtec.com> | 2017-02-16 12:12:24 +0530 |
---|---|---|
committer | Kaustubh Raste <kaustubh.raste@imgtec.com> | 2017-02-16 13:17:00 +0530 |
commit | fddf66b741dc51787755c46d64877ea16e0cc3bd (patch) | |
tree | ebca04b0f9231608be1cc078e163b5b98e32c032 | |
parent | b63e88e506a28b0e2814259c1af04ef319dac77b (diff) | |
download | libvpx-fddf66b741dc51787755c46d64877ea16e0cc3bd.tar libvpx-fddf66b741dc51787755c46d64877ea16e0cc3bd.tar.gz libvpx-fddf66b741dc51787755c46d64877ea16e0cc3bd.tar.bz2 libvpx-fddf66b741dc51787755c46d64877ea16e0cc3bd.zip |
Fix mips vpx_post_proc_down_and_across_mb_row_msa function
Added a fix to handle the non-multiple-of-16 cols case for size 16
Change-Id: If3a6d772d112077c5e0a9be9e612e1148f04338c
-rw-r--r-- | vpx_dsp/mips/deblock_msa.c | 63 |
1 file changed, 63 insertions, 0 deletions
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c index e33ea740a..aafa272fb 100644 --- a/vpx_dsp/mips/deblock_msa.c +++ b/vpx_dsp/mips/deblock_msa.c @@ -9,6 +9,7 @@ */ #include <stdlib.h> + #include "./macros_msa.h" extern const int16_t vpx_rv[]; @@ -295,6 +296,7 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, uint8_t *p_dst_st = dst_ptr; uint8_t *f_orig = f; uint16_t col; + uint64_t out0, out1, out2, out3; v16u8 above2, above1, below2, below1; v16u8 src, ref, ref_temp; v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6; @@ -346,6 +348,67 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, f += 16; } + if (0 != (cols / 16)) { + ref = LD_UB(f); + LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); + src = LD_UB(p_src); + LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); + above2 = LD_UB(p_src + 3 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); + above1 = LD_UB(p_src + 4 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); + src = LD_UB(p_src + 5 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); + below1 = LD_UB(p_src + 6 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); + below2 = LD_UB(p_src + 7 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); + above2 = LD_UB(p_src + 8 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); + above1 = LD_UB(p_src + 9 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); + src = LD_UB(p_src + 10 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8); + below1 = LD_UB(p_src + 11 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9); + below2 = 
LD_UB(p_src + 12 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10); + above2 = LD_UB(p_src + 13 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11); + above1 = LD_UB(p_src + 14 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12); + src = LD_UB(p_src + 15 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13); + below1 = LD_UB(p_src + 16 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14); + below2 = LD_UB(p_src + 17 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15); + out0 = __msa_copy_u_d((v2i64)inter0, 0); + out1 = __msa_copy_u_d((v2i64)inter1, 0); + out2 = __msa_copy_u_d((v2i64)inter2, 0); + out3 = __msa_copy_u_d((v2i64)inter3, 0); + SD4(out0, out1, out2, out3, p_dst, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter4, 0); + out1 = __msa_copy_u_d((v2i64)inter5, 0); + out2 = __msa_copy_u_d((v2i64)inter6, 0); + out3 = __msa_copy_u_d((v2i64)inter7, 0); + SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter8, 0); + out1 = __msa_copy_u_d((v2i64)inter9, 0); + out2 = __msa_copy_u_d((v2i64)inter10, 0); + out3 = __msa_copy_u_d((v2i64)inter11, 0); + SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter12, 0); + out1 = __msa_copy_u_d((v2i64)inter13, 0); + out2 = __msa_copy_u_d((v2i64)inter14, 0); + out3 = __msa_copy_u_d((v2i64)inter15, 0); + SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride); + } + f = f_orig; p_dst = dst_ptr - 2; LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5, |