author    Kaustubh Raste <kaustubh.raste@imgtec.com>    2017-02-16 12:12:24 +0530
committer Kaustubh Raste <kaustubh.raste@imgtec.com>    2017-02-16 13:17:00 +0530
commit    fddf66b741dc51787755c46d64877ea16e0cc3bd (patch)
tree      ebca04b0f9231608be1cc078e163b5b98e32c032
parent    b63e88e506a28b0e2814259c1af04ef319dac77b (diff)
Fix mips vpx_post_proc_down_and_across_mb_row_msa function
Added a fix to handle the non-multiple-of-16 cols case for size 16.

Change-Id: If3a6d772d112077c5e0a9be9e612e1148f04338c
-rw-r--r--    vpx_dsp/mips/deblock_msa.c | 63
1 file changed, 63 insertions(+), 0 deletions(-)
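In outline, the patch leaves the existing 16-column vector loop alone and adds a tail path for the leftover columns. Below is a minimal scalar sketch of that control flow, assuming (as the 8-byte stores in the hunk suggest) that widths here are multiples of 8, so the leftover is 8 columns. filter_block and post_proc_row are hypothetical names, not libvpx functions, and the plain 5-tap vertical average stands in for the conditional averaging that the MSA macro VPX_AVER_IF_RETAIN actually performs.

#include <stdint.h>

/* Hypothetical stand-in for one filtered block of `width` columns by `rows`
 * rows. The real kernel averages a pixel with its vertical neighbours only
 * when they stay within a per-column threshold; an unconditional 5-tap
 * average is used here just to keep the sketch short. Assumes two valid
 * rows above and below `src`, as the real filter does. */
void filter_block(const uint8_t *src, uint8_t *dst, int src_stride,
                  int dst_stride, int width, int rows) {
  int r, c, t;
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < width; ++c) {
      int sum = 0;
      for (t = -2; t <= 2; ++t) sum += src[(r + t) * src_stride + c];
      dst[r * dst_stride + c] = (uint8_t)(sum / 5);
    }
  }
}

/* Shape of the fixed function: full 16-column blocks, then the tail. */
void post_proc_row(const uint8_t *src, uint8_t *dst, int src_stride,
                   int dst_stride, int cols) {
  int col = 0;
  for (; col + 16 <= cols; col += 16) /* existing 16-wide vector loop */
    filter_block(src + col, dst + col, src_stride, dst_stride, 16, 16);
  if (cols % 16) /* the case this patch adds: the 8-column leftover */
    filter_block(src + col, dst + col, src_stride, dst_stride, cols % 16, 16);
}

The MSA code in the hunk below gets the same effect by filtering one more full 16-byte vector and storing only the low doubleword of each output row.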
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
index e33ea740a..aafa272fb 100644
--- a/vpx_dsp/mips/deblock_msa.c
+++ b/vpx_dsp/mips/deblock_msa.c
@@ -9,6 +9,7 @@
*/
#include <stdlib.h>
+
#include "./macros_msa.h"
extern const int16_t vpx_rv[];
@@ -295,6 +296,7 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
uint8_t *p_dst_st = dst_ptr;
uint8_t *f_orig = f;
uint16_t col;
+ uint64_t out0, out1, out2, out3;
v16u8 above2, above1, below2, below1;
v16u8 src, ref, ref_temp;
v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
@@ -346,6 +348,67 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
f += 16;
}
+ if (0 != (cols % 16)) {
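+ /* Tail path: filter one more 16-byte-wide column block, but store only
+ the low 8 bytes of each output row for the leftover columns. */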
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
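+ /* The five row registers (above2, above1, src, below1, below2) rotate
+ through the 5-tap window as it slides down one row per macro call. */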
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ src = LD_UB(p_src + 10 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+ below1 = LD_UB(p_src + 11 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+ below2 = LD_UB(p_src + 12 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+ above2 = LD_UB(p_src + 13 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+ above1 = LD_UB(p_src + 14 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+ src = LD_UB(p_src + 15 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+ below1 = LD_UB(p_src + 16 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+ below2 = LD_UB(p_src + 17 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
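+ /* Store the low doubleword (8 bytes) of each filtered row, four rows
+ per SD4 call. */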
+ out0 = __msa_copy_u_d((v2i64)inter0, 0);
+ out1 = __msa_copy_u_d((v2i64)inter1, 0);
+ out2 = __msa_copy_u_d((v2i64)inter2, 0);
+ out3 = __msa_copy_u_d((v2i64)inter3, 0);
+ SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter4, 0);
+ out1 = __msa_copy_u_d((v2i64)inter5, 0);
+ out2 = __msa_copy_u_d((v2i64)inter6, 0);
+ out3 = __msa_copy_u_d((v2i64)inter7, 0);
+ SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter8, 0);
+ out1 = __msa_copy_u_d((v2i64)inter9, 0);
+ out2 = __msa_copy_u_d((v2i64)inter10, 0);
+ out3 = __msa_copy_u_d((v2i64)inter11, 0);
+ SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter12, 0);
+ out1 = __msa_copy_u_d((v2i64)inter13, 0);
+ out2 = __msa_copy_u_d((v2i64)inter14, 0);
+ out3 = __msa_copy_u_d((v2i64)inter15, 0);
+ SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride);
+ }
+
f = f_orig;
p_dst = dst_ptr - 2;
LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,