diff options
-rw-r--r-- | test/pp_filter_test.cc | 24 | ||||
-rw-r--r-- | vp8/common/mips/msa/postproc_msa.c | 801 | ||||
-rw-r--r-- | vp8/common/postproc.c | 242 | ||||
-rw-r--r-- | vp8/common/rtcd_defs.pl | 10 | ||||
-rw-r--r-- | vp8/common/x86/postproc_mmx.asm | 253 | ||||
-rw-r--r-- | vp8/vp8_common.mk | 3 | ||||
-rw-r--r-- | vp9/common/vp9_alloccommon.c | 2 | ||||
-rw-r--r-- | vp9/common/vp9_postproc.c | 354 | ||||
-rw-r--r-- | vp9/common/vp9_postproc.h | 7 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 12 | ||||
-rw-r--r-- | vp9/common/x86/vp9_postproc_sse2.asm | 632 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 8 | ||||
-rw-r--r-- | vp9/vp9_common.mk | 1 | ||||
-rw-r--r-- | vpx_dsp/deblock.c | 204 | ||||
-rw-r--r-- | vpx_dsp/mips/deblock_msa.c | 683 | ||||
-rw-r--r-- | vpx_dsp/mips/macros_msa.h | 4 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 3 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 12 | ||||
-rw-r--r-- | vpx_dsp/x86/deblock_sse2.asm (renamed from vp8/common/x86/postproc_sse2.asm) | 30 |
19 files changed, 1015 insertions, 2270 deletions
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index e4688dd8c..89349e48b 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -11,7 +11,7 @@ #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" -#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" @@ -25,7 +25,7 @@ typedef void (*PostProcFunc)(unsigned char *src_ptr, namespace { -class VP8PostProcessingFilterTest +class VPxPostProcessingFilterTest : public ::testing::TestWithParam<PostProcFunc> { public: virtual void TearDown() { @@ -33,10 +33,10 @@ class VP8PostProcessingFilterTest } }; -// Test routine for the VP8 post-processing function -// vp8_post_proc_down_and_across_mb_row_c. +// Test routine for the VPx post-processing function +// vpx_post_proc_down_and_across_mb_row_c. -TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) { +TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) { // Size of the underlying data block that will be filtered. 
const int block_width = 16; const int block_height = 16; @@ -92,7 +92,7 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) { for (int i = 0; i < block_height; ++i) { for (int j = 0; j < block_width; ++j) { EXPECT_EQ(expected_data[i], pixel_ptr[j]) - << "VP8PostProcessingFilterTest failed with invalid filter output"; + << "VPxPostProcessingFilterTest failed with invalid filter output"; } pixel_ptr += output_stride; } @@ -102,17 +102,17 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) { vpx_free(flimits); }; -INSTANTIATE_TEST_CASE_P(C, VP8PostProcessingFilterTest, - ::testing::Values(vp8_post_proc_down_and_across_mb_row_c)); +INSTANTIATE_TEST_CASE_P(C, VPxPostProcessingFilterTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, VP8PostProcessingFilterTest, - ::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2)); +INSTANTIATE_TEST_CASE_P(SSE2, VPxPostProcessingFilterTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2)); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, VP8PostProcessingFilterTest, - ::testing::Values(vp8_post_proc_down_and_across_mb_row_msa)); +INSTANTIATE_TEST_CASE_P(MSA, VPxPostProcessingFilterTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa)); #endif } // namespace diff --git a/vp8/common/mips/msa/postproc_msa.c b/vp8/common/mips/msa/postproc_msa.c deleted file mode 100644 index 23dcde2eb..000000000 --- a/vp8/common/mips/msa/postproc_msa.c +++ /dev/null @@ -1,801 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <stdlib.h> -#include "./vp8_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "vp8/common/mips/msa/vp8_macros_msa.h" - -static const int16_t vp8_rv_msa[] = -{ - 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, - 0, 3, 9, 0, 0, 0, 8, 3, 14, 4, - 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, - 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, - 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, - 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, - 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, - 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, - 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, - 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, - 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, - 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, - 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, - 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, - 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, - 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, - 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, - 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, - 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, - 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, - 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, - 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, - 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, - 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, - 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, - 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, - 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, - 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, - 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, - 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, - 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, - 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, - 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, - 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, - 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, - 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, - 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, - 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, - 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, - 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, -}; - -#define VP8_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7, \ - out8, out9, out10, out11, \ - out12, out13, out14, out15) \ -{ \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, 
temp8, temp9; \ - \ - ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ - temp0, temp1, temp2, temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ - ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ - temp0, temp1, temp2, temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out8, out10); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out12, out14); \ - out0 = (v16u8)temp6; \ - out2 = (v16u8)temp7; \ - out4 = (v16u8)temp8; \ - out6 = (v16u8)temp9; \ - out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ - out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ - out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ - out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ - out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ -} - -#define VP8_AVER_IF_RETAIN(above2_in, above1_in, src_in, \ - below1_in, below2_in, ref, out) \ -{ \ - v16u8 temp0, temp1; \ - \ - temp1 = __msa_aver_u_b(above2_in, above1_in); \ - temp0 = __msa_aver_u_b(below2_in, below1_in); \ - temp1 = __msa_aver_u_b(temp1, temp0); \ - out = __msa_aver_u_b(src_in, temp1); \ - temp0 = __msa_asub_u_b(src_in, above2_in); \ - temp1 = __msa_asub_u_b(src_in, above1_in); \ - temp0 = (temp0 < ref); \ - temp1 = (temp1 < ref); \ - temp0 = temp0 & temp1; \ - temp1 = __msa_asub_u_b(src_in, below1_in); \ - temp1 = (temp1 < ref); \ - temp0 = temp0 & temp1; \ - temp1 = __msa_asub_u_b(src_in, below2_in); \ - temp1 = (temp1 < ref); \ - temp0 = temp0 & temp1; \ - out = __msa_bmz_v(out, src_in, temp0); \ -} - -#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, 
in9, in10, in11, in12, in13, in14, in15) \ -{ \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ - ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ - ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ - ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ - ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ - ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ - ILVRL_H2_SH(temp5, temp4, temp6, temp7); \ - ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \ - ILVRL_H2_SH(temp5, temp4, temp8, temp9); \ - ILVRL_W2_SH(temp8, temp6, temp4, temp5); \ - ILVRL_W2_SH(temp9, temp7, temp6, temp7); \ - ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \ - ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \ - in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \ - in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \ - ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \ - ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \ - in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \ - in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \ - ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \ - temp2, temp3, temp4, temp5); \ - ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \ - temp6, temp7, temp8, temp9); \ - ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \ - in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \ - in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \ - ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \ - in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \ - in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \ -} - -#define VP8_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \ - in6, in7, in8, in9, in10, in11) \ -{ \ - v8i16 temp0, temp1, temp2, temp3; \ - v8i16 temp4, temp5, temp6, temp7; \ - \ - ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ - 
ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ - ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ - ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ - ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ - ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \ - temp4 = __msa_ilvr_h(temp5, temp4); \ - ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \ - temp5 = __msa_ilvr_h(temp7, temp6); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - in0 = (v16u8)temp0; \ - in2 = (v16u8)temp1; \ - in4 = (v16u8)temp2; \ - in6 = (v16u8)temp3; \ - in8 = (v16u8)temp6; \ - in10 = (v16u8)temp7; \ - in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \ - in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \ - in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \ - in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \ - in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \ - in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \ -} - -static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, - int32_t src_stride, - int32_t dst_stride, - int32_t cols, uint8_t *f) -{ - uint8_t *p_src = src_ptr; - uint8_t *p_dst = dst_ptr; - uint8_t *f_orig = f; - uint8_t *p_dst_st = dst_ptr; - uint16_t col; - uint64_t out0, out1, out2, out3; - v16u8 above2, above1, below2, below1, src, ref, ref_temp; - v16u8 inter0, inter1, inter2, inter3, inter4, inter5; - v16u8 inter6, inter7, inter8, inter9, inter10, inter11; - - for (col = (cols / 16); col--;) - { - ref = LD_UB(f); - LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); - src = LD_UB(p_src); - LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); - above2 = LD_UB(p_src + 3 * src_stride); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); - above1 = LD_UB(p_src + 4 * src_stride); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); - src = LD_UB(p_src + 5 * src_stride); - 
VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); - below1 = LD_UB(p_src + 6 * src_stride); - VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); - below2 = LD_UB(p_src + 7 * src_stride); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); - above2 = LD_UB(p_src + 8 * src_stride); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); - above1 = LD_UB(p_src + 9 * src_stride); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); - ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7, - p_dst, dst_stride); - - p_dst += 16; - p_src += 16; - f += 16; - } - - if (0 != (cols / 16)) - { - ref = LD_UB(f); - LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); - src = LD_UB(p_src); - LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); - above2 = LD_UB(p_src + 3 * src_stride); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); - above1 = LD_UB(p_src + 4 * src_stride); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); - src = LD_UB(p_src + 5 * src_stride); - VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); - below1 = LD_UB(p_src + 6 * src_stride); - VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); - below2 = LD_UB(p_src + 7 * src_stride); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); - above2 = LD_UB(p_src + 8 * src_stride); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); - above1 = LD_UB(p_src + 9 * src_stride); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); - out0 = __msa_copy_u_d((v2i64)inter0, 0); - out1 = __msa_copy_u_d((v2i64)inter1, 0); - out2 = __msa_copy_u_d((v2i64)inter2, 0); - out3 = __msa_copy_u_d((v2i64)inter3, 0); - SD4(out0, out1, out2, out3, p_dst, dst_stride); - - out0 = __msa_copy_u_d((v2i64)inter4, 0); - 
out1 = __msa_copy_u_d((v2i64)inter5, 0); - out2 = __msa_copy_u_d((v2i64)inter6, 0); - out3 = __msa_copy_u_d((v2i64)inter7, 0); - SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); - } - - f = f_orig; - p_dst = dst_ptr - 2; - LD_UB8(p_dst, dst_stride, - inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7); - - for (col = 0; col < (cols / 8); ++col) - { - ref = LD_UB(f); - f += 8; - VP8_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, - inter4, inter5, inter6, inter7, - inter8, inter9, inter10, inter11); - if (0 == col) - { - above2 = inter2; - above1 = inter2; - } - else - { - above2 = inter0; - above1 = inter1; - } - src = inter2; - below1 = inter3; - below2 = inter4; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, - ref_temp, inter2); - above2 = inter5; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, - ref_temp, inter3); - above1 = inter6; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, - ref_temp, inter4); - src = inter7; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3); - VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, - ref_temp, inter5); - below1 = inter8; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4); - VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, - ref_temp, inter6); - below2 = inter9; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, - ref_temp, inter7); - if (col == (cols / 8 - 1)) - { - above2 = inter9; - } - else - { - above2 = inter10; - } - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, - ref_temp, inter8); - if (col == (cols / 8 - 1)) - { - above1 = inter9; - } - else - { - above1 = inter11; - } - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, - 
ref_temp, inter9); - TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, - inter8, inter9, inter2, inter3, inter4, inter5, - inter6, inter7, inter8, inter9); - p_dst += 8; - LD_UB2(p_dst, dst_stride, inter0, inter1); - ST8x1_UB(inter2, p_dst_st); - ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride)); - LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3); - ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride)); - ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride)); - LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5); - ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride)); - ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride)); - LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7); - ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride)); - ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride)); - p_dst_st += 8; - } -} - -static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, - int32_t src_stride, - int32_t dst_stride, - int32_t cols, uint8_t *f) -{ - uint8_t *p_src = src_ptr; - uint8_t *p_dst = dst_ptr; - uint8_t *p_dst_st = dst_ptr; - uint8_t *f_orig = f; - uint16_t col; - v16u8 above2, above1, below2, below1; - v16u8 src, ref, ref_temp; - v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6; - v16u8 inter7, inter8, inter9, inter10, inter11; - v16u8 inter12, inter13, inter14, inter15; - - for (col = (cols / 16); col--;) - { - ref = LD_UB(f); - LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); - src = LD_UB(p_src); - LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); - above2 = LD_UB(p_src + 3 * src_stride); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); - above1 = LD_UB(p_src + 4 * src_stride); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); - src = LD_UB(p_src + 5 * src_stride); - VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); - below1 = LD_UB(p_src + 6 * src_stride); - 
VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); - below2 = LD_UB(p_src + 7 * src_stride); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); - above2 = LD_UB(p_src + 8 * src_stride); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); - above1 = LD_UB(p_src + 9 * src_stride); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); - src = LD_UB(p_src + 10 * src_stride); - VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8); - below1 = LD_UB(p_src + 11 * src_stride); - VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9); - below2 = LD_UB(p_src + 12 * src_stride); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10); - above2 = LD_UB(p_src + 13 * src_stride); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11); - above1 = LD_UB(p_src + 14 * src_stride); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12); - src = LD_UB(p_src + 15 * src_stride); - VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13); - below1 = LD_UB(p_src + 16 * src_stride); - VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14); - below2 = LD_UB(p_src + 17 * src_stride); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15); - ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7, - p_dst, dst_stride); - ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, - inter14, inter15, p_dst + 8 * dst_stride, dst_stride); - p_src += 16; - p_dst += 16; - f += 16; - } - - f = f_orig; - p_dst = dst_ptr - 2; - LD_UB8(p_dst, dst_stride, - inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7); - LD_UB8(p_dst + 8 * dst_stride, dst_stride, - inter8, inter9, inter10, inter11, inter12, inter13, - inter14, inter15); - - for (col = 0; col < cols / 8; ++col) - { - ref = LD_UB(f); - f += 8; - TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, - inter6, 
inter7, inter8, inter9, inter10, inter11, - inter12, inter13, inter14, inter15); - if (0 == col) - { - above2 = inter2; - above1 = inter2; - } - else - { - above2 = inter0; - above1 = inter1; - } - - src = inter2; - below1 = inter3; - below2 = inter4; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, - ref_temp, inter2); - above2 = inter5; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, - ref_temp, inter3); - above1 = inter6; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, - ref_temp, inter4); - src = inter7; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3); - VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, - ref_temp, inter5); - below1 = inter8; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4); - VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, - ref_temp, inter6); - below2 = inter9; - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5); - VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, - ref_temp, inter7); - if (col == (cols / 8 - 1)) - { - above2 = inter9; - } - else - { - above2 = inter10; - } - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6); - VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, - ref_temp, inter8); - if (col == (cols / 8 - 1)) - { - above1 = inter9; - } - else - { - above1 = inter11; - } - ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7); - VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, - ref_temp, inter9); - VP8_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, - inter6, inter7, inter8, inter9, - inter2, inter3, inter4, inter5, - inter6, inter7, inter8, inter9, - inter10, inter11, inter12, inter13, - inter14, inter15, above2, above1); - - p_dst += 8; - LD_UB2(p_dst, dst_stride, inter0, inter1); - ST8x1_UB(inter2, p_dst_st); - ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride)); - LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, 
inter3); - ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride)); - ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride)); - LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5); - ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride)); - ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride)); - LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7); - ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride)); - ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride)); - LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9); - ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride)); - ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride)); - LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11); - ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride)); - ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride)); - LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13); - ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride)); - ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride)); - LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15); - ST8x1_UB(above2, (p_dst_st + 14 * dst_stride)); - ST8x1_UB(above1, (p_dst_st + 15 * dst_stride)); - p_dst_st += 8; - } -} - -void vp8_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst, - int32_t src_stride, - int32_t dst_stride, - int32_t cols, uint8_t *f, - int32_t size) -{ - if (8 == size) - { - postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, - cols, f); - } - else if (16 == size) - { - postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, - cols, f); - } -} - -void vp8_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, - int32_t rows, int32_t cols, int32_t flimit) -{ - int32_t row, col, cnt; - uint8_t *src_dup = src_ptr; - v16u8 src0, src, tmp_orig; - v16u8 tmp = { 0 }; - v16i8 zero = { 0 }; - v8u16 sum_h, src_r_h, src_l_h; - v4u32 src_r_w, src_l_w; - v4i32 flimit_vec; - - flimit_vec = __msa_fill_w(flimit); - for (row = rows; row--;) - { - int32_t sum_sq = 0; - int32_t sum = 0; - src0 = (v16u8)__msa_fill_b(src_dup[0]); - ST8x1_UB(src0, 
(src_dup - 8)); - - src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]); - ST_UB(src0, src_dup + cols); - src_dup[cols + 16] = src_dup[cols - 1]; - tmp_orig = (v16u8)__msa_ldi_b(0); - tmp_orig[15] = tmp[15]; - src = LD_UB(src_dup - 8); - src[15] = 0; - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); - src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); - src_l_w = __msa_dotp_u_w(src_l_h, src_l_h); - sum_sq = HADD_SW_S32(src_r_w); - sum_sq += HADD_SW_S32(src_l_w); - sum_h = __msa_hadd_u_h(src, src); - sum = HADD_UH_U32(sum_h); - { - v16u8 src7, src8, src_r, src_l; - v16i8 mask; - v8u16 add_r, add_l; - v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1; - v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3; - v4i32 sub0, sub1, sub2, sub3; - v4i32 sum0_w, sum1_w, sum2_w, sum3_w; - v4i32 mul0, mul1, mul2, mul3; - v4i32 total0, total1, total2, total3; - v8i16 const8 = __msa_fill_h(8); - - src7 = LD_UB(src_dup + 7); - src8 = LD_UB(src_dup - 8); - for (col = 0; col < (cols >> 4); ++col) - { - ILVRL_B2_UB(src7, src8, src_r, src_l); - HSUB_UB2_SH(src_r, src_l, sub_r, sub_l); - - sum_r[0] = sum + sub_r[0]; - for (cnt = 0; cnt < 7; ++cnt) - { - sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1]; - } - sum_l[0] = sum_r[7] + sub_l[0]; - for (cnt = 0; cnt < 7; ++cnt) - { - sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1]; - } - sum = sum_l[7]; - src = LD_UB(src_dup + 16 * col); - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); - src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4); - src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4); - tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7); - - HADD_UB2_UH(src_r, src_l, add_r, add_l); - UNPCK_SH_SW(sub_r, sub0, sub1); - UNPCK_SH_SW(sub_l, sub2, sub3); - ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w); - ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w); - MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, - mul0, mul1, mul2, mul3); - sum_sq0[0] = sum_sq + mul0[0]; - for (cnt = 0; cnt < 3; ++cnt) - { - sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1]; - } - 
sum_sq1[0] = sum_sq0[3] + mul1[0]; - for (cnt = 0; cnt < 3; ++cnt) - { - sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1]; - } - sum_sq2[0] = sum_sq1[3] + mul2[0]; - for (cnt = 0; cnt < 3; ++cnt) - { - sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1]; - } - sum_sq3[0] = sum_sq2[3] + mul3[0]; - for (cnt = 0; cnt < 3; ++cnt) - { - sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1]; - } - sum_sq = sum_sq3[3]; - - UNPCK_SH_SW(sum_r, sum0_w, sum1_w); - UNPCK_SH_SW(sum_l, sum2_w, sum3_w); - total0 = sum_sq0 * __msa_ldi_w(15); - total0 -= sum0_w * sum0_w; - total1 = sum_sq1 * __msa_ldi_w(15); - total1 -= sum1_w * sum1_w; - total2 = sum_sq2 * __msa_ldi_w(15); - total2 -= sum2_w * sum2_w; - total3 = sum_sq3 * __msa_ldi_w(15); - total3 -= sum3_w * sum3_w; - total0 = (total0 < flimit_vec); - total1 = (total1 < flimit_vec); - total2 = (total2 < flimit_vec); - total3 = (total3 < flimit_vec); - PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); - mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); - tmp = __msa_bmz_v(tmp, src, (v16u8)mask); - - if (col == 0) - { - uint64_t src_d; - - src_d = __msa_copy_u_d((v2i64)tmp_orig, 1); - SD(src_d, (src_dup - 8)); - } - - src7 = LD_UB(src_dup + 16 * (col + 1) + 7); - src8 = LD_UB(src_dup + 16 * (col + 1) - 8); - ST_UB(tmp, (src_dup + (16 * col))); - } - - src_dup += pitch; - } - } -} - -void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, - int32_t cols, int32_t flimit) -{ - int32_t row, col, cnt, i; - const int16_t *rv3 = &vp8_rv_msa[63 & rand()]; - v4i32 flimit_vec; - v16u8 dst7, dst8, dst_r_b, dst_l_b; - v16i8 mask; - v8u16 add_r, add_l; - v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1; - v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3; - - flimit_vec = __msa_fill_w(flimit); - - for (col = 0; col < (cols >> 4); ++col) - { - uint8_t *dst_tmp = &dst_ptr[col << 4]; - v16u8 dst; - v16i8 zero = { 0 }; - v16u8 tmp[16]; - v8i16 mult0, mult1, rv2_0, rv2_1; - v8i16 sum0_h = { 0 }; - v8i16 sum1_h = { 0 
}; - v4i32 mul0 = { 0 }; - v4i32 mul1 = { 0 }; - v4i32 mul2 = { 0 }; - v4i32 mul3 = { 0 }; - v4i32 sum0_w, sum1_w, sum2_w, sum3_w; - v4i32 add0, add1, add2, add3; - const int16_t *rv2[16]; - - dst = LD_UB(dst_tmp); - for (cnt = (col << 4), i = 0; i < 16; ++cnt) - { - rv2[i] = rv3 + ((cnt * 17) & 127); - ++i; - } - for (cnt = -8; cnt < 0; ++cnt) - { - ST_UB(dst, dst_tmp + cnt * pitch); - } - - dst = LD_UB((dst_tmp + (rows - 1) * pitch)); - for (cnt = rows; cnt < rows + 17; ++cnt) - { - ST_UB(dst, dst_tmp + cnt * pitch); - } - for (cnt = -8; cnt <= 6; ++cnt) - { - dst = LD_UB(dst_tmp + (cnt * pitch)); - UNPCK_UB_SH(dst, dst_r_h, dst_l_h); - MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1); - mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0); - mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0); - mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1); - mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1); - ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h); - } - - for (row = 0; row < (rows + 8); ++row) - { - for (i = 0; i < 8; ++i) - { - rv2_0[i] = *(rv2[i] + (row & 127)); - rv2_1[i] = *(rv2[i + 8] + (row & 127)); - } - dst7 = LD_UB(dst_tmp + (7 * pitch)); - dst8 = LD_UB(dst_tmp - (8 * pitch)); - ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b); - - HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l); - UNPCK_SH_SW(sub_r, sub0, sub1); - UNPCK_SH_SW(sub_l, sub2, sub3); - sum0_h += sub_r; - sum1_h += sub_l; - - HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l); - - ILVRL_H2_SW(zero, add_r, add0, add1); - ILVRL_H2_SW(zero, add_l, add2, add3); - mul0 += add0 * sub0; - mul1 += add1 * sub1; - mul2 += add2 * sub2; - mul3 += add3 * sub3; - dst = LD_UB(dst_tmp); - ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h); - dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4); - dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4); - tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7); - - UNPCK_SH_SW(sum0_h, sum0_w, sum1_w); - UNPCK_SH_SW(sum1_h, sum2_w, sum3_w); - total0 = mul0 * 
__msa_ldi_w(15); - total0 -= sum0_w * sum0_w; - total1 = mul1 * __msa_ldi_w(15); - total1 -= sum1_w * sum1_w; - total2 = mul2 * __msa_ldi_w(15); - total2 -= sum2_w * sum2_w; - total3 = mul3 * __msa_ldi_w(15); - total3 -= sum3_w * sum3_w; - total0 = (total0 < flimit_vec); - total1 = (total1 < flimit_vec); - total2 = (total2 < flimit_vec); - total3 = (total3 < flimit_vec); - PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); - mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); - tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask); - - if (row >= 8) - { - ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch)); - } - - dst_tmp += pitch; - } - } -} diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index 6baf00f1e..cd3bb95f1 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -72,142 +72,11 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = }; #endif -const short vp8_rv[] = -{ - 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, - 0, 3, 9, 0, 0, 0, 8, 3, 14, 4, - 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, - 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, - 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, - 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, - 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, - 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, - 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, - 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, - 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, - 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, - 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, - 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, - 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, - 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, - 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, - 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, - 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, - 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, - 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, - 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, - 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, - 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, - 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, - 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, - 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, - 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, - 
5, 2, 13, 7, 1, 13, 14, 7, 6, 7, - 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, - 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, - 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, - 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, - 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, - 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, - 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, - 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, - 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, - 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, - 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, -}; extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch); extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch); /*********************************************************************************************************** */ -void vp8_post_proc_down_and_across_mb_row_c -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int cols, - unsigned char *f, - int size -) -{ - unsigned char *p_src, *p_dst; - int row; - int col; - unsigned char v; - unsigned char d[4]; - - for (row = 0; row < size; row++) - { - /* post_proc_down for one row */ - p_src = src_ptr; - p_dst = dst_ptr; - - for (col = 0; col < cols; col++) - { - unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line]; - unsigned char p_above1 = p_src[col - src_pixels_per_line]; - unsigned char p_below1 = p_src[col + src_pixels_per_line]; - unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line]; - - v = p_src[col]; - - if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) - && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) - { - unsigned char k1, k2, k3; - k1 = (p_above2 + p_above1 + 1) >> 1; - k2 = (p_below2 + p_below1 + 1) >> 1; - k3 = (k1 + k2 + 1) >> 1; - v = (k3 + v + 1) >> 1; - } - - p_dst[col] = v; - } - - /* now post_proc_across */ - p_src = dst_ptr; - p_dst = dst_ptr; - - p_src[-2] = p_src[-1] = p_src[0]; - p_src[cols] = p_src[cols + 1] = p_src[cols - 
1]; - - for (col = 0; col < cols; col++) - { - v = p_src[col]; - - if ((abs(v - p_src[col - 2]) < f[col]) - && (abs(v - p_src[col - 1]) < f[col]) - && (abs(v - p_src[col + 1]) < f[col]) - && (abs(v - p_src[col + 2]) < f[col])) - { - unsigned char k1, k2, k3; - k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; - k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; - k3 = (k1 + k2 + 1) >> 1; - v = (k3 + v + 1) >> 1; - } - - d[col & 3] = v; - - if (col >= 2) - p_dst[col - 2] = d[(col - 2) & 3]; - } - - /* handle the last two pixels */ - p_dst[col - 2] = d[(col - 2) & 3]; - p_dst[col - 1] = d[(col - 1) & 3]; - - /* next row */ - src_ptr += src_pixels_per_line; - dst_ptr += dst_pixels_per_line; - } -} - static int q2mbl(int x) { if (x < 20) x = 20; @@ -216,108 +85,13 @@ static int q2mbl(int x) return x * x / 3; } -void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit) -{ - int r, c, i; - - unsigned char *s = src; - unsigned char d[16]; - - for (r = 0; r < rows; r++) - { - int sumsq = 0; - int sum = 0; - - for (i = -8; i < 0; i++) - s[i]=s[0]; - - /* 17 avoids valgrind warning - we buffer values in c in d - * and only write them when we've read 8 ahead... 
- */ - for (i = 0; i < 17; i++) - s[i+cols]=s[cols-1]; - - for (i = -8; i <= 6; i++) - { - sumsq += s[i] * s[i]; - sum += s[i]; - d[i+8] = 0; - } - - for (c = 0; c < cols + 8; c++) - { - int x = s[c+7] - s[c-8]; - int y = s[c+7] + s[c-8]; - - sum += x; - sumsq += x * y; - - d[c&15] = s[c]; - - if (sumsq * 15 - sum * sum < flimit) - { - d[c&15] = (8 + sum + s[c]) >> 4; - } - - s[c-8] = d[(c-8)&15]; - } - - s += pitch; - } -} - -void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit) -{ - int r, c, i; - const short *rv3 = &vp8_rv[63&rand()]; - - for (c = 0; c < cols; c++ ) - { - unsigned char *s = &dst[c]; - int sumsq = 0; - int sum = 0; - unsigned char d[16]; - const short *rv2 = rv3 + ((c * 17) & 127); - - for (i = -8; i < 0; i++) - s[i*pitch]=s[0]; - - /* 17 avoids valgrind warning - we buffer values in c in d - * and only write them when we've read 8 ahead... - */ - for (i = 0; i < 17; i++) - s[(i+rows)*pitch]=s[(rows-1)*pitch]; - - for (i = -8; i <= 6; i++) - { - sumsq += s[i*pitch] * s[i*pitch]; - sum += s[i*pitch]; - } - - for (r = 0; r < rows + 8; r++) - { - sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch]; - sum += s[7*pitch] - s[-8*pitch]; - d[r&15] = s[0]; - - if (sumsq * 15 - sum * sum < flimit) - { - d[r&15] = (rv2[r&127] + sum + s[0]) >> 4; - } - if (r >= 8) - s[-8*pitch] = d[(r-8)&15]; - s += pitch; - } - } -} - #if CONFIG_POSTPROC static void vp8_de_mblock(YV12_BUFFER_CONFIG *post, int q) { - vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, + vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q)); - vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, + vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q)); } @@ -365,16 +139,16 @@ void vp8_deblock(VP8_COMMON *cm, } mode_info_context++; - vp8_post_proc_down_and_across_mb_row( + vpx_post_proc_down_and_across_mb_row( source->y_buffer 
+ 16 * mbr * source->y_stride, post->y_buffer + 16 * mbr * post->y_stride, source->y_stride, post->y_stride, source->y_width, ylimits, 16); - vp8_post_proc_down_and_across_mb_row( + vpx_post_proc_down_and_across_mb_row( source->u_buffer + 8 * mbr * source->uv_stride, post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride, post->uv_stride, source->uv_width, uvlimits, 8); - vp8_post_proc_down_and_across_mb_row( + vpx_post_proc_down_and_across_mb_row( source->v_buffer + 8 * mbr * source->uv_stride, post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride, post->uv_stride, source->uv_width, uvlimits, 8); @@ -409,17 +183,17 @@ void vp8_de_noise(VP8_COMMON *cm, /* TODO: The original code don't filter the 2 outer rows and columns. */ for (mbr = 0; mbr < mb_rows; mbr++) { - vp8_post_proc_down_and_across_mb_row( + vpx_post_proc_down_and_across_mb_row( source->y_buffer + 16 * mbr * source->y_stride, source->y_buffer + 16 * mbr * source->y_stride, source->y_stride, source->y_stride, source->y_width, limits, 16); if (uvfilter == 1) { - vp8_post_proc_down_and_across_mb_row( + vpx_post_proc_down_and_across_mb_row( source->u_buffer + 8 * mbr * source->uv_stride, source->u_buffer + 8 * mbr * source->uv_stride, source->uv_stride, source->uv_stride, source->uv_width, limits, 8); - vp8_post_proc_down_and_across_mb_row( + vpx_post_proc_down_and_across_mb_row( source->v_buffer + 8 * mbr * source->uv_stride, source->v_buffer + 8 * mbr * source->uv_stride, source->uv_stride, source->uv_stride, source->uv_width, limits, diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 856ede189..a440352f4 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -156,16 +156,6 @@ $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2; # Postproc # if (vpx_config("CONFIG_POSTPROC") eq "yes") { - add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vp8_mbpost_proc_down mmx sse2 msa/; - 
$vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm; - - add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vp8_mbpost_proc_across_ip sse2 msa/; - $vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm; - - add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; - specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/; add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; # no asm yet diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm deleted file mode 100644 index 61fe8854d..000000000 --- a/vp8/common/x86/postproc_mmx.asm +++ /dev/null @@ -1,253 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -%define VP8_FILTER_WEIGHT 128 -%define VP8_FILTER_SHIFT 7 - -;void vp8_mbpost_proc_down_mmx(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vp8_rv) -global sym(vp8_mbpost_proc_down_mmx) PRIVATE -sym(vp8_mbpost_proc_down_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 136 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax -%define flimit2 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vp8_rv))] -%endif - - ;rows +=8; - add dword ptr arg(2), 8 - - ;for(c=0; c<cols; c+=4) -.loop_col: - mov rsi, arg(0) ;s - pxor mm0, mm0 ; - - movsxd rax, dword ptr arg(1) ;pitch ; - - ; this copies the last row down into the border 8 rows - mov rdi, rsi - mov rdx, arg(2) - sub rdx, 9 - imul rdx, rax - lea rdi, [rdi+rdx] - movq mm1, QWORD ptr[rdi] ; first row - mov rcx, 8 -.init_borderd: ; initialize borders - lea rdi, [rdi + rax] - movq [rdi], mm1 - - dec rcx - jne .init_borderd - - neg rax ; rax = -pitch - - ; this copies the first row up into the border 8 rows - mov rdi, rsi - movq mm1, QWORD ptr[rdi] ; first row - mov rcx, 8 -.init_border: ; initialize borders - lea rdi, [rdi + rax] - movq [rdi], mm1 - - dec rcx - jne .init_border - - - lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] - neg rax - - - pxor mm5, mm5 - pxor mm6, mm6 ; - - pxor mm7, mm7 ; - mov rdi, rsi - - mov rcx, 15 ; - -.loop_initvar: - movd mm1, DWORD PTR [rdi]; - punpcklbw mm1, mm0 ; - - paddw mm5, mm1 ; - pmullw mm1, mm1 ; - - movq mm2, mm1 ; - punpcklwd mm1, mm0 ; - - punpckhwd mm2, mm0 ; - paddd mm6, mm1 ; - - paddd mm7, mm2 ; - lea rdi, [rdi+rax] ; - - dec rcx - jne .loop_initvar - ;save the var and sum - xor rdx, rdx -.loop_row: - movd mm1, DWORD PTR [rsi] ; [s-pitch*8] - movd mm2, DWORD PTR [rdi] ; [s+pitch*7] - - punpcklbw mm1, mm0 - 
punpcklbw mm2, mm0 - - paddw mm5, mm2 - psubw mm5, mm1 - - pmullw mm2, mm2 - movq mm4, mm2 - - punpcklwd mm2, mm0 - punpckhwd mm4, mm0 - - paddd mm6, mm2 - paddd mm7, mm4 - - pmullw mm1, mm1 - movq mm2, mm1 - - punpcklwd mm1, mm0 - psubd mm6, mm1 - - punpckhwd mm2, mm0 - psubd mm7, mm2 - - - movq mm3, mm6 - pslld mm3, 4 - - psubd mm3, mm6 - movq mm1, mm5 - - movq mm4, mm5 - pmullw mm1, mm1 - - pmulhw mm4, mm4 - movq mm2, mm1 - - punpcklwd mm1, mm4 - punpckhwd mm2, mm4 - - movq mm4, mm7 - pslld mm4, 4 - - psubd mm4, mm7 - - psubd mm3, mm1 - psubd mm4, mm2 - - psubd mm3, flimit2 - psubd mm4, flimit2 - - psrad mm3, 31 - psrad mm4, 31 - - packssdw mm3, mm4 - packsswb mm3, mm0 - - movd mm1, DWORD PTR [rsi+rax*8] - - movq mm2, mm1 - punpcklbw mm1, mm0 - - paddw mm1, mm5 - mov rcx, rdx - - and rcx, 127 -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - push rax - lea rax, [GLOBAL(sym(vp8_rv))] - movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2] - pop rax -%elif ABI_IS_32BIT=0 - movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2] -%else - movq mm4, [sym(vp8_rv) + rcx*2] -%endif - paddw mm1, mm4 - psraw mm1, 4 - - packuswb mm1, mm0 - pand mm1, mm3 - - pandn mm3, mm2 - por mm1, mm3 - - and rcx, 15 - movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4] - - cmp edx, 8 - jl .skip_assignment - - mov rcx, rdx - sub rcx, 8 - and rcx, 15 - movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] - movd [rsi], mm1 - -.skip_assignment: - lea rsi, [rsi+rax] - - lea rdi, [rdi+rax] - add rdx, 1 - - cmp edx, dword arg(2) ;rows - jl .loop_row - - - add dword arg(0), 4 ; s += 4 - sub dword arg(3), 4 ; cols -= 4 - cmp dword arg(3), 0 - jg .loop_col - - add rsp, 136 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -%undef flimit2 - - -SECTION_RODATA -align 16 -Blur: - times 16 dw 16 - times 8 dw 64 - times 16 dw 16 - times 8 dw 0 - -rd: - times 4 dw 0x40 diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 7c362d764..63a918838 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -96,9 +96,7 @@ 
VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm -VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm endif ifeq ($(ARCH_X86_64),yes) @@ -123,7 +121,6 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c -VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/postproc_msa.c endif # common (c) diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 7dd1005d3..b4b120bee 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -103,6 +103,8 @@ void vp9_free_postproc_buffers(VP9_COMMON *cm) { #if CONFIG_VP9_POSTPROC vpx_free_frame_buffer(&cm->post_proc_buffer); vpx_free_frame_buffer(&cm->post_proc_buffer_int); + vpx_free(cm->postproc_state.limits); + cm->postproc_state.limits = 0; #else (void)cm; #endif diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index 5dad81d64..6a71a19fa 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -32,129 +32,9 @@ static const int16_t kernel5[] = { 1, 1, 4, 1, 1 }; -const int16_t vp9_rv[] = { - 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, - 0, 3, 9, 0, 0, 0, 8, 3, 14, 4, - 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, - 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, - 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, - 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, - 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, - 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, - 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, - 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, - 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, - 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, - 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, - 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, - 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, - 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, - 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, - 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, - 4, 4, 0, 8, 7, 10, 0, 8, 
14, 11, - 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, - 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, - 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, - 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, - 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, - 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, - 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, - 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, - 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, - 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, - 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, - 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, - 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, - 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, - 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, - 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, - 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, - 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, - 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, - 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, - 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, -}; - static const uint8_t q_diff_thresh = 20; static const uint8_t last_q_thresh = 170; -void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit) { - uint8_t const *p_src; - uint8_t *p_dst; - int row, col, i, v, kernel; - int pitch = src_pixels_per_line; - uint8_t d[8]; - (void)dst_pixels_per_line; - - for (row = 0; row < rows; row++) { - /* post_proc_down for one row */ - p_src = src_ptr; - p_dst = dst_ptr; - - for (col = 0; col < cols; col++) { - kernel = 4; - v = p_src[col]; - - for (i = -2; i <= 2; i++) { - if (abs(v - p_src[col + i * pitch]) > flimit) - goto down_skip_convolve; - - kernel += kernel5[2 + i] * p_src[col + i * pitch]; - } - - v = (kernel >> 3); - down_skip_convolve: - p_dst[col] = v; - } - - /* now post_proc_across */ - p_src = dst_ptr; - p_dst = dst_ptr; - - for (i = 0; i < 8; i++) - d[i] = p_src[i]; - - for (col = 0; col < cols; col++) { - kernel = 4; - v = p_src[col]; - - d[col & 7] = v; - - for (i = -2; i <= 2; i++) { - if (abs(v - p_src[col + i]) > 
flimit) - goto across_skip_convolve; - - kernel += kernel5[2 + i] * p_src[col + i]; - } - - d[col & 7] = (kernel >> 3); - across_skip_convolve: - - if (col >= 2) - p_dst[col - 2] = d[(col - 2) & 7]; - } - - /* handle the last two pixels */ - p_dst[col - 2] = d[(col - 2) & 7]; - p_dst[col - 1] = d[(col - 1) & 7]; - - - /* next row */ - src_ptr += pitch; - dst_ptr += pitch; - } -} - #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr, uint16_t *dst_ptr, @@ -237,41 +117,6 @@ static int q2mbl(int x) { return x * x / 3; } -void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch, - int rows, int cols, int flimit) { - int r, c, i; - uint8_t *s = src; - uint8_t d[16]; - - for (r = 0; r < rows; r++) { - int sumsq = 0; - int sum = 0; - - for (i = -8; i <= 6; i++) { - sumsq += s[i] * s[i]; - sum += s[i]; - d[i + 8] = 0; - } - - for (c = 0; c < cols + 8; c++) { - int x = s[c + 7] - s[c - 8]; - int y = s[c + 7] + s[c - 8]; - - sum += x; - sumsq += x * y; - - d[c & 15] = s[c]; - - if (sumsq * 15 - sum * sum < flimit) { - d[c & 15] = (8 + sum + s[c]) >> 4; - } - - s[c - 8] = d[(c - 8) & 15]; - } - s += pitch; - } -} - #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit) { @@ -312,43 +157,12 @@ void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, } #endif // CONFIG_VP9_HIGHBITDEPTH -void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, - int rows, int cols, int flimit) { - int r, c, i; - const short *rv3 = &vp9_rv[63 & rand()]; // NOLINT - - for (c = 0; c < cols; c++) { - uint8_t *s = &dst[c]; - int sumsq = 0; - int sum = 0; - uint8_t d[16]; - const int16_t *rv2 = rv3 + ((c * 17) & 127); - - for (i = -8; i <= 6; i++) { - sumsq += s[i * pitch] * s[i * pitch]; - sum += s[i * pitch]; - } - - for (r = 0; r < rows + 8; r++) { - sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch]; - sum += s[7 * pitch] - s[-8 * pitch]; - d[r & 15] = s[0]; - 
- if (sumsq * 15 - sum * sum < flimit) { - d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; - } - - s[-8 * pitch] = d[(r - 8) & 15]; - s += pitch; - } - } -} #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch, int rows, int cols, int flimit) { int r, c, i; - const int16_t *rv3 = &vp9_rv[63 & rand()]; // NOLINT + const int16_t *rv3 = &vpx_rv[63 & rand()]; // NOLINT for (c = 0; c < cols; c++) { uint16_t *s = &dst[c]; @@ -382,14 +196,14 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, - int flag) { - double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; - int ppl = (int)(level + .5); + int flag, + uint8_t *limits) { (void) low_var_thresh; (void) flag; - #if CONFIG_VP9_HIGHBITDEPTH if (source->flags & YV12_FLAG_HIGHBITDEPTH) { + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer), CONVERT_TO_SHORTPTR(post->y_buffer), source->y_stride, post->y_stride, @@ -415,124 +229,68 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source, source->uv_height, source->uv_width, ppl); } else { - vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer, - source->y_stride, post->y_stride, - source->y_height, source->y_width, ppl); - - vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, +#endif // CONFIG_VP9_HIGHBITDEPTH + vp9_deblock(source, post, q, limits); + vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q)); - - vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, + vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q)); - - vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer, - source->uv_stride, post->uv_stride, - source->uv_height, source->uv_width, ppl); - 
vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer, - source->uv_stride, post->uv_stride, - source->uv_height, source->uv_width, ppl); +#if CONFIG_VP9_HIGHBITDEPTH } -#else - vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer, - source->y_stride, post->y_stride, - source->y_height, source->y_width, ppl); - - vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, - post->y_width, q2mbl(q)); - - vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, - post->y_width, q2mbl(q)); - - vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer, - source->uv_stride, post->uv_stride, - source->uv_height, source->uv_width, ppl); - vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer, - source->uv_stride, post->uv_stride, - source->uv_height, source->uv_width, ppl); #endif // CONFIG_VP9_HIGHBITDEPTH } void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, - int q) { + int q, uint8_t *limits) { const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q + 0.0065 + 0.5); - int i; - - const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; - const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; - const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width}; - const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - - uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; - const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; - - for (i = 0; i < MAX_MB_PLANE; ++i) { #if CONFIG_VP9_HIGHBITDEPTH - assert((src->flags & YV12_FLAG_HIGHBITDEPTH) == - (dst->flags & YV12_FLAG_HIGHBITDEPTH)); - if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + int i; + const uint8_t * const srcs[3] = + {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = 
{src->y_width, src->uv_width, src->uv_width}; + const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; + + uint8_t * const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; + for (i = 0; i < MAX_MB_PLANE; ++i) { vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(srcs[i]), CONVERT_TO_SHORTPTR(dsts[i]), src_strides[i], dst_strides[i], src_heights[i], src_widths[i], ppl); - } else { - vp9_post_proc_down_and_across(srcs[i], dsts[i], - src_strides[i], dst_strides[i], - src_heights[i], src_widths[i], ppl); } -#else - vp9_post_proc_down_and_across(srcs[i], dsts[i], - src_strides[i], dst_strides[i], - src_heights[i], src_widths[i], ppl); + } else { #endif // CONFIG_VP9_HIGHBITDEPTH + int mbr; + const int mb_rows = src->y_height / 16; + const int mb_cols = src->y_width / 16; + + memset(limits, (unsigned char) ppl, 16 * mb_cols); + + for (mbr = 0; mbr < mb_rows; mbr++) { + vpx_post_proc_down_and_across_mb_row( + src->y_buffer + 16 * mbr * src->y_stride, + dst->y_buffer + 16 * mbr * dst->y_stride, src->y_stride, + dst->y_stride, src->y_width, limits, 16); + vpx_post_proc_down_and_across_mb_row( + src->u_buffer + 8 * mbr * src->uv_stride, + dst->u_buffer + 8 * mbr * dst->uv_stride, src->uv_stride, + dst->uv_stride, src->uv_width, limits, 8); + vpx_post_proc_down_and_across_mb_row( + src->v_buffer + 8 * mbr * src->uv_stride, + dst->v_buffer + 8 * mbr * dst->uv_stride, src->uv_stride, + dst->uv_stride, src->uv_width, limits, 8); + } +#if CONFIG_VP9_HIGHBITDEPTH } +#endif // CONFIG_VP9_HIGHBITDEPTH } void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, - int q) { - const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q - + 0.0065 + 0.5); - int i; - - const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; - const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; - const int src_widths[3] = 
{src->y_width, src->uv_width, src->uv_width}; - const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - - uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; - const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int src_stride = src_strides[i]; - const int src_width = src_widths[i] - 4; - const int src_height = src_heights[i] - 4; - const int dst_stride = dst_strides[i]; - -#if CONFIG_VP9_HIGHBITDEPTH - assert((src->flags & YV12_FLAG_HIGHBITDEPTH) == - (dst->flags & YV12_FLAG_HIGHBITDEPTH)); - if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - const uint16_t *const src_plane = CONVERT_TO_SHORTPTR( - srcs[i] + 2 * src_stride + 2); - uint16_t *const dst_plane = CONVERT_TO_SHORTPTR( - dsts[i] + 2 * dst_stride + 2); - vp9_highbd_post_proc_down_and_across(src_plane, dst_plane, src_stride, - dst_stride, src_height, src_width, - ppl); - } else { - const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2; - uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2; - - vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, - dst_stride, src_height, src_width, ppl); - } -#else - const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2; - uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2; - vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, dst_stride, - src_height, src_width, ppl); -#endif - } + int q, uint8_t *limits) { + vp9_deblock(src, dst, q, limits); } static double gaussian(double sigma, double mu, double x) { @@ -664,6 +422,14 @@ int vp9_post_proc_frame(struct VP9Common *cm, vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate post-processing buffer"); + if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) { + if (!cm->postproc_state.limits) { + cm->postproc_state.limits = vpx_calloc( + cm->width, sizeof(*cm->postproc_state.limits)); + } + } + + if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 && 
ppstate->last_frame_valid && cm->bit_depth == 8 && ppstate->last_base_qindex <= last_q_thresh && @@ -678,17 +444,19 @@ int vp9_post_proc_frame(struct VP9Common *cm, if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) { deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf, q + (ppflags->deblocking_level - 5) * 10, - 1, 0); + 1, 0, cm->postproc_state.limits); } else if (flags & VP9D_DEBLOCK) { - vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q); + vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q, + cm->postproc_state.limits); } else { vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); } } else if (flags & VP9D_DEMACROBLOCK) { deblock_and_de_macro_block(cm->frame_to_show, ppbuf, - q + (ppflags->deblocking_level - 5) * 10, 1, 0); + q + (ppflags->deblocking_level - 5) * 10, 1, 0, + cm->postproc_state.limits); } else if (flags & VP9D_DEBLOCK) { - vp9_deblock(cm->frame_to_show, ppbuf, q); + vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits); } else { vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); } diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h index 035c9cdf8..60e6f5232 100644 --- a/vp9/common/vp9_postproc.h +++ b/vp9/common/vp9_postproc.h @@ -33,6 +33,7 @@ struct postproc_state { DECLARE_ALIGNED(16, char, blackclamp[16]); DECLARE_ALIGNED(16, char, whiteclamp[16]); DECLARE_ALIGNED(16, char, bothclamp[16]); + uint8_t *limits; }; struct VP9Common; @@ -42,9 +43,11 @@ struct VP9Common; int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags); -void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); +void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, + uint8_t *limits); -void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); +void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, + uint8_t *limits); #ifdef __cplusplus } // extern "C" diff --git 
a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 276f14554..f315a3b85 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -35,18 +35,6 @@ if ($opts{arch} eq "x86_64") { # post proc # if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { -add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit"; -specialize qw/vp9_mbpost_proc_down sse2/; -$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm; - -add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit"; -specialize qw/vp9_mbpost_proc_across_ip sse2/; -$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm; - -add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"; -specialize qw/vp9_post_proc_down_and_across sse2/; -$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm; - add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; specialize qw/vp9_filter_by_weight16x16 sse2 msa/; diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm deleted file mode 100644 index 430762815..000000000 --- a/vp9/common/x86/vp9_postproc_sse2.asm +++ /dev/null @@ -1,632 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_post_proc_down_and_across_xmm -;( -; unsigned char *src_ptr, -; unsigned char *dst_ptr, -; int src_pixels_per_line, -; int dst_pixels_per_line, -; int rows, -; int cols, -; int flimit -;) -global sym(vp9_post_proc_down_and_across_xmm) PRIVATE -sym(vp9_post_proc_down_and_across_xmm): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - ALIGN_STACK 16, rax - ; move the global rd onto the stack, since we don't have enough registers - ; to do PIC addressing - movdqa xmm0, [GLOBAL(rd42)] - sub rsp, 16 - movdqa [rsp], xmm0 -%define RD42 [rsp] -%else -%define RD42 [GLOBAL(rd42)] -%endif - - - movd xmm2, dword ptr arg(6) ;flimit - punpcklwd xmm2, xmm2 - punpckldq xmm2, xmm2 - punpcklqdq xmm2, xmm2 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr - - movsxd rcx, DWORD PTR arg(4) ;rows - movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
- pxor xmm0, xmm0 ; mm0 = 00000000 - -.nextrow: - - xor rdx, rdx ; clear out rdx for use as loop counter -.nextcol: - movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7 - punpcklbw xmm3, xmm0 ; mm3 = p0..p3 - movdqa xmm1, xmm3 ; mm1 = p0..p3 - psllw xmm3, 2 ; - - movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7 - punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3 - paddusw xmm3, xmm5 ; mm3 += mm6 - - ; thresholding - movdqa xmm7, xmm1 ; mm7 = r0 p0..p3 - psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3 - psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3 - paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw xmm7, xmm2 - - movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7 - punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 - psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3 - psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - - neg rax - movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7 - punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 - psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7 - punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3 - paddusw xmm3, xmm4 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 - psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3 - paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - - paddusw xmm3, RD42 ; mm3 += round value - psraw xmm3, 3 ; mm3 /= 8 - - pand xmm1, xmm7 ; mm1 select vals > thresh from source - pandn xmm7, 
xmm3 ; mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; combination - - packuswb xmm1, xmm0 ; pack to bytes - movq QWORD PTR [rdi], xmm1 ; - - neg rax ; pitch is positive - add rsi, 8 - add rdi, 8 - - add rdx, 8 - cmp edx, dword arg(5) ;cols - - jl .nextcol - - ; done with the all cols, start the across filtering in place - sub rsi, rdx - sub rdi, rdx - - xor rdx, rdx - movq mm0, QWORD PTR [rdi-8]; - -.acrossnextcol: - movq xmm7, QWORD PTR [rdi +rdx -2] - movd xmm4, DWORD PTR [rdi +rdx +6] - - pslldq xmm4, 8 - por xmm4, xmm7 - - movdqa xmm3, xmm4 - psrldq xmm3, 2 - punpcklbw xmm3, xmm0 ; mm3 = p0..p3 - movdqa xmm1, xmm3 ; mm1 = p0..p3 - psllw xmm3, 2 - - - movdqa xmm5, xmm4 - psrldq xmm5, 3 - punpcklbw xmm5, xmm0 ; mm5 = p1..p4 - paddusw xmm3, xmm5 ; mm3 += mm6 - - ; thresholding - movdqa xmm7, xmm1 ; mm7 = p0..p3 - psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4) - pcmpgtw xmm7, xmm2 - - movdqa xmm5, xmm4 - psrldq xmm5, 4 - punpcklbw xmm5, xmm0 ; mm5 = p2..p5 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = p0..p3 - psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - - movdqa xmm5, xmm4 ; mm5 = p-2..p5 - punpcklbw xmm5, xmm0 ; mm5 = p-2..p1 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = p0..p3 - psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - psrldq xmm4, 1 ; mm4 = p-1..p5 - punpcklbw xmm4, xmm0 ; mm4 = p-1..p2 - paddusw xmm3, xmm4 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = p0..p3 - psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4 - psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3 - 
paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - paddusw xmm3, RD42 ; mm3 += round value - psraw xmm3, 3 ; mm3 /= 8 - - pand xmm1, xmm7 ; mm1 select vals > thresh from source - pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; combination - - packuswb xmm1, xmm0 ; pack to bytes - movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes - movdq2q mm0, xmm1 - - add rdx, 8 - cmp edx, dword arg(5) ;cols - jl .acrossnextcol; - - ; last 8 pixels - movq QWORD PTR [rdi+rdx-8], mm0 - - ; done with this rwo - add rsi,rax ; next line - mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch? - add rdi,rax ; next destination - mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch? - - dec rcx ; decrement count - jnz .nextrow ; next row - -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - add rsp,16 - pop rsp -%endif - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%undef RD42 - - -;void vp9_mbpost_proc_down_xmm(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vp9_rv) -global sym(vp9_mbpost_proc_down_xmm) PRIVATE -sym(vp9_mbpost_proc_down_xmm): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 128+16 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax - mov [rsp+128+8], eax - mov [rsp+128+12], eax -%define flimit4 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vp9_rv))] -%endif - - ;rows +=8; - add dword arg(2), 8 - - ;for(c=0; c<cols; c+=8) -.loop_col: - mov rsi, arg(0) ; s - pxor xmm0, xmm0 ; - - movsxd rax, dword ptr arg(1) ;pitch ; - neg rax ; rax = -pitch - - lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] - neg rax - - - pxor xmm5, xmm5 - pxor xmm6, xmm6 ; - - pxor xmm7, xmm7 ; - mov rdi, rsi 
- - mov rcx, 15 ; - -.loop_initvar: - movq xmm1, QWORD PTR [rdi]; - punpcklbw xmm1, xmm0 ; - - paddw xmm5, xmm1 ; - pmullw xmm1, xmm1 ; - - movdqa xmm2, xmm1 ; - punpcklwd xmm1, xmm0 ; - - punpckhwd xmm2, xmm0 ; - paddd xmm6, xmm1 ; - - paddd xmm7, xmm2 ; - lea rdi, [rdi+rax] ; - - dec rcx - jne .loop_initvar - ;save the var and sum - xor rdx, rdx -.loop_row: - movq xmm1, QWORD PTR [rsi] ; [s-pitch*8] - movq xmm2, QWORD PTR [rdi] ; [s+pitch*7] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - paddw xmm5, xmm2 - psubw xmm5, xmm1 - - pmullw xmm2, xmm2 - movdqa xmm4, xmm2 - - punpcklwd xmm2, xmm0 - punpckhwd xmm4, xmm0 - - paddd xmm6, xmm2 - paddd xmm7, xmm4 - - pmullw xmm1, xmm1 - movdqa xmm2, xmm1 - - punpcklwd xmm1, xmm0 - psubd xmm6, xmm1 - - punpckhwd xmm2, xmm0 - psubd xmm7, xmm2 - - - movdqa xmm3, xmm6 - pslld xmm3, 4 - - psubd xmm3, xmm6 - movdqa xmm1, xmm5 - - movdqa xmm4, xmm5 - pmullw xmm1, xmm1 - - pmulhw xmm4, xmm4 - movdqa xmm2, xmm1 - - punpcklwd xmm1, xmm4 - punpckhwd xmm2, xmm4 - - movdqa xmm4, xmm7 - pslld xmm4, 4 - - psubd xmm4, xmm7 - - psubd xmm3, xmm1 - psubd xmm4, xmm2 - - psubd xmm3, flimit4 - psubd xmm4, flimit4 - - psrad xmm3, 31 - psrad xmm4, 31 - - packssdw xmm3, xmm4 - packsswb xmm3, xmm0 - - movq xmm1, QWORD PTR [rsi+rax*8] - - movq xmm2, xmm1 - punpcklbw xmm1, xmm0 - - paddw xmm1, xmm5 - mov rcx, rdx - - and rcx, 127 -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - push rax - lea rax, [GLOBAL(sym(vp9_rv))] - movdqu xmm4, [rax + rcx*2] ;vp9_rv[rcx*2] - pop rax -%elif ABI_IS_32BIT=0 - movdqu xmm4, [r8 + rcx*2] ;vp9_rv[rcx*2] -%else - movdqu xmm4, [sym(vp9_rv) + rcx*2] -%endif - - paddw xmm1, xmm4 - ;paddw xmm1, eight8s - psraw xmm1, 4 - - packuswb xmm1, xmm0 - pand xmm1, xmm3 - - pandn xmm3, xmm2 - por xmm1, xmm3 - - and rcx, 15 - movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8] - - mov rcx, rdx - sub rcx, 8 - - and rcx, 15 - movq mm0, [rsp + rcx*8] ;d[rcx*8] - - movq [rsi], mm0 - lea rsi, [rsi+rax] - - lea rdi, [rdi+rax] - add rdx, 1 - - cmp edx, 
dword arg(2) ;rows - jl .loop_row - - add dword arg(0), 8 ; s += 8 - sub dword arg(3), 8 ; cols -= 8 - cmp dword arg(3), 0 - jg .loop_col - - add rsp, 128+16 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%undef flimit4 - - -;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src, -; int pitch, int rows, int cols,int flimit) -global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE -sym(vp9_mbpost_proc_across_ip_xmm): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 - - ; create flimit4 at [rsp] - mov eax, dword ptr arg(4) ;flimit - mov [rsp], eax - mov [rsp+4], eax - mov [rsp+8], eax - mov [rsp+12], eax -%define flimit4 [rsp] - - - ;for(r=0;r<rows;r++) -.ip_row_loop: - - xor rdx, rdx ;sumsq=0; - xor rcx, rcx ;sum=0; - mov rsi, arg(0); s - mov rdi, -8 -.ip_var_loop: - ;for(i=-8;i<=6;i++) - ;{ - ; sumsq += s[i]*s[i]; - ; sum += s[i]; - ;} - movzx eax, byte [rsi+rdi] - add ecx, eax - mul al - add edx, eax - add rdi, 1 - cmp rdi, 6 - jle .ip_var_loop - - - ;mov rax, sumsq - ;movd xmm7, rax - movd xmm7, edx - - ;mov rax, sum - ;movd xmm6, rax - movd xmm6, ecx - - mov rsi, arg(0) ;s - xor rcx, rcx - - movsxd rdx, dword arg(3) ;cols - add rdx, 8 - pxor mm0, mm0 - pxor mm1, mm1 - - pxor xmm0, xmm0 -.nextcol4: - - movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5 - movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10 - - punpcklbw xmm1, xmm0 ; expanding - punpcklbw xmm2, xmm0 ; expanding - - punpcklwd xmm1, xmm0 ; expanding to dwords - punpcklwd xmm2, xmm0 ; expanding to dwords - - psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5 - paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2 - - paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5 - pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5 - - paddd xmm6, xmm2 - paddd xmm7, xmm1 - - pshufd xmm6, xmm6, 0 ; duplicate the last ones - pshufd xmm7, xmm7, 0 ; duplicate the last ones - - psrldq xmm1, 4 ; 8--7 
9--6 10--5 0000 - psrldq xmm2, 4 ; 8--7 9--6 10--5 0000 - - pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared - pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared - - paddd xmm6, xmm4 - paddd xmm7, xmm3 - - pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared - pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared - - paddd xmm7, xmm3 - paddd xmm6, xmm4 - - pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared - pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared - - paddd xmm7, xmm3 - paddd xmm6, xmm4 - - movdqa xmm3, xmm6 - pmaddwd xmm3, xmm3 - - movdqa xmm5, xmm7 - pslld xmm5, 4 - - psubd xmm5, xmm7 - psubd xmm5, xmm3 - - psubd xmm5, flimit4 - psrad xmm5, 31 - - packssdw xmm5, xmm0 - packsswb xmm5, xmm0 - - movd xmm1, DWORD PTR [rsi+rcx] - movq xmm2, xmm1 - - punpcklbw xmm1, xmm0 - punpcklwd xmm1, xmm0 - - paddd xmm1, xmm6 - paddd xmm1, [GLOBAL(four8s)] - - psrad xmm1, 4 - packssdw xmm1, xmm0 - - packuswb xmm1, xmm0 - pand xmm1, xmm5 - - pandn xmm5, xmm2 - por xmm5, xmm1 - - movd [rsi+rcx-8], mm0 - movq mm0, mm1 - - movdq2q mm1, xmm5 - psrldq xmm7, 12 - - psrldq xmm6, 12 - add rcx, 4 - - cmp rcx, rdx - jl .nextcol4 - - ;s+=pitch; - movsxd rax, dword arg(1) - add arg(0), rax - - sub dword arg(2), 1 ;rows-=1 - cmp dword arg(2), 0 - jg .ip_row_loop - - add rsp, 16 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%undef flimit4 - - -SECTION_RODATA -align 16 -rd42: - times 8 dw 0x04 -four8s: - times 4 dd 8 diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 9413a436f..f113d240f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3114,7 +3114,11 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, l = 150; break; } - vp9_denoise(cpi->Source, cpi->Source, l); + if (!cpi->common.postproc_state.limits) { + cpi->common.postproc_state.limits = vpx_calloc( + cpi->common.width, sizeof(*cpi->common.postproc_state.limits)); + } + 
vp9_denoise(cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits); } #endif // CONFIG_VP9_POSTPROC } @@ -4914,7 +4918,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } vp9_deblock(cm->frame_to_show, pp, - cm->lf.filter_level * 10 / 6); + cm->lf.filter_level * 10 / 6, cm->postproc_state.limits); #endif vpx_clear_system_state(); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index a7871d6ec..2fd42960e 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -67,7 +67,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c new file mode 100644 index 000000000..411bc7754 --- /dev/null +++ b/vpx_dsp/deblock.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
#include <stdint.h>
#include <stdlib.h>

/*
 * Dither values (all in the range 0..14) that vpx_mbpost_proc_down_c() adds
 * in place of a fixed rounding constant before its >> 4.
 *
 * That function indexes the table at up to 63 + 127 + 127 = 317 (random
 * start offset + per-column offset + per-row offset), so the table must keep
 * at least 318 entries.
 */
const int16_t vpx_rv[] = {
  8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3,
  14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
  8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13,
  13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10,
  8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5,
  4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3,
  4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0,
  10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7,
  5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
  11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3,
  10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10,
  8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2,
  2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13,
  1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11,
  7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14,
  5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
  0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6,
  10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13,
  4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12,
  13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};

/*
 * Smooths `size` rows: a vertical 5-tap pass from src into dst followed by a
 * horizontal 5-tap pass performed in place on dst.
 *
 * src_ptr              first pixel of the first row to filter; rows
 *                      -2 .. size + 1 are read.
 * dst_ptr              first pixel of the first output row; on every row the
 *                      bytes at [-2], [-1], [cols] and [cols + 1] are
 *                      overwritten with replicated edge pixels.
 * src_pixels_per_line  source stride in bytes.
 * dst_pixels_per_line  destination stride in bytes.
 * cols                 number of pixels per row.
 * f                    per-column threshold; a pixel is smoothed only when
 *                      all four neighbours differ from it by strictly less
 *                      than f[col], so f[col] == 0 leaves the pixel as is.
 * size                 number of rows to process.
 */
void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
                                            unsigned char *dst_ptr,
                                            int src_pixels_per_line,
                                            int dst_pixels_per_line, int cols,
                                            unsigned char *f, int size) {
  unsigned char *p_src, *p_dst;
  int row;
  int col;
  unsigned char v;
  unsigned char d[4];

  for (row = 0; row < size; row++) {
    /* post_proc_down for one row */
    p_src = src_ptr;
    p_dst = dst_ptr;

    for (col = 0; col < cols; col++) {
      unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
      unsigned char p_above1 = p_src[col - src_pixels_per_line];
      unsigned char p_below1 = p_src[col + src_pixels_per_line];
      unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];

      v = p_src[col];

      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
          (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
        /* Cascade of rounded pairwise averages of the four neighbours and
         * the centre pixel. */
        unsigned char k1, k2, k3;
        k1 = (p_above2 + p_above1 + 1) >> 1;
        k2 = (p_below2 + p_below1 + 1) >> 1;
        k3 = (k1 + k2 + 1) >> 1;
        v = (k3 + v + 1) >> 1;
      }

      p_dst[col] = v;
    }

    /* now post_proc_across */
    p_src = dst_ptr;
    p_dst = dst_ptr;

    /* Replicate the edge pixels so the 5-tap window is defined at both
     * ends of the row. */
    p_src[-2] = p_src[-1] = p_src[0];
    p_src[cols] = p_src[cols + 1] = p_src[cols - 1];

    for (col = 0; col < cols; col++) {
      v = p_src[col];

      if ((abs(v - p_src[col - 2]) < f[col]) &&
          (abs(v - p_src[col - 1]) < f[col]) &&
          (abs(v - p_src[col + 1]) < f[col]) &&
          (abs(v - p_src[col + 2]) < f[col])) {
        unsigned char k1, k2, k3;
        k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
        k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
        k3 = (k1 + k2 + 1) >> 1;
        v = (k3 + v + 1) >> 1;
      }

      /* The pass is in place, so results are delayed by two pixels: the
       * left neighbours of later pixels must still be unfiltered when they
       * are read. */
      d[col & 3] = v;

      if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3];
    }

    /* handle the last two pixels */
    p_dst[col - 2] = d[(col - 2) & 3];
    p_dst[col - 1] = d[(col - 1) & 3];

    /* next row */
    src_ptr += src_pixels_per_line;
    dst_ptr += dst_pixels_per_line;
  }
}

/*
 * In-place horizontal smoothing over a sliding 15-pixel window.
 *
 * For every pixel the running sum and sum of squares of the window
 * s[c - 7] .. s[c + 7] are maintained; when sumsq * 15 - sum * sum is below
 * flimit the pixel is replaced by (8 + sum + s[c]) >> 4.
 *
 * src     first pixel of the first row; on every row the bytes at
 *         [-8 .. -1] and [cols .. cols + 16] are overwritten, so the buffer
 *         needs that much border on both sides.
 * pitch   stride in bytes.
 * rows    number of rows.
 * cols    number of pixels per row.
 * flimit  flatness threshold; with flimit <= 0 no pixel is modified.
 */
void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
                                 int cols, int flimit) {
  int r, c, i;

  unsigned char *s = src;
  unsigned char d[16];

  for (r = 0; r < rows; r++) {
    int sumsq = 0;
    int sum = 0;

    for (i = -8; i < 0; i++) s[i] = s[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];

    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
      sum += s[i];
      d[i + 8] = 0;
    }
    /* The loop above clears d[0..14] only. d[15] is copied to s[-1] on the
     * eighth iteration below before anything has been stored in it, so clear
     * it too instead of reading a stale or indeterminate byte. */
    d[15] = 0;

    for (c = 0; c < cols + 8; c++) {
      /* Slide the window: s[c + 7] enters, s[c - 8] leaves.
       * x * y == s[c + 7]^2 - s[c - 8]^2. */
      int x = s[c + 7] - s[c - 8];
      int y = s[c + 7] + s[c - 8];

      sum += x;
      sumsq += x * y;

      d[c & 15] = s[c];

      if (sumsq * 15 - sum * sum < flimit) {
        d[c & 15] = (8 + sum + s[c]) >> 4;
      }

      /* Write back 8 pixels late, once s[c - 8] has left the window. */
      s[c - 8] = d[(c - 8) & 15];
    }

    s += pitch;
  }
}

/*
 * In-place vertical counterpart of vpx_mbpost_proc_across_ip_c(): the same
 * 15-sample flatness test, run down each column, with a dither value from
 * vpx_rv replacing the fixed rounding constant.
 *
 * dst     first pixel of the first row; in every column rows -8 .. -1 and
 *         rows .. rows + 16 are overwritten, so the buffer needs that much
 *         border above and below.
 * pitch   stride in bytes.
 * rows    number of rows.
 * cols    number of columns.
 * flimit  flatness threshold; with flimit <= 0 no pixel is modified.
 */
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
                            int flimit) {
  int r, c, i;
  /* Random start offset into the dither table. This used to call rand_r()
   * on a seed that was never initialised, which reads an indeterminate
   * value (undefined behaviour); rand() draws from the C library's own
   * generator state instead. */
  const int16_t *rv3 = &vpx_rv[63 & rand()];

  for (c = 0; c < cols; c++) {
    unsigned char *s = &dst[c];
    int sumsq = 0;
    int sum = 0;
    unsigned char d[16];
    const int16_t *rv2 = rv3 + ((c * 17) & 127);

    for (i = -8; i < 0; i++) s[i * pitch] = s[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch];

    for (i = -8; i <= 6; i++) {
      sumsq += s[i * pitch] * s[i * pitch];
      sum += s[i * pitch];
    }

    for (r = 0; r < rows + 8; r++) {
      /* Slide the window down one row; s always points at row r. */
      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
      sum += s[7 * pitch] - s[-8 * pitch];
      d[r & 15] = s[0];

      if (sumsq * 15 - sum * sum < flimit) {
        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
      }
      /* Write back 8 rows late, once that row has left the window. */
      if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
      s += pitch;
    }
  }
}

#if CONFIG_POSTPROC
/*
 * Runs the horizontal and then the vertical macroblock smoothing pass over
 * the luma plane of `post`, both with the threshold q2mbl(q).
 *
 * NOTE(review): this helper is static and nothing visible in this file calls
 * it; YV12_BUFFER_CONFIG, q2mbl() and the un-suffixed dispatch names are not
 * declared here either - confirm it is still needed.
 */
static void vpx_de_mblock(YV12_BUFFER_CONFIG *post,
                          int q) {
  vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
                            post->y_width, q2mbl(q));
  vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
                       post->y_width, q2mbl(q));
}

#endif
WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include "./macros_msa.h" + +extern int16_t vpx_rv[]; + +#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, \ + out4, out5, out6, out7, \ + out8, out9, out10, out11, \ + out12, out13, out14, out15) \ +{ \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ + temp0, temp1, temp2, temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ + ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ + temp0, temp1, temp2, temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out8, out10); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out12, out14); \ + out0 = (v16u8)temp6; \ + out2 = (v16u8)temp7; \ + out4 = (v16u8)temp8; \ + out6 = (v16u8)temp9; \ + out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ + out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ + out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ + out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ + out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ +} + +#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, \ + below1_in, 
below2_in, ref, out) \ +{ \ + v16u8 temp0, temp1; \ + \ + temp1 = __msa_aver_u_b(above2_in, above1_in); \ + temp0 = __msa_aver_u_b(below2_in, below1_in); \ + temp1 = __msa_aver_u_b(temp1, temp0); \ + out = __msa_aver_u_b(src_in, temp1); \ + temp0 = __msa_asub_u_b(src_in, above2_in); \ + temp1 = __msa_asub_u_b(src_in, above1_in); \ + temp0 = (temp0 < ref); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + temp1 = __msa_asub_u_b(src_in, below1_in); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + temp1 = __msa_asub_u_b(src_in, below2_in); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + out = __msa_bmz_v(out, src_in, temp0); \ +} + +#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15) \ +{ \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ + ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ + ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ + ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ + ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ + ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ + ILVRL_H2_SH(temp5, temp4, temp6, temp7); \ + ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \ + ILVRL_H2_SH(temp5, temp4, temp8, temp9); \ + ILVRL_W2_SH(temp8, temp6, temp4, temp5); \ + ILVRL_W2_SH(temp9, temp7, temp6, temp7); \ + ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \ + ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \ + in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \ + in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \ + ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \ + in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \ + in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \ + ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, 
in14, \ + temp2, temp3, temp4, temp5); \ + ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \ + temp6, temp7, temp8, temp9); \ + ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \ + in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \ + in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \ + ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \ + in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \ + in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \ +} + +#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \ + in6, in7, in8, in9, in10, in11) \ +{ \ + v8i16 temp0, temp1, temp2, temp3; \ + v8i16 temp4, temp5, temp6, temp7; \ + \ + ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ + ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ + ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ + ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ + ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \ + temp4 = __msa_ilvr_h(temp5, temp4); \ + ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \ + temp5 = __msa_ilvr_h(temp7, temp6); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + in0 = (v16u8)temp0; \ + in2 = (v16u8)temp1; \ + in4 = (v16u8)temp2; \ + in6 = (v16u8)temp3; \ + in8 = (v16u8)temp6; \ + in10 = (v16u8)temp7; \ + in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \ + in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \ + in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \ + in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \ + in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \ + in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \ +} + +static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, + int32_t src_stride, + int32_t dst_stride, int32_t cols, + uint8_t *f) { + uint8_t *p_src = src_ptr; + uint8_t *p_dst = dst_ptr; + uint8_t *f_orig = f; + uint8_t *p_dst_st = dst_ptr; + uint16_t col; + uint64_t out0, 
/*
 * MSA version of the down-then-across smoothing for a block 8 rows high
 * (the dispatcher in this file selects it for size == 8).
 *
 * Vertical pass: for each group of 16 columns, rows -2 .. 9 of src are
 * loaded and eight filtered rows are written to dst. The five row registers
 * (above2, above1, src, below1, below2) are reused as a rotating window, so
 * the argument order of every VPX_AVER_IF_RETAIN call matters.
 *
 * Horizontal pass: works in place on dst, 8 columns at a time, by
 * transposing 8 rows x 12 bytes (the 8 pixels plus 2 neighbours on each
 * side), filtering with the per-column threshold broadcast from f, and
 * transposing back.
 *
 * f holds one threshold byte per column; full 16-byte vectors are loaded
 * from it. NOTE(review): that presumably requires f to be readable past
 * f[cols - 1] - confirm the allocation size used by the callers.
 */
static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                            int32_t src_stride,
                                            int32_t dst_stride, int32_t cols,
                                            uint8_t *f) {
  uint8_t *p_src = src_ptr;
  uint8_t *p_dst = dst_ptr;
  uint8_t *f_orig = f;
  uint8_t *p_dst_st = dst_ptr;
  uint16_t col;
  uint64_t out0, out1, out2, out3;
  v16u8 above2, above1, below2, below1, src, ref, ref_temp;
  v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
  v16u8 inter6, inter7, inter8, inter9, inter10, inter11;

  /* Vertical pass over every full group of 16 columns. */
  for (col = (cols / 16); col--;) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
           p_dst, dst_stride);

    p_dst += 16;
    p_src += 16;
    f += 16;
  }

  /* Tail: filters one more group and stores only the low 8 bytes of each of
   * the eight rows.
   * NOTE(review): the condition is true whenever cols >= 16, so this also
   * runs (and writes 8 bytes past column cols - 1) when cols is an exact
   * multiple of 16, and it is skipped when cols == 8, leaving dst untouched
   * by the vertical pass. `cols % 16` may have been intended - confirm
   * against the C reference before changing. */
  if (0 != (cols / 16)) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    out0 = __msa_copy_u_d((v2i64) inter0, 0);
    out1 = __msa_copy_u_d((v2i64) inter1, 0);
    out2 = __msa_copy_u_d((v2i64) inter2, 0);
    out3 = __msa_copy_u_d((v2i64) inter3, 0);
    SD4(out0, out1, out2, out3, p_dst, dst_stride);

    out0 = __msa_copy_u_d((v2i64) inter4, 0);
    out1 = __msa_copy_u_d((v2i64) inter5, 0);
    out2 = __msa_copy_u_d((v2i64) inter6, 0);
    out3 = __msa_copy_u_d((v2i64) inter7, 0);
    SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
  }

  /* Horizontal pass, in place on dst. Loads start two pixels to the left of
   * the row so each vector carries the left neighbours of its first pixel. */
  f = f_orig;
  p_dst = dst_ptr - 2;
  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
         inter6, inter7);

  for (col = 0; col < (cols / 8); ++col) {
    ref = LD_UB(f);
    f += 8;
    VPX_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
                            inter6, inter7, inter8, inter9, inter10, inter11);
    /* After the transpose inter2..inter9 are the 8 pixel columns and
     * inter0/inter1, inter10/inter11 their outer neighbours. On the first
     * group the left neighbours are replaced by the first pixel column. */
    if (0 == col) {
      above2 = inter2;
      above1 = inter2;
    } else {
      above2 = inter0;
      above1 = inter1;
    }
    src = inter2;
    below1 = inter3;
    below2 = inter4;
    /* Each column uses its own threshold byte, broadcast to all lanes. */
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
    above2 = inter5;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
    above1 = inter6;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
    src = inter7;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
    below1 = inter8;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
    below2 = inter9;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
    /* On the last group the right neighbours are replaced by the last pixel
     * column (inter9). */
    if (col == (cols / 8 - 1)) {
      above2 = inter9;
    } else {
      above2 = inter10;
    }
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
    if (col == (cols / 8 - 1)) {
      above1 = inter9;
    } else {
      above1 = inter11;
    }
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
    TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
                       inter9, inter2, inter3, inter4, inter5, inter6, inter7,
                       inter8, inter9);
    /* The rows of the next 8-column group are loaded before the current
     * results are stored over the same rows; presumably so the next group
     * still sees unfiltered left neighbours, as in the C reference. Do not
     * reorder the loads and stores below. */
    p_dst += 8;
    LD_UB2(p_dst, dst_stride, inter0, inter1);
    ST8x1_UB(inter2, p_dst_st);
    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
    p_dst_st += 8;
  }
}
/*
 * MSA version of the down-then-across smoothing for a block 16 rows high
 * (the dispatcher in this file selects it for size == 16).
 *
 * Vertical pass: for each group of 16 columns, rows -2 .. 17 of src are
 * loaded and sixteen filtered rows are written to dst. The five row
 * registers (above2, above1, src, below1, below2) are reused as a rotating
 * window, so the argument order of every VPX_AVER_IF_RETAIN call matters.
 * NOTE(review): only full groups of 16 columns are filtered here; this
 * presumably relies on cols being a multiple of 16 - confirm with callers.
 *
 * Horizontal pass: works in place on dst, 8 columns at a time, by
 * transposing 16 rows x 12 bytes (the 8 pixels plus 2 neighbours on each
 * side), filtering with the per-column threshold broadcast from f, and
 * transposing back.
 */
static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                          int32_t src_stride,
                                          int32_t dst_stride, int32_t cols,
                                          uint8_t *f) {
  uint8_t *p_src = src_ptr;
  uint8_t *p_dst = dst_ptr;
  uint8_t *p_dst_st = dst_ptr;
  uint8_t *f_orig = f;
  uint16_t col;
  v16u8 above2, above1, below2, below1;
  v16u8 src, ref, ref_temp;
  v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
  v16u8 inter7, inter8, inter9, inter10, inter11;
  v16u8 inter12, inter13, inter14, inter15;

  /* Vertical pass over every full group of 16 columns. */
  for (col = (cols / 16); col--;) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    src = LD_UB(p_src + 10 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
    below1 = LD_UB(p_src + 11 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
    below2 = LD_UB(p_src + 12 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
    above2 = LD_UB(p_src + 13 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
    above1 = LD_UB(p_src + 14 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
    src = LD_UB(p_src + 15 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
    below1 = LD_UB(p_src + 16 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
    below2 = LD_UB(p_src + 17 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
           p_dst, dst_stride);
    ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
           p_dst + 8 * dst_stride, dst_stride);
    p_src += 16;
    p_dst += 16;
    f += 16;
  }

  /* Horizontal pass, in place on dst. Loads start two pixels to the left of
   * the row so each vector carries the left neighbours of its first pixel. */
  f = f_orig;
  p_dst = dst_ptr - 2;
  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
         inter6, inter7);
  LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
         inter12, inter13, inter14, inter15);

  for (col = 0; col < cols / 8; ++col) {
    ref = LD_UB(f);
    f += 8;
    TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
                     inter7, inter8, inter9, inter10, inter11, inter12, inter13,
                     inter14, inter15);
    /* After the transpose inter2..inter9 are the 8 pixel columns and
     * inter0/inter1, inter10/inter11 their outer neighbours. On the first
     * group the left neighbours are replaced by the first pixel column. */
    if (0 == col) {
      above2 = inter2;
      above1 = inter2;
    } else {
      above2 = inter0;
      above1 = inter1;
    }

    src = inter2;
    below1 = inter3;
    below2 = inter4;
    /* Each column uses its own threshold byte, broadcast to all lanes. */
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
    above2 = inter5;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
    above1 = inter6;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
    src = inter7;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
    below1 = inter8;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
    below2 = inter9;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
    /* On the last group the right neighbours are replaced by the last pixel
     * column (inter9). */
    if (col == (cols / 8 - 1)) {
      above2 = inter9;
    } else {
      above2 = inter10;
    }
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
    if (col == (cols / 8 - 1)) {
      above1 = inter9;
    } else {
      above1 = inter11;
    }
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
    /* Transpose back into 16 rows; the last two rows land in above2 and
     * above1, which are free at this point. */
    VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
                            inter8, inter9, inter2, inter3, inter4, inter5,
                            inter6, inter7, inter8, inter9, inter10, inter11,
                            inter12, inter13, inter14, inter15, above2, above1);

    /* The rows of the next 8-column group are loaded before the current
     * results are stored over the same rows; presumably so the next group
     * still sees unfiltered left neighbours, as in the C reference. Do not
     * reorder the loads and stores below. */
    p_dst += 8;
    LD_UB2(p_dst, dst_stride, inter0, inter1);
    ST8x1_UB(inter2, p_dst_st);
    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
    LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
    ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
    ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
    LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
    ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
    ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
    LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
    ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
    ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
    LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
    ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
    ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
    p_dst_st += 8;
  }
}
postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f); + } else if (16 == size) { + postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f); + } +} + +void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, + int32_t rows, int32_t cols, int32_t flimit) { + int32_t row, col, cnt; + uint8_t *src_dup = src_ptr; + v16u8 src0, src, tmp_orig; + v16u8 tmp = {0}; + v16i8 zero = {0}; + v8u16 sum_h, src_r_h, src_l_h; + v4u32 src_r_w, src_l_w; + v4i32 flimit_vec; + + flimit_vec = __msa_fill_w(flimit); + for (row = rows; row--;) { + int32_t sum_sq = 0; + int32_t sum = 0; + src0 = (v16u8) __msa_fill_b(src_dup[0]); + ST8x1_UB(src0, (src_dup - 8)); + + src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]); + ST_UB(src0, src_dup + cols); + src_dup[cols + 16] = src_dup[cols - 1]; + tmp_orig = (v16u8) __msa_ldi_b(0); + tmp_orig[15] = tmp[15]; + src = LD_UB(src_dup - 8); + src[15] = 0; + ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); + src_l_w = __msa_dotp_u_w(src_l_h, src_l_h); + sum_sq = HADD_SW_S32(src_r_w); + sum_sq += HADD_SW_S32(src_l_w); + sum_h = __msa_hadd_u_h(src, src); + sum = HADD_UH_U32(sum_h); + { + v16u8 src7, src8, src_r, src_l; + v16i8 mask; + v8u16 add_r, add_l; + v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1; + v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3; + v4i32 sub0, sub1, sub2, sub3; + v4i32 sum0_w, sum1_w, sum2_w, sum3_w; + v4i32 mul0, mul1, mul2, mul3; + v4i32 total0, total1, total2, total3; + v8i16 const8 = __msa_fill_h(8); + + src7 = LD_UB(src_dup + 7); + src8 = LD_UB(src_dup - 8); + for (col = 0; col < (cols >> 4); ++col) { + ILVRL_B2_UB(src7, src8, src_r, src_l); + HSUB_UB2_SH(src_r, src_l, sub_r, sub_l); + + sum_r[0] = sum + sub_r[0]; + for (cnt = 0; cnt < 7; ++cnt) { + sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1]; + } + sum_l[0] = sum_r[7] + sub_l[0]; + for (cnt = 0; cnt < 7; ++cnt) { + sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1]; + } + sum = sum_l[7]; + src = 
LD_UB(src_dup + 16 * col); + ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4); + src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4); + tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7); + + HADD_UB2_UH(src_r, src_l, add_r, add_l); + UNPCK_SH_SW(sub_r, sub0, sub1); + UNPCK_SH_SW(sub_l, sub2, sub3); + ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w); + ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w); + MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1, + mul2, mul3); + sum_sq0[0] = sum_sq + mul0[0]; + for (cnt = 0; cnt < 3; ++cnt) { + sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1]; + } + sum_sq1[0] = sum_sq0[3] + mul1[0]; + for (cnt = 0; cnt < 3; ++cnt) { + sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1]; + } + sum_sq2[0] = sum_sq1[3] + mul2[0]; + for (cnt = 0; cnt < 3; ++cnt) { + sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1]; + } + sum_sq3[0] = sum_sq2[3] + mul3[0]; + for (cnt = 0; cnt < 3; ++cnt) { + sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1]; + } + sum_sq = sum_sq3[3]; + + UNPCK_SH_SW(sum_r, sum0_w, sum1_w); + UNPCK_SH_SW(sum_l, sum2_w, sum3_w); + total0 = sum_sq0 * __msa_ldi_w(15); + total0 -= sum0_w * sum0_w; + total1 = sum_sq1 * __msa_ldi_w(15); + total1 -= sum1_w * sum1_w; + total2 = sum_sq2 * __msa_ldi_w(15); + total2 -= sum2_w * sum2_w; + total3 = sum_sq3 * __msa_ldi_w(15); + total3 -= sum3_w * sum3_w; + total0 = (total0 < flimit_vec); + total1 = (total1 < flimit_vec); + total2 = (total2 < flimit_vec); + total3 = (total3 < flimit_vec); + PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); + mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0); + tmp = __msa_bmz_v(tmp, src, (v16u8) mask); + + if (col == 0) { + uint64_t src_d; + + src_d = __msa_copy_u_d((v2i64) tmp_orig, 1); + SD(src_d, (src_dup - 8)); + } + + src7 = LD_UB(src_dup + 16 * (col + 1) + 7); + src8 = LD_UB(src_dup + 16 * (col + 1) - 8); + ST_UB(tmp, (src_dup + (16 * col))); + } + + src_dup += 
pitch; + } + } +} + +void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, + int32_t cols, int32_t flimit) { + int32_t row, col, cnt, i; + unsigned int seed; + const int16_t *rv3 = &vpx_rv[63 & rand_r(&seed)]; + v4i32 flimit_vec; + v16u8 dst7, dst8, dst_r_b, dst_l_b; + v16i8 mask; + v8u16 add_r, add_l; + v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1; + v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3; + + flimit_vec = __msa_fill_w(flimit); + + for (col = 0; col < (cols >> 4); ++col) { + uint8_t *dst_tmp = &dst_ptr[col << 4]; + v16u8 dst; + v16i8 zero = {0}; + v16u8 tmp[16]; + v8i16 mult0, mult1, rv2_0, rv2_1; + v8i16 sum0_h = {0}; + v8i16 sum1_h = {0}; + v4i32 mul0 = {0}; + v4i32 mul1 = {0}; + v4i32 mul2 = {0}; + v4i32 mul3 = {0}; + v4i32 sum0_w, sum1_w, sum2_w, sum3_w; + v4i32 add0, add1, add2, add3; + const int16_t *rv2[16]; + + dst = LD_UB(dst_tmp); + for (cnt = (col << 4), i = 0; i < 16; ++cnt) { + rv2[i] = rv3 + ((cnt * 17) & 127); + ++i; + } + for (cnt = -8; cnt < 0; ++cnt) { + ST_UB(dst, dst_tmp + cnt * pitch); + } + + dst = LD_UB((dst_tmp + (rows - 1) * pitch)); + for (cnt = rows; cnt < rows + 17; ++cnt) { + ST_UB(dst, dst_tmp + cnt * pitch); + } + for (cnt = -8; cnt <= 6; ++cnt) { + dst = LD_UB(dst_tmp + (cnt * pitch)); + UNPCK_UB_SH(dst, dst_r_h, dst_l_h); + MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1); + mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0); + mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0); + mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1); + mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1); + ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h); + } + + for (row = 0; row < (rows + 8); ++row) { + for (i = 0; i < 8; ++i) { + rv2_0[i] = *(rv2[i] + (row & 127)); + rv2_1[i] = *(rv2[i + 8] + (row & 127)); + } + dst7 = LD_UB(dst_tmp + (7 * pitch)); + dst8 = LD_UB(dst_tmp - (8 * pitch)); + ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b); + + HSUB_UB2_SH(dst_r_b, 
dst_l_b, sub_r, sub_l); + UNPCK_SH_SW(sub_r, sub0, sub1); + UNPCK_SH_SW(sub_l, sub2, sub3); + sum0_h += sub_r; + sum1_h += sub_l; + + HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l); + + ILVRL_H2_SW(zero, add_r, add0, add1); + ILVRL_H2_SW(zero, add_l, add2, add3); + mul0 += add0 * sub0; + mul1 += add1 * sub1; + mul2 += add2 * sub2; + mul3 += add3 * sub3; + dst = LD_UB(dst_tmp); + ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h); + dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4); + dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4); + tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7); + + UNPCK_SH_SW(sum0_h, sum0_w, sum1_w); + UNPCK_SH_SW(sum1_h, sum2_w, sum3_w); + total0 = mul0 * __msa_ldi_w(15); + total0 -= sum0_w * sum0_w; + total1 = mul1 * __msa_ldi_w(15); + total1 -= sum1_w * sum1_w; + total2 = mul2 * __msa_ldi_w(15); + total2 -= sum2_w * sum2_w; + total3 = mul3 * __msa_ldi_w(15); + total3 -= sum3_w * sum3_w; + total0 = (total0 < flimit_vec); + total1 = (total1 < flimit_vec); + total2 = (total2 < flimit_vec); + total3 = (total3 < flimit_vec); + PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); + mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0); + tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask); + + if (row >= 8) { + ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch)); + } + + dst_tmp += pitch; + } + } +} diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index 91e3615cf..ea59eafe9 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -1060,6 +1060,7 @@ ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ } #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) /* Description : Interleave left half of halfword elements from vectors @@ -1074,6 +1075,7 @@ out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ } #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) +#define ILVL_H2_SW(...) 
ILVL_H2(v4i32, __VA_ARGS__) /* Description : Interleave left half of word elements from vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1137,6 +1139,7 @@ out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ } #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3) { \ @@ -1215,6 +1218,7 @@ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ } +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index c73692a37..bd6d9382e 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -52,8 +52,11 @@ endif # CONFIG_VP9_HIGHBITDEPTH ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) DSP_SRCS-yes += add_noise.c +DSP_SRCS-yes += deblock.c DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm endif # CONFIG_POSTPROC DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 66d466a7f..8736e4698 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1894,6 +1894,18 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; specialize qw/vpx_plane_add_noise sse2 msa/; + + add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; + specialize 
qw/vpx_mbpost_proc_down sse2 msa/; + $vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm; + + add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; + specialize qw/vpx_mbpost_proc_across_ip sse2 msa/; + $vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm; + + add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; + specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/; + } } # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vp8/common/x86/postproc_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm index 508b5e887..6df360df4 100644 --- a/vp8/common/x86/postproc_sse2.asm +++ b/vpx_dsp/x86/deblock_sse2.asm @@ -83,7 +83,7 @@ add rbx, 16 %endmacro -;void vp8_post_proc_down_and_across_mb_row_sse2 +;void vpx_post_proc_down_and_across_mb_row_sse2 ;( ; unsigned char *src_ptr, ; unsigned char *dst_ptr, @@ -93,8 +93,8 @@ ; int *flimits, ; int size ;) -global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE -sym(vp8_post_proc_down_and_across_mb_row_sse2): +global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE +sym(vpx_post_proc_down_and_across_mb_row_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -230,11 +230,11 @@ sym(vp8_post_proc_down_and_across_mb_row_sse2): ret %undef flimit -;void vp8_mbpost_proc_down_xmm(unsigned char *dst, +;void vpx_mbpost_proc_down_xmm(unsigned char *dst, ; int pitch, int rows, int cols,int flimit) -extern sym(vp8_rv) -global sym(vp8_mbpost_proc_down_xmm) PRIVATE -sym(vp8_mbpost_proc_down_xmm): +extern sym(vpx_rv) +global sym(vpx_mbpost_proc_down_xmm) PRIVATE +sym(vpx_mbpost_proc_down_xmm): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -257,7 +257,7 @@ sym(vp8_mbpost_proc_down_xmm): %define flimit4 [rsp+128] %if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vp8_rv))] + lea r8, [GLOBAL(sym(vpx_rv))] %endif ;rows +=8; @@ -403,13 +403,13 
@@ sym(vp8_mbpost_proc_down_xmm): and rcx, 127 %if ABI_IS_32BIT=1 && CONFIG_PIC=1 push rax - lea rax, [GLOBAL(sym(vp8_rv))] - movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2] + lea rax, [GLOBAL(sym(vpx_rv))] + movdqu xmm4, [rax + rcx*2] ;vpx_rv[rcx*2] pop rax %elif ABI_IS_32BIT=0 - movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2] + movdqu xmm4, [r8 + rcx*2] ;vpx_rv[rcx*2] %else - movdqu xmm4, [sym(vp8_rv) + rcx*2] + movdqu xmm4, [sym(vpx_rv) + rcx*2] %endif paddw xmm1, xmm4 @@ -462,10 +462,10 @@ sym(vp8_mbpost_proc_down_xmm): %undef flimit4 -;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, +;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src, ; int pitch, int rows, int cols,int flimit) -global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE -sym(vp8_mbpost_proc_across_ip_xmm): +global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE +sym(vpx_mbpost_proc_across_ip_xmm): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 |