-rw-r--r--  test/pp_filter_test.cc                |  24
-rw-r--r--  vp8/common/mips/msa/postproc_msa.c    | 801
-rw-r--r--  vp8/common/postproc.c                 | 242
-rw-r--r--  vp8/common/rtcd_defs.pl               |  10
-rw-r--r--  vp8/common/x86/postproc_mmx.asm       | 253
-rw-r--r--  vp8/vp8_common.mk                     |   3
-rw-r--r--  vp9/common/vp9_alloccommon.c          |   2
-rw-r--r--  vp9/common/vp9_postproc.c             | 354
-rw-r--r--  vp9/common/vp9_postproc.h             |   7
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl           |  12
-rw-r--r--  vp9/common/x86/vp9_postproc_sse2.asm  | 632
-rw-r--r--  vp9/encoder/vp9_encoder.c             |   8
-rw-r--r--  vp9/vp9_common.mk                     |   1
-rw-r--r--  vpx_dsp/deblock.c                     | 204
-rw-r--r--  vpx_dsp/mips/deblock_msa.c            | 683
-rw-r--r--  vpx_dsp/mips/macros_msa.h             |   4
-rw-r--r--  vpx_dsp/vpx_dsp.mk                    |   3
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl          |  12
-rw-r--r--  vpx_dsp/x86/deblock_sse2.asm (renamed from vp8/common/x86/postproc_sse2.asm) | 30
19 files changed, 1015 insertions, 2270 deletions
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index e4688dd8c..89349e48b 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -11,7 +11,7 @@
#include "test/register_state_check.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
@@ -25,7 +25,7 @@ typedef void (*PostProcFunc)(unsigned char *src_ptr,
namespace {
-class VP8PostProcessingFilterTest
+class VPxPostProcessingFilterTest
: public ::testing::TestWithParam<PostProcFunc> {
public:
virtual void TearDown() {
@@ -33,10 +33,10 @@ class VP8PostProcessingFilterTest
}
};
-// Test routine for the VP8 post-processing function
-// vp8_post_proc_down_and_across_mb_row_c.
+// Test routine for the VPx post-processing function
+// vpx_post_proc_down_and_across_mb_row_c.
-TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
+TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
// Size of the underlying data block that will be filtered.
const int block_width = 16;
const int block_height = 16;
@@ -92,7 +92,7 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
for (int i = 0; i < block_height; ++i) {
for (int j = 0; j < block_width; ++j) {
EXPECT_EQ(expected_data[i], pixel_ptr[j])
- << "VP8PostProcessingFilterTest failed with invalid filter output";
+ << "VPxPostProcessingFilterTest failed with invalid filter output";
}
pixel_ptr += output_stride;
}
@@ -102,17 +102,17 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
vpx_free(flimits);
};
-INSTANTIATE_TEST_CASE_P(C, VP8PostProcessingFilterTest,
- ::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
+INSTANTIATE_TEST_CASE_P(C, VPxPostProcessingFilterTest,
+ ::testing::Values(vpx_post_proc_down_and_across_mb_row_c));
#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, VP8PostProcessingFilterTest,
- ::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
+INSTANTIATE_TEST_CASE_P(SSE2, VPxPostProcessingFilterTest,
+ ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
#endif
#if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, VP8PostProcessingFilterTest,
- ::testing::Values(vp8_post_proc_down_and_across_mb_row_msa));
+INSTANTIATE_TEST_CASE_P(MSA, VPxPostProcessingFilterTest,
+ ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
#endif
} // namespace
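
The test class is only renamed; its body is unchanged and now exercises the
shared vpx_dsp entry points. A minimal sketch of how the harness drives the
function under test; the buffer sizes, borders, and offsets below are
illustrative assumptions, not the test's exact constants:

    #include <string.h>
    #include "./vpx_dsp_rtcd.h"
    #include "vpx_mem/vpx_mem.h"

    /* Sketch: filter one bordered 16x16 luma block, as FilterOutputCheck does. */
    static void run_filter_once(void) {
      const int w = 16, h = 16, border = 8;   /* assumed sizes */
      const int stride = w + 2 * border;
      unsigned char src[32 * 32] = { 0 };
      unsigned char dst[32 * 32] = { 0 };
      unsigned char *flimits = (unsigned char *)vpx_memalign(16, w);
      if (!flimits) return;
      memset(flimits, 255, w);                /* filter every column */
      vpx_post_proc_down_and_across_mb_row_c(src + border * stride + border,
                                             dst + border * stride + border,
                                             stride, stride, w, flimits, h);
      vpx_free(flimits);
    }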
diff --git a/vp8/common/mips/msa/postproc_msa.c b/vp8/common/mips/msa/postproc_msa.c
deleted file mode 100644
index 23dcde2eb..000000000
--- a/vp8/common/mips/msa/postproc_msa.c
+++ /dev/null
@@ -1,801 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include "./vp8_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vp8/common/mips/msa/vp8_macros_msa.h"
-
-static const int16_t vp8_rv_msa[] =
-{
- 8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
- 0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
- 10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
- 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
- 8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
- 1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
- 3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
- 11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
- 14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
- 4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
- 7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
- 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
- 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
- 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
- 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
- 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
- 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
- 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
- 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
- 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
- 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
- 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
- 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
- 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
- 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
- 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
- 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
- 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
- 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
- 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
- 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
- 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
- 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
- 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
- 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
- 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
- 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
- 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
- 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
- 3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
- 11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
- 14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
- 5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
- 0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
-};
-
-#define VP8_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, \
- out4, out5, out6, out7, \
- out8, out9, out10, out11, \
- out12, out13, out14, out15) \
-{ \
- v8i16 temp0, temp1, temp2, temp3, temp4; \
- v8i16 temp5, temp6, temp7, temp8, temp9; \
- \
- ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
- temp0, temp1, temp2, temp3); \
- ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
- ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
- ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
- ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
- ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
- temp0, temp1, temp2, temp3); \
- ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
- ILVRL_W2_UB(temp5, temp4, out8, out10); \
- ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
- ILVRL_W2_UB(temp5, temp4, out12, out14); \
- out0 = (v16u8)temp6; \
- out2 = (v16u8)temp7; \
- out4 = (v16u8)temp8; \
- out6 = (v16u8)temp9; \
- out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
- out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
- out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
- out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
- out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
- out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
- out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
-}
-
-#define VP8_AVER_IF_RETAIN(above2_in, above1_in, src_in, \
- below1_in, below2_in, ref, out) \
-{ \
- v16u8 temp0, temp1; \
- \
- temp1 = __msa_aver_u_b(above2_in, above1_in); \
- temp0 = __msa_aver_u_b(below2_in, below1_in); \
- temp1 = __msa_aver_u_b(temp1, temp0); \
- out = __msa_aver_u_b(src_in, temp1); \
- temp0 = __msa_asub_u_b(src_in, above2_in); \
- temp1 = __msa_asub_u_b(src_in, above1_in); \
- temp0 = (temp0 < ref); \
- temp1 = (temp1 < ref); \
- temp0 = temp0 & temp1; \
- temp1 = __msa_asub_u_b(src_in, below1_in); \
- temp1 = (temp1 < ref); \
- temp0 = temp0 & temp1; \
- temp1 = __msa_asub_u_b(src_in, below2_in); \
- temp1 = (temp1 < ref); \
- temp0 = temp0 & temp1; \
- out = __msa_bmz_v(out, src_in, temp0); \
-}
-
-#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15) \
-{ \
- v8i16 temp0, temp1, temp2, temp3, temp4; \
- v8i16 temp5, temp6, temp7, temp8, temp9; \
- \
- ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
- ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
- ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
- ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
- ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
- ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
- ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
- ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
- ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
- ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
- ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
- ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
- ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
- ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
- ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
- in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
- in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
- ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
- ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
- in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
- in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
- ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \
- temp2, temp3, temp4, temp5); \
- ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \
- temp6, temp7, temp8, temp9); \
- ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
- in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
- in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
- ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
- in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
- in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
-}
-
-#define VP8_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \
- in6, in7, in8, in9, in10, in11) \
-{ \
- v8i16 temp0, temp1, temp2, temp3; \
- v8i16 temp4, temp5, temp6, temp7; \
- \
- ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
- ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
- ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
- ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
- ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
- ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
- ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
- temp4 = __msa_ilvr_h(temp5, temp4); \
- ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
- temp5 = __msa_ilvr_h(temp7, temp6); \
- ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
- in0 = (v16u8)temp0; \
- in2 = (v16u8)temp1; \
- in4 = (v16u8)temp2; \
- in6 = (v16u8)temp3; \
- in8 = (v16u8)temp6; \
- in10 = (v16u8)temp7; \
- in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
- in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
- in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
- in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
- in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
- in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
-}
-
-static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
- int32_t src_stride,
- int32_t dst_stride,
- int32_t cols, uint8_t *f)
-{
- uint8_t *p_src = src_ptr;
- uint8_t *p_dst = dst_ptr;
- uint8_t *f_orig = f;
- uint8_t *p_dst_st = dst_ptr;
- uint16_t col;
- uint64_t out0, out1, out2, out3;
- v16u8 above2, above1, below2, below1, src, ref, ref_temp;
- v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
- v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
-
- for (col = (cols / 16); col--;)
- {
- ref = LD_UB(f);
- LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
- src = LD_UB(p_src);
- LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
- above2 = LD_UB(p_src + 3 * src_stride);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
- above1 = LD_UB(p_src + 4 * src_stride);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
- src = LD_UB(p_src + 5 * src_stride);
- VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
- below1 = LD_UB(p_src + 6 * src_stride);
- VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
- below2 = LD_UB(p_src + 7 * src_stride);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
- above2 = LD_UB(p_src + 8 * src_stride);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
- above1 = LD_UB(p_src + 9 * src_stride);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
- ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
- p_dst, dst_stride);
-
- p_dst += 16;
- p_src += 16;
- f += 16;
- }
-
- if (0 != (cols / 16))
- {
- ref = LD_UB(f);
- LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
- src = LD_UB(p_src);
- LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
- above2 = LD_UB(p_src + 3 * src_stride);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
- above1 = LD_UB(p_src + 4 * src_stride);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
- src = LD_UB(p_src + 5 * src_stride);
- VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
- below1 = LD_UB(p_src + 6 * src_stride);
- VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
- below2 = LD_UB(p_src + 7 * src_stride);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
- above2 = LD_UB(p_src + 8 * src_stride);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
- above1 = LD_UB(p_src + 9 * src_stride);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
- out0 = __msa_copy_u_d((v2i64)inter0, 0);
- out1 = __msa_copy_u_d((v2i64)inter1, 0);
- out2 = __msa_copy_u_d((v2i64)inter2, 0);
- out3 = __msa_copy_u_d((v2i64)inter3, 0);
- SD4(out0, out1, out2, out3, p_dst, dst_stride);
-
- out0 = __msa_copy_u_d((v2i64)inter4, 0);
- out1 = __msa_copy_u_d((v2i64)inter5, 0);
- out2 = __msa_copy_u_d((v2i64)inter6, 0);
- out3 = __msa_copy_u_d((v2i64)inter7, 0);
- SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
- }
-
- f = f_orig;
- p_dst = dst_ptr - 2;
- LD_UB8(p_dst, dst_stride,
- inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
-
- for (col = 0; col < (cols / 8); ++col)
- {
- ref = LD_UB(f);
- f += 8;
- VP8_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3,
- inter4, inter5, inter6, inter7,
- inter8, inter9, inter10, inter11);
- if (0 == col)
- {
- above2 = inter2;
- above1 = inter2;
- }
- else
- {
- above2 = inter0;
- above1 = inter1;
- }
- src = inter2;
- below1 = inter3;
- below2 = inter4;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
- ref_temp, inter2);
- above2 = inter5;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
- ref_temp, inter3);
- above1 = inter6;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
- ref_temp, inter4);
- src = inter7;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
- VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
- ref_temp, inter5);
- below1 = inter8;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
- VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
- ref_temp, inter6);
- below2 = inter9;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
- ref_temp, inter7);
- if (col == (cols / 8 - 1))
- {
- above2 = inter9;
- }
- else
- {
- above2 = inter10;
- }
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
- ref_temp, inter8);
- if (col == (cols / 8 - 1))
- {
- above1 = inter9;
- }
- else
- {
- above1 = inter11;
- }
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
- ref_temp, inter9);
- TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
- inter8, inter9, inter2, inter3, inter4, inter5,
- inter6, inter7, inter8, inter9);
- p_dst += 8;
- LD_UB2(p_dst, dst_stride, inter0, inter1);
- ST8x1_UB(inter2, p_dst_st);
- ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
- LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
- ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
- ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
- LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
- ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
- ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
- LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
- ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
- ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
- p_dst_st += 8;
- }
-}
-
-static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
- int32_t src_stride,
- int32_t dst_stride,
- int32_t cols, uint8_t *f)
-{
- uint8_t *p_src = src_ptr;
- uint8_t *p_dst = dst_ptr;
- uint8_t *p_dst_st = dst_ptr;
- uint8_t *f_orig = f;
- uint16_t col;
- v16u8 above2, above1, below2, below1;
- v16u8 src, ref, ref_temp;
- v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
- v16u8 inter7, inter8, inter9, inter10, inter11;
- v16u8 inter12, inter13, inter14, inter15;
-
- for (col = (cols / 16); col--;)
- {
- ref = LD_UB(f);
- LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
- src = LD_UB(p_src);
- LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
- above2 = LD_UB(p_src + 3 * src_stride);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
- above1 = LD_UB(p_src + 4 * src_stride);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
- src = LD_UB(p_src + 5 * src_stride);
- VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
- below1 = LD_UB(p_src + 6 * src_stride);
- VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
- below2 = LD_UB(p_src + 7 * src_stride);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
- above2 = LD_UB(p_src + 8 * src_stride);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
- above1 = LD_UB(p_src + 9 * src_stride);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
- src = LD_UB(p_src + 10 * src_stride);
- VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
- below1 = LD_UB(p_src + 11 * src_stride);
- VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
- below2 = LD_UB(p_src + 12 * src_stride);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
- above2 = LD_UB(p_src + 13 * src_stride);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
- above1 = LD_UB(p_src + 14 * src_stride);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
- src = LD_UB(p_src + 15 * src_stride);
- VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
- below1 = LD_UB(p_src + 16 * src_stride);
- VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
- below2 = LD_UB(p_src + 17 * src_stride);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
- ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
- p_dst, dst_stride);
- ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13,
- inter14, inter15, p_dst + 8 * dst_stride, dst_stride);
- p_src += 16;
- p_dst += 16;
- f += 16;
- }
-
- f = f_orig;
- p_dst = dst_ptr - 2;
- LD_UB8(p_dst, dst_stride,
- inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
- LD_UB8(p_dst + 8 * dst_stride, dst_stride,
- inter8, inter9, inter10, inter11, inter12, inter13,
- inter14, inter15);
-
- for (col = 0; col < cols / 8; ++col)
- {
- ref = LD_UB(f);
- f += 8;
- TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5,
- inter6, inter7, inter8, inter9, inter10, inter11,
- inter12, inter13, inter14, inter15);
- if (0 == col)
- {
- above2 = inter2;
- above1 = inter2;
- }
- else
- {
- above2 = inter0;
- above1 = inter1;
- }
-
- src = inter2;
- below1 = inter3;
- below2 = inter4;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
- ref_temp, inter2);
- above2 = inter5;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
- ref_temp, inter3);
- above1 = inter6;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
- ref_temp, inter4);
- src = inter7;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
- VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
- ref_temp, inter5);
- below1 = inter8;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
- VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
- ref_temp, inter6);
- below2 = inter9;
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
- VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
- ref_temp, inter7);
- if (col == (cols / 8 - 1))
- {
- above2 = inter9;
- }
- else
- {
- above2 = inter10;
- }
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
- VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
- ref_temp, inter8);
- if (col == (cols / 8 - 1))
- {
- above1 = inter9;
- }
- else
- {
- above1 = inter11;
- }
- ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
- VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
- ref_temp, inter9);
- VP8_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5,
- inter6, inter7, inter8, inter9,
- inter2, inter3, inter4, inter5,
- inter6, inter7, inter8, inter9,
- inter10, inter11, inter12, inter13,
- inter14, inter15, above2, above1);
-
- p_dst += 8;
- LD_UB2(p_dst, dst_stride, inter0, inter1);
- ST8x1_UB(inter2, p_dst_st);
- ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
- LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
- ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
- ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
- LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
- ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
- ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
- LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
- ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
- ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
- LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
- ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
- ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
- LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
- ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
- ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
- LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
- ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
- ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
- LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
- ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
- ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
- p_dst_st += 8;
- }
-}
-
-void vp8_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
- int32_t src_stride,
- int32_t dst_stride,
- int32_t cols, uint8_t *f,
- int32_t size)
-{
- if (8 == size)
- {
- postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride,
- cols, f);
- }
- else if (16 == size)
- {
- postproc_down_across_luma_msa(src, dst, src_stride, dst_stride,
- cols, f);
- }
-}
-
-void vp8_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
- int32_t rows, int32_t cols, int32_t flimit)
-{
- int32_t row, col, cnt;
- uint8_t *src_dup = src_ptr;
- v16u8 src0, src, tmp_orig;
- v16u8 tmp = { 0 };
- v16i8 zero = { 0 };
- v8u16 sum_h, src_r_h, src_l_h;
- v4u32 src_r_w, src_l_w;
- v4i32 flimit_vec;
-
- flimit_vec = __msa_fill_w(flimit);
- for (row = rows; row--;)
- {
- int32_t sum_sq = 0;
- int32_t sum = 0;
- src0 = (v16u8)__msa_fill_b(src_dup[0]);
- ST8x1_UB(src0, (src_dup - 8));
-
- src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
- ST_UB(src0, src_dup + cols);
- src_dup[cols + 16] = src_dup[cols - 1];
- tmp_orig = (v16u8)__msa_ldi_b(0);
- tmp_orig[15] = tmp[15];
- src = LD_UB(src_dup - 8);
- src[15] = 0;
- ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
- src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
- src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
- sum_sq = HADD_SW_S32(src_r_w);
- sum_sq += HADD_SW_S32(src_l_w);
- sum_h = __msa_hadd_u_h(src, src);
- sum = HADD_UH_U32(sum_h);
- {
- v16u8 src7, src8, src_r, src_l;
- v16i8 mask;
- v8u16 add_r, add_l;
- v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
- v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
- v4i32 sub0, sub1, sub2, sub3;
- v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
- v4i32 mul0, mul1, mul2, mul3;
- v4i32 total0, total1, total2, total3;
- v8i16 const8 = __msa_fill_h(8);
-
- src7 = LD_UB(src_dup + 7);
- src8 = LD_UB(src_dup - 8);
- for (col = 0; col < (cols >> 4); ++col)
- {
- ILVRL_B2_UB(src7, src8, src_r, src_l);
- HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
-
- sum_r[0] = sum + sub_r[0];
- for (cnt = 0; cnt < 7; ++cnt)
- {
- sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
- }
- sum_l[0] = sum_r[7] + sub_l[0];
- for (cnt = 0; cnt < 7; ++cnt)
- {
- sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
- }
- sum = sum_l[7];
- src = LD_UB(src_dup + 16 * col);
- ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
- src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
- src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
- tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
-
- HADD_UB2_UH(src_r, src_l, add_r, add_l);
- UNPCK_SH_SW(sub_r, sub0, sub1);
- UNPCK_SH_SW(sub_l, sub2, sub3);
- ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
- ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
- MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3,
- mul0, mul1, mul2, mul3);
- sum_sq0[0] = sum_sq + mul0[0];
- for (cnt = 0; cnt < 3; ++cnt)
- {
- sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
- }
- sum_sq1[0] = sum_sq0[3] + mul1[0];
- for (cnt = 0; cnt < 3; ++cnt)
- {
- sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
- }
- sum_sq2[0] = sum_sq1[3] + mul2[0];
- for (cnt = 0; cnt < 3; ++cnt)
- {
- sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
- }
- sum_sq3[0] = sum_sq2[3] + mul3[0];
- for (cnt = 0; cnt < 3; ++cnt)
- {
- sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
- }
- sum_sq = sum_sq3[3];
-
- UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
- UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
- total0 = sum_sq0 * __msa_ldi_w(15);
- total0 -= sum0_w * sum0_w;
- total1 = sum_sq1 * __msa_ldi_w(15);
- total1 -= sum1_w * sum1_w;
- total2 = sum_sq2 * __msa_ldi_w(15);
- total2 -= sum2_w * sum2_w;
- total3 = sum_sq3 * __msa_ldi_w(15);
- total3 -= sum3_w * sum3_w;
- total0 = (total0 < flimit_vec);
- total1 = (total1 < flimit_vec);
- total2 = (total2 < flimit_vec);
- total3 = (total3 < flimit_vec);
- PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
- mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
- tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
-
- if (col == 0)
- {
- uint64_t src_d;
-
- src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
- SD(src_d, (src_dup - 8));
- }
-
- src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
- src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
- ST_UB(tmp, (src_dup + (16 * col)));
- }
-
- src_dup += pitch;
- }
- }
-}
-
-void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
- int32_t cols, int32_t flimit)
-{
- int32_t row, col, cnt, i;
- const int16_t *rv3 = &vp8_rv_msa[63 & rand()];
- v4i32 flimit_vec;
- v16u8 dst7, dst8, dst_r_b, dst_l_b;
- v16i8 mask;
- v8u16 add_r, add_l;
- v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
- v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
-
- flimit_vec = __msa_fill_w(flimit);
-
- for (col = 0; col < (cols >> 4); ++col)
- {
- uint8_t *dst_tmp = &dst_ptr[col << 4];
- v16u8 dst;
- v16i8 zero = { 0 };
- v16u8 tmp[16];
- v8i16 mult0, mult1, rv2_0, rv2_1;
- v8i16 sum0_h = { 0 };
- v8i16 sum1_h = { 0 };
- v4i32 mul0 = { 0 };
- v4i32 mul1 = { 0 };
- v4i32 mul2 = { 0 };
- v4i32 mul3 = { 0 };
- v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
- v4i32 add0, add1, add2, add3;
- const int16_t *rv2[16];
-
- dst = LD_UB(dst_tmp);
- for (cnt = (col << 4), i = 0; i < 16; ++cnt)
- {
- rv2[i] = rv3 + ((cnt * 17) & 127);
- ++i;
- }
- for (cnt = -8; cnt < 0; ++cnt)
- {
- ST_UB(dst, dst_tmp + cnt * pitch);
- }
-
- dst = LD_UB((dst_tmp + (rows - 1) * pitch));
- for (cnt = rows; cnt < rows + 17; ++cnt)
- {
- ST_UB(dst, dst_tmp + cnt * pitch);
- }
- for (cnt = -8; cnt <= 6; ++cnt)
- {
- dst = LD_UB(dst_tmp + (cnt * pitch));
- UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
- MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
- mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
- mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
- mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
- mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
- ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
- }
-
- for (row = 0; row < (rows + 8); ++row)
- {
- for (i = 0; i < 8; ++i)
- {
- rv2_0[i] = *(rv2[i] + (row & 127));
- rv2_1[i] = *(rv2[i + 8] + (row & 127));
- }
- dst7 = LD_UB(dst_tmp + (7 * pitch));
- dst8 = LD_UB(dst_tmp - (8 * pitch));
- ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
-
- HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
- UNPCK_SH_SW(sub_r, sub0, sub1);
- UNPCK_SH_SW(sub_l, sub2, sub3);
- sum0_h += sub_r;
- sum1_h += sub_l;
-
- HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
-
- ILVRL_H2_SW(zero, add_r, add0, add1);
- ILVRL_H2_SW(zero, add_l, add2, add3);
- mul0 += add0 * sub0;
- mul1 += add1 * sub1;
- mul2 += add2 * sub2;
- mul3 += add3 * sub3;
- dst = LD_UB(dst_tmp);
- ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
- dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
- dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
- tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);
-
- UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
- UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
- total0 = mul0 * __msa_ldi_w(15);
- total0 -= sum0_w * sum0_w;
- total1 = mul1 * __msa_ldi_w(15);
- total1 -= sum1_w * sum1_w;
- total2 = mul2 * __msa_ldi_w(15);
- total2 -= sum2_w * sum2_w;
- total3 = mul3 * __msa_ldi_w(15);
- total3 -= sum3_w * sum3_w;
- total0 = (total0 < flimit_vec);
- total1 = (total1 < flimit_vec);
- total2 = (total2 < flimit_vec);
- total3 = (total3 < flimit_vec);
- PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
- mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
- tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);
-
- if (row >= 8)
- {
- ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
- }
-
- dst_tmp += pitch;
- }
- }
-}
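
The deleted file above is the VP8-specific MSA vectorization of the deblocker;
an equivalent now lives in vpx_dsp/mips/deblock_msa.c (see the diffstat). For
reference, the per-pixel rule that VP8_AVER_IF_RETAIN vectorizes, written out
as scalar C (a sketch matching the C code deleted from vp8/common/postproc.c
below, not code added by this patch):

    #include <stdlib.h> /* abs() */

    /* Replace a pixel with a rounded average of itself and its four
     * neighbors, but only when every neighbor differs from it by less
     * than the per-column limit; otherwise retain the pixel unchanged. */
    static unsigned char aver_if_retain(unsigned char above2, unsigned char above1,
                                        unsigned char src, unsigned char below1,
                                        unsigned char below2, unsigned char limit) {
      if (abs(src - above2) < limit && abs(src - above1) < limit &&
          abs(src - below1) < limit && abs(src - below2) < limit) {
        const int k1 = (above2 + above1 + 1) >> 1; /* __msa_aver_u_b */
        const int k2 = (below2 + below1 + 1) >> 1;
        const int k3 = (k1 + k2 + 1) >> 1;
        return (unsigned char)((k3 + src + 1) >> 1);
      }
      return src;
    }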
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 6baf00f1e..cd3bb95f1 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -72,142 +72,11 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
};
#endif
-const short vp8_rv[] =
-{
- 8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
- 0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
- 10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
- 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
- 8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
- 1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
- 3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
- 11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
- 14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
- 4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
- 7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
- 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
- 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
- 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
- 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
- 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
- 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
- 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
- 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
- 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
- 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
- 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
- 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
- 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
- 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
- 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
- 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
- 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
- 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
- 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
- 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
- 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
- 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
- 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
- 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
- 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
- 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
- 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
- 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
- 3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
- 11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
- 14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
- 5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
- 0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
-};
extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
/***********************************************************************************************************
*/
-void vp8_post_proc_down_and_across_mb_row_c
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int cols,
- unsigned char *f,
- int size
-)
-{
- unsigned char *p_src, *p_dst;
- int row;
- int col;
- unsigned char v;
- unsigned char d[4];
-
- for (row = 0; row < size; row++)
- {
- /* post_proc_down for one row */
- p_src = src_ptr;
- p_dst = dst_ptr;
-
- for (col = 0; col < cols; col++)
- {
- unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
- unsigned char p_above1 = p_src[col - src_pixels_per_line];
- unsigned char p_below1 = p_src[col + src_pixels_per_line];
- unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
-
- v = p_src[col];
-
- if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
- && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
- {
- unsigned char k1, k2, k3;
- k1 = (p_above2 + p_above1 + 1) >> 1;
- k2 = (p_below2 + p_below1 + 1) >> 1;
- k3 = (k1 + k2 + 1) >> 1;
- v = (k3 + v + 1) >> 1;
- }
-
- p_dst[col] = v;
- }
-
- /* now post_proc_across */
- p_src = dst_ptr;
- p_dst = dst_ptr;
-
- p_src[-2] = p_src[-1] = p_src[0];
- p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
-
- for (col = 0; col < cols; col++)
- {
- v = p_src[col];
-
- if ((abs(v - p_src[col - 2]) < f[col])
- && (abs(v - p_src[col - 1]) < f[col])
- && (abs(v - p_src[col + 1]) < f[col])
- && (abs(v - p_src[col + 2]) < f[col]))
- {
- unsigned char k1, k2, k3;
- k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
- k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
- k3 = (k1 + k2 + 1) >> 1;
- v = (k3 + v + 1) >> 1;
- }
-
- d[col & 3] = v;
-
- if (col >= 2)
- p_dst[col - 2] = d[(col - 2) & 3];
- }
-
- /* handle the last two pixels */
- p_dst[col - 2] = d[(col - 2) & 3];
- p_dst[col - 1] = d[(col - 1) & 3];
-
- /* next row */
- src_ptr += src_pixels_per_line;
- dst_ptr += dst_pixels_per_line;
- }
-}
-
static int q2mbl(int x)
{
if (x < 20) x = 20;
@@ -216,108 +85,13 @@ static int q2mbl(int x)
return x * x / 3;
}
-void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
-{
- int r, c, i;
-
- unsigned char *s = src;
- unsigned char d[16];
-
- for (r = 0; r < rows; r++)
- {
- int sumsq = 0;
- int sum = 0;
-
- for (i = -8; i < 0; i++)
- s[i]=s[0];
-
- /* 17 avoids valgrind warning - we buffer values in c in d
- * and only write them when we've read 8 ahead...
- */
- for (i = 0; i < 17; i++)
- s[i+cols]=s[cols-1];
-
- for (i = -8; i <= 6; i++)
- {
- sumsq += s[i] * s[i];
- sum += s[i];
- d[i+8] = 0;
- }
-
- for (c = 0; c < cols + 8; c++)
- {
- int x = s[c+7] - s[c-8];
- int y = s[c+7] + s[c-8];
-
- sum += x;
- sumsq += x * y;
-
- d[c&15] = s[c];
-
- if (sumsq * 15 - sum * sum < flimit)
- {
- d[c&15] = (8 + sum + s[c]) >> 4;
- }
-
- s[c-8] = d[(c-8)&15];
- }
-
- s += pitch;
- }
-}
-
-void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-{
- int r, c, i;
- const short *rv3 = &vp8_rv[63&rand()];
-
- for (c = 0; c < cols; c++ )
- {
- unsigned char *s = &dst[c];
- int sumsq = 0;
- int sum = 0;
- unsigned char d[16];
- const short *rv2 = rv3 + ((c * 17) & 127);
-
- for (i = -8; i < 0; i++)
- s[i*pitch]=s[0];
-
- /* 17 avoids valgrind warning - we buffer values in c in d
- * and only write them when we've read 8 ahead...
- */
- for (i = 0; i < 17; i++)
- s[(i+rows)*pitch]=s[(rows-1)*pitch];
-
- for (i = -8; i <= 6; i++)
- {
- sumsq += s[i*pitch] * s[i*pitch];
- sum += s[i*pitch];
- }
-
- for (r = 0; r < rows + 8; r++)
- {
- sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch];
- sum += s[7*pitch] - s[-8*pitch];
- d[r&15] = s[0];
-
- if (sumsq * 15 - sum * sum < flimit)
- {
- d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
- }
- if (r >= 8)
- s[-8*pitch] = d[(r-8)&15];
- s += pitch;
- }
- }
-}
-
#if CONFIG_POSTPROC
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
int q)
{
- vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+ vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
- vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+ vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
}
@@ -365,16 +139,16 @@ void vp8_deblock(VP8_COMMON *cm,
}
mode_info_context++;
- vp8_post_proc_down_and_across_mb_row(
+ vpx_post_proc_down_and_across_mb_row(
source->y_buffer + 16 * mbr * source->y_stride,
post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
post->y_stride, source->y_width, ylimits, 16);
- vp8_post_proc_down_and_across_mb_row(
+ vpx_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
post->uv_stride, source->uv_width, uvlimits, 8);
- vp8_post_proc_down_and_across_mb_row(
+ vpx_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
post->uv_stride, source->uv_width, uvlimits, 8);
@@ -409,17 +183,17 @@ void vp8_de_noise(VP8_COMMON *cm,
/* TODO: The original code doesn't filter the 2 outer rows and columns. */
for (mbr = 0; mbr < mb_rows; mbr++)
{
- vp8_post_proc_down_and_across_mb_row(
+ vpx_post_proc_down_and_across_mb_row(
source->y_buffer + 16 * mbr * source->y_stride,
source->y_buffer + 16 * mbr * source->y_stride,
source->y_stride, source->y_stride, source->y_width, limits, 16);
if (uvfilter == 1) {
- vp8_post_proc_down_and_across_mb_row(
+ vpx_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
source->u_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits,
8);
- vp8_post_proc_down_and_across_mb_row(
+ vpx_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
source->v_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits,
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 856ede189..a440352f4 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -156,16 +156,6 @@ $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
# Postproc
#
if (vpx_config("CONFIG_POSTPROC") eq "yes") {
- add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
- specialize qw/vp8_mbpost_proc_down mmx sse2 msa/;
- $vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm;
-
- add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
- specialize qw/vp8_mbpost_proc_across_ip sse2 msa/;
- $vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm;
-
- add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
- specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;
add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
# no asm yet
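
The prototypes dropped here are re-added under the vpx_ prefix in
vpx_dsp/vpx_dsp_rtcd_defs.pl (listed in the diffstat above), which is why the
test and the callers in postproc.c now include ./vpx_dsp_rtcd.h and call the
vpx_-prefixed names. Note that the mmx specialization of vp8_mbpost_proc_down
is retired together with postproc_mmx.asm, deleted below.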
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
deleted file mode 100644
index 61fe8854d..000000000
--- a/vp8/common/x86/postproc_mmx.asm
+++ /dev/null
@@ -1,253 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define VP8_FILTER_WEIGHT 128
-%define VP8_FILTER_SHIFT 7
-
-;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
-extern sym(vp8_rv)
-global sym(vp8_mbpost_proc_down_mmx) PRIVATE
-sym(vp8_mbpost_proc_down_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 136
-
- ; unsigned char d[16][8] at [rsp]
- ; create flimit2 at [rsp+128]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp+128], eax
- mov [rsp+128+4], eax
-%define flimit2 [rsp+128]
-
-%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vp8_rv))]
-%endif
-
- ;rows +=8;
- add dword ptr arg(2), 8
-
- ;for(c=0; c<cols; c+=4)
-.loop_col:
- mov rsi, arg(0) ;s
- pxor mm0, mm0 ;
-
- movsxd rax, dword ptr arg(1) ;pitch ;
-
- ; this copies the last row down into the border 8 rows
- mov rdi, rsi
- mov rdx, arg(2)
- sub rdx, 9
- imul rdx, rax
- lea rdi, [rdi+rdx]
- movq mm1, QWORD ptr[rdi] ; first row
- mov rcx, 8
-.init_borderd: ; initialize borders
- lea rdi, [rdi + rax]
- movq [rdi], mm1
-
- dec rcx
- jne .init_borderd
-
- neg rax ; rax = -pitch
-
- ; this copies the first row up into the border 8 rows
- mov rdi, rsi
- movq mm1, QWORD ptr[rdi] ; first row
- mov rcx, 8
-.init_border: ; initialize borders
- lea rdi, [rdi + rax]
- movq [rdi], mm1
-
- dec rcx
- jne .init_border
-
-
- lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
- neg rax
-
-
- pxor mm5, mm5
- pxor mm6, mm6 ;
-
- pxor mm7, mm7 ;
- mov rdi, rsi
-
- mov rcx, 15 ;
-
-.loop_initvar:
- movd mm1, DWORD PTR [rdi];
- punpcklbw mm1, mm0 ;
-
- paddw mm5, mm1 ;
- pmullw mm1, mm1 ;
-
- movq mm2, mm1 ;
- punpcklwd mm1, mm0 ;
-
- punpckhwd mm2, mm0 ;
- paddd mm6, mm1 ;
-
- paddd mm7, mm2 ;
- lea rdi, [rdi+rax] ;
-
- dec rcx
- jne .loop_initvar
- ;save the var and sum
- xor rdx, rdx
-.loop_row:
- movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
- movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
-
- punpcklbw mm1, mm0
- punpcklbw mm2, mm0
-
- paddw mm5, mm2
- psubw mm5, mm1
-
- pmullw mm2, mm2
- movq mm4, mm2
-
- punpcklwd mm2, mm0
- punpckhwd mm4, mm0
-
- paddd mm6, mm2
- paddd mm7, mm4
-
- pmullw mm1, mm1
- movq mm2, mm1
-
- punpcklwd mm1, mm0
- psubd mm6, mm1
-
- punpckhwd mm2, mm0
- psubd mm7, mm2
-
-
- movq mm3, mm6
- pslld mm3, 4
-
- psubd mm3, mm6
- movq mm1, mm5
-
- movq mm4, mm5
- pmullw mm1, mm1
-
- pmulhw mm4, mm4
- movq mm2, mm1
-
- punpcklwd mm1, mm4
- punpckhwd mm2, mm4
-
- movq mm4, mm7
- pslld mm4, 4
-
- psubd mm4, mm7
-
- psubd mm3, mm1
- psubd mm4, mm2
-
- psubd mm3, flimit2
- psubd mm4, flimit2
-
- psrad mm3, 31
- psrad mm4, 31
-
- packssdw mm3, mm4
- packsswb mm3, mm0
-
- movd mm1, DWORD PTR [rsi+rax*8]
-
- movq mm2, mm1
- punpcklbw mm1, mm0
-
- paddw mm1, mm5
- mov rcx, rdx
-
- and rcx, 127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- push rax
- lea rax, [GLOBAL(sym(vp8_rv))]
- movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
- pop rax
-%elif ABI_IS_32BIT=0
- movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
-%else
- movq mm4, [sym(vp8_rv) + rcx*2]
-%endif
- paddw mm1, mm4
- psraw mm1, 4
-
- packuswb mm1, mm0
- pand mm1, mm3
-
- pandn mm3, mm2
- por mm1, mm3
-
- and rcx, 15
- movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
-
- cmp edx, 8
- jl .skip_assignment
-
- mov rcx, rdx
- sub rcx, 8
- and rcx, 15
- movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
- movd [rsi], mm1
-
-.skip_assignment:
- lea rsi, [rsi+rax]
-
- lea rdi, [rdi+rax]
- add rdx, 1
-
- cmp edx, dword arg(2) ;rows
- jl .loop_row
-
-
- add dword arg(0), 4 ; s += 4
- sub dword arg(3), 4 ; cols -= 4
- cmp dword arg(3), 0
- jg .loop_col
-
- add rsp, 136
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit2
-
-
-SECTION_RODATA
-align 16
-Blur:
- times 16 dw 16
- times 8 dw 64
- times 16 dw 16
- times 8 dw 0
-
-rd:
- times 4 dw 0x40
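
The deleted MMX routine is the column ("down") variant of the macroblock post
filter; only the SSE2 version survives the move, as
vpx_dsp/x86/deblock_sse2.asm (see the rename in the diffstat). For reference,
a scalar model of what the assembly vectorizes, mirroring the C deleted from
vp8/common/postproc.c above; it assumes the caller has already replicated the
first and last rows into the borders, as the asm's init_border loops do:

    /* Column filter: running 15-tap sums walk down each column; a 16-entry
     * ring buffer delays writes by 8 rows so the sums always read
     * unfiltered pixels.  rv2 points into the vp8_rv table, which supplies
     * a pseudo-random rounding offset (0..14) per row. */
    static void mbpost_proc_down_model(unsigned char *dst, int pitch, int rows,
                                       int cols, int flimit, const short *rv2) {
      int r, c, i;
      for (c = 0; c < cols; ++c) {
        unsigned char *s = &dst[c];
        unsigned char d[16];
        int sumsq = 0, sum = 0;
        for (i = -8; i <= 6; ++i) { /* prime the sums from the top border */
          sumsq += s[i * pitch] * s[i * pitch];
          sum += s[i * pitch];
        }
        for (r = 0; r < rows + 8; ++r) {
          sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
          sum += s[7 * pitch] - s[-8 * pitch];
          d[r & 15] = s[0];
          if (sumsq * 15 - sum * sum < flimit)
            d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
          if (r >= 8)
            s[-8 * pitch] = d[(r - 8) & 15];
          s += pitch;
        }
      }
    }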
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 7c362d764..63a918838 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -96,9 +96,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
ifeq ($(CONFIG_POSTPROC),yes)
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
endif
ifeq ($(ARCH_X86_64),yes)
@@ -123,7 +121,6 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
-VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/postproc_msa.c
endif
# common (c)
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 7dd1005d3..b4b120bee 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -103,6 +103,8 @@ void vp9_free_postproc_buffers(VP9_COMMON *cm) {
#if CONFIG_VP9_POSTPROC
vpx_free_frame_buffer(&cm->post_proc_buffer);
vpx_free_frame_buffer(&cm->post_proc_buffer_int);
+ vpx_free(cm->postproc_state.limits);
+ cm->postproc_state.limits = 0;
#else
(void)cm;
#endif
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 5dad81d64..6a71a19fa 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -32,129 +32,9 @@ static const int16_t kernel5[] = {
1, 1, 4, 1, 1
};
-const int16_t vp9_rv[] = {
- 8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
- 0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
- 10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
- 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
- 8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
- 1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
- 3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
- 11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
- 14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
- 4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
- 7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
- 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
- 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
- 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
- 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
- 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
- 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
- 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
- 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
- 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
- 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
- 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
- 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
- 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
- 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
- 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
- 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
- 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
- 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
- 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
- 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
- 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
- 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
- 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
- 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
- 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
- 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
- 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
- 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
- 3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
- 11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
- 14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
- 5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
- 0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
-};
-
static const uint8_t q_diff_thresh = 20;
static const uint8_t last_q_thresh = 170;
-void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
- uint8_t *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int rows,
- int cols,
- int flimit) {
- uint8_t const *p_src;
- uint8_t *p_dst;
- int row, col, i, v, kernel;
- int pitch = src_pixels_per_line;
- uint8_t d[8];
- (void)dst_pixels_per_line;
-
- for (row = 0; row < rows; row++) {
- /* post_proc_down for one row */
- p_src = src_ptr;
- p_dst = dst_ptr;
-
- for (col = 0; col < cols; col++) {
- kernel = 4;
- v = p_src[col];
-
- for (i = -2; i <= 2; i++) {
- if (abs(v - p_src[col + i * pitch]) > flimit)
- goto down_skip_convolve;
-
- kernel += kernel5[2 + i] * p_src[col + i * pitch];
- }
-
- v = (kernel >> 3);
- down_skip_convolve:
- p_dst[col] = v;
- }
-
- /* now post_proc_across */
- p_src = dst_ptr;
- p_dst = dst_ptr;
-
- for (i = 0; i < 8; i++)
- d[i] = p_src[i];
-
- for (col = 0; col < cols; col++) {
- kernel = 4;
- v = p_src[col];
-
- d[col & 7] = v;
-
- for (i = -2; i <= 2; i++) {
- if (abs(v - p_src[col + i]) > flimit)
- goto across_skip_convolve;
-
- kernel += kernel5[2 + i] * p_src[col + i];
- }
-
- d[col & 7] = (kernel >> 3);
- across_skip_convolve:
-
- if (col >= 2)
- p_dst[col - 2] = d[(col - 2) & 7];
- }
-
- /* handle the last two pixels */
- p_dst[col - 2] = d[(col - 2) & 7];
- p_dst[col - 1] = d[(col - 1) & 7];
-
-
- /* next row */
- src_ptr += pitch;
- dst_ptr += pitch;
- }
-}
-
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
uint16_t *dst_ptr,
@@ -237,41 +117,6 @@ static int q2mbl(int x) {
return x * x / 3;
}
-void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
- int rows, int cols, int flimit) {
- int r, c, i;
- uint8_t *s = src;
- uint8_t d[16];
-
- for (r = 0; r < rows; r++) {
- int sumsq = 0;
- int sum = 0;
-
- for (i = -8; i <= 6; i++) {
- sumsq += s[i] * s[i];
- sum += s[i];
- d[i + 8] = 0;
- }
-
- for (c = 0; c < cols + 8; c++) {
- int x = s[c + 7] - s[c - 8];
- int y = s[c + 7] + s[c - 8];
-
- sum += x;
- sumsq += x * y;
-
- d[c & 15] = s[c];
-
- if (sumsq * 15 - sum * sum < flimit) {
- d[c & 15] = (8 + sum + s[c]) >> 4;
- }
-
- s[c - 8] = d[(c - 8) & 15];
- }
- s += pitch;
- }
-}
-
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
int rows, int cols, int flimit) {
@@ -312,43 +157,12 @@ void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
- int rows, int cols, int flimit) {
- int r, c, i;
- const short *rv3 = &vp9_rv[63 & rand()]; // NOLINT
-
- for (c = 0; c < cols; c++) {
- uint8_t *s = &dst[c];
- int sumsq = 0;
- int sum = 0;
- uint8_t d[16];
- const int16_t *rv2 = rv3 + ((c * 17) & 127);
-
- for (i = -8; i <= 6; i++) {
- sumsq += s[i * pitch] * s[i * pitch];
- sum += s[i * pitch];
- }
-
- for (r = 0; r < rows + 8; r++) {
- sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
- sum += s[7 * pitch] - s[-8 * pitch];
- d[r & 15] = s[0];
-
- if (sumsq * 15 - sum * sum < flimit) {
- d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
- }
-
- s[-8 * pitch] = d[(r - 8) & 15];
- s += pitch;
- }
- }
-}
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
int rows, int cols, int flimit) {
int r, c, i;
- const int16_t *rv3 = &vp9_rv[63 & rand()]; // NOLINT
+ const int16_t *rv3 = &vpx_rv[63 & rand()]; // NOLINT
for (c = 0; c < cols; c++) {
uint16_t *s = &dst[c];
@@ -382,14 +196,14 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
- int flag) {
- double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
- int ppl = (int)(level + .5);
+ int flag,
+ uint8_t *limits) {
(void) low_var_thresh;
(void) flag;
-
#if CONFIG_VP9_HIGHBITDEPTH
if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+ int ppl = (int)(level + .5);
vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
CONVERT_TO_SHORTPTR(post->y_buffer),
source->y_stride, post->y_stride,
@@ -415,124 +229,68 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
source->uv_height, source->uv_width,
ppl);
} else {
- vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
- source->y_stride, post->y_stride,
- source->y_height, source->y_width, ppl);
-
- vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ vp9_deblock(source, post, q, limits);
+ vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
-
- vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+ vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
-
- vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
- vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
+#if CONFIG_VP9_HIGHBITDEPTH
}
-#else
- vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
- source->y_stride, post->y_stride,
- source->y_height, source->y_width, ppl);
-
- vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
- post->y_width, q2mbl(q));
-
- vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
- post->y_width, q2mbl(q));
-
- vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
- vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
- int q) {
+ int q, uint8_t *limits) {
const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+ 0.0065 + 0.5);
- int i;
-
- const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
- const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
- const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
- const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
-
- uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
- const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
-
- for (i = 0; i < MAX_MB_PLANE; ++i) {
#if CONFIG_VP9_HIGHBITDEPTH
- assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
- (dst->flags & YV12_FLAG_HIGHBITDEPTH));
- if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int i;
+ const uint8_t * const srcs[3] =
+ {src->y_buffer, src->u_buffer, src->v_buffer};
+ const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+ const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
+ const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
+
+ uint8_t * const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+ const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(srcs[i]),
CONVERT_TO_SHORTPTR(dsts[i]),
src_strides[i], dst_strides[i],
src_heights[i], src_widths[i], ppl);
- } else {
- vp9_post_proc_down_and_across(srcs[i], dsts[i],
- src_strides[i], dst_strides[i],
- src_heights[i], src_widths[i], ppl);
}
-#else
- vp9_post_proc_down_and_across(srcs[i], dsts[i],
- src_strides[i], dst_strides[i],
- src_heights[i], src_widths[i], ppl);
+ } else {
#endif // CONFIG_VP9_HIGHBITDEPTH
+ int mbr;
+ const int mb_rows = src->y_height / 16;
+ const int mb_cols = src->y_width / 16;
+
+ memset(limits, (unsigned char) ppl, 16 * mb_cols);
+
+ for (mbr = 0; mbr < mb_rows; mbr++) {
+ vpx_post_proc_down_and_across_mb_row(
+ src->y_buffer + 16 * mbr * src->y_stride,
+ dst->y_buffer + 16 * mbr * dst->y_stride, src->y_stride,
+ dst->y_stride, src->y_width, limits, 16);
+ vpx_post_proc_down_and_across_mb_row(
+ src->u_buffer + 8 * mbr * src->uv_stride,
+ dst->u_buffer + 8 * mbr * dst->uv_stride, src->uv_stride,
+ dst->uv_stride, src->uv_width, limits, 8);
+ vpx_post_proc_down_and_across_mb_row(
+ src->v_buffer + 8 * mbr * src->uv_stride,
+ dst->v_buffer + 8 * mbr * dst->uv_stride, src->uv_stride,
+ dst->uv_stride, src->uv_width, limits, 8);
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
- int q) {
- const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
- + 0.0065 + 0.5);
- int i;
-
- const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
- const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
- const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
- const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
-
- uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
- const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
-
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- const int src_stride = src_strides[i];
- const int src_width = src_widths[i] - 4;
- const int src_height = src_heights[i] - 4;
- const int dst_stride = dst_strides[i];
-
-#if CONFIG_VP9_HIGHBITDEPTH
- assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
- (dst->flags & YV12_FLAG_HIGHBITDEPTH));
- if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- const uint16_t *const src_plane = CONVERT_TO_SHORTPTR(
- srcs[i] + 2 * src_stride + 2);
- uint16_t *const dst_plane = CONVERT_TO_SHORTPTR(
- dsts[i] + 2 * dst_stride + 2);
- vp9_highbd_post_proc_down_and_across(src_plane, dst_plane, src_stride,
- dst_stride, src_height, src_width,
- ppl);
- } else {
- const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
-
- vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride,
- dst_stride, src_height, src_width, ppl);
- }
-#else
- const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
- vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, dst_stride,
- src_height, src_width, ppl);
-#endif
- }
+ int q, uint8_t *limits) {
+ vp9_deblock(src, dst, q, limits);
}
static double gaussian(double sigma, double mu, double x) {
@@ -664,6 +422,14 @@ int vp9_post_proc_frame(struct VP9Common *cm,
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate post-processing buffer");
+ if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) {
+ if (!cm->postproc_state.limits) {
+ cm->postproc_state.limits = vpx_calloc(
+ cm->width, sizeof(*cm->postproc_state.limits));
+ }
+ }
+
if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
ppstate->last_frame_valid && cm->bit_depth == 8 &&
ppstate->last_base_qindex <= last_q_thresh &&
@@ -678,17 +444,19 @@ int vp9_post_proc_frame(struct VP9Common *cm,
if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
q + (ppflags->deblocking_level - 5) * 10,
- 1, 0);
+ 1, 0, cm->postproc_state.limits);
} else if (flags & VP9D_DEBLOCK) {
- vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
+ vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q,
+ cm->postproc_state.limits);
} else {
vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
}
} else if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
- q + (ppflags->deblocking_level - 5) * 10, 1, 0);
+ q + (ppflags->deblocking_level - 5) * 10, 1, 0,
+ cm->postproc_state.limits);
} else if (flags & VP9D_DEBLOCK) {
- vp9_deblock(cm->frame_to_show, ppbuf, q);
+ vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits);
} else {
vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
}
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index 035c9cdf8..60e6f5232 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -33,6 +33,7 @@ struct postproc_state {
DECLARE_ALIGNED(16, char, blackclamp[16]);
DECLARE_ALIGNED(16, char, whiteclamp[16]);
DECLARE_ALIGNED(16, char, bothclamp[16]);
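+  /* one deblocking-limit byte per pixel column, allocated on first use */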
+ uint8_t *limits;
};
struct VP9Common;
@@ -42,9 +43,11 @@ struct VP9Common;
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
-void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
+ uint8_t *limits);
-void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
+ uint8_t *limits);
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 276f14554..f315a3b85 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -35,18 +35,6 @@ if ($opts{arch} eq "x86_64") {
# post proc
#
if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
-add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
-specialize qw/vp9_mbpost_proc_down sse2/;
-$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;
-
-add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
-specialize qw/vp9_mbpost_proc_across_ip sse2/;
-$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;
-
-add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
-specialize qw/vp9_post_proc_down_and_across sse2/;
-$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
-
add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm
deleted file mode 100644
index 430762815..000000000
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ /dev/null
@@ -1,632 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_post_proc_down_and_across_xmm
-;(
-; unsigned char *src_ptr,
-; unsigned char *dst_ptr,
-; int src_pixels_per_line,
-; int dst_pixels_per_line,
-; int rows,
-; int cols,
-; int flimit
-;)
-global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
-sym(vp9_post_proc_down_and_across_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- ALIGN_STACK 16, rax
- ; move the global rd onto the stack, since we don't have enough registers
- ; to do PIC addressing
- movdqa xmm0, [GLOBAL(rd42)]
- sub rsp, 16
- movdqa [rsp], xmm0
-%define RD42 [rsp]
-%else
-%define RD42 [GLOBAL(rd42)]
-%endif
-
-
- movd xmm2, dword ptr arg(6) ;flimit
- punpcklwd xmm2, xmm2
- punpckldq xmm2, xmm2
- punpcklqdq xmm2, xmm2
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
-
- movsxd rcx, DWORD PTR arg(4) ;rows
- movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
- pxor xmm0, xmm0 ; mm0 = 00000000
-
-.nextrow:
-
- xor rdx, rdx ; clear out rdx for use as loop counter
-.nextcol:
- movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
- punpcklbw xmm3, xmm0 ; mm3 = p0..p3
- movdqa xmm1, xmm3 ; mm1 = p0..p3
- psllw xmm3, 2 ;
-
- movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7
- punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3
- paddusw xmm3, xmm5 ; mm3 += mm6
-
- ; thresholding
- movdqa xmm7, xmm1 ; mm7 = r0 p0..p3
- psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3
- psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3
- paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw xmm7, xmm2
-
- movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
- punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3
- psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
-
- neg rax
- movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7
- punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7
- punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3
- paddusw xmm3, xmm4 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
- psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3
- paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
-
- paddusw xmm3, RD42 ; mm3 += round value
- psraw xmm3, 3 ; mm3 /= 8
-
- pand xmm1, xmm7 ; mm1 select vals > thresh from source
- pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ; combination
-
- packuswb xmm1, xmm0 ; pack to bytes
- movq QWORD PTR [rdi], xmm1 ;
-
- neg rax ; pitch is positive
- add rsi, 8
- add rdi, 8
-
- add rdx, 8
- cmp edx, dword arg(5) ;cols
-
- jl .nextcol
-
- ; done with the all cols, start the across filtering in place
- sub rsi, rdx
- sub rdi, rdx
-
- xor rdx, rdx
- movq mm0, QWORD PTR [rdi-8];
-
-.acrossnextcol:
- movq xmm7, QWORD PTR [rdi +rdx -2]
- movd xmm4, DWORD PTR [rdi +rdx +6]
-
- pslldq xmm4, 8
- por xmm4, xmm7
-
- movdqa xmm3, xmm4
- psrldq xmm3, 2
- punpcklbw xmm3, xmm0 ; mm3 = p0..p3
- movdqa xmm1, xmm3 ; mm1 = p0..p3
- psllw xmm3, 2
-
-
- movdqa xmm5, xmm4
- psrldq xmm5, 3
- punpcklbw xmm5, xmm0 ; mm5 = p1..p4
- paddusw xmm3, xmm5 ; mm3 += mm6
-
- ; thresholding
- movdqa xmm7, xmm1 ; mm7 = p0..p3
- psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm7, xmm2
-
- movdqa xmm5, xmm4
- psrldq xmm5, 4
- punpcklbw xmm5, xmm0 ; mm5 = p2..p5
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = p0..p3
- psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
-
- movdqa xmm5, xmm4 ; mm5 = p-2..p5
- punpcklbw xmm5, xmm0 ; mm5 = p-2..p1
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = p0..p3
- psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- psrldq xmm4, 1 ; mm4 = p-1..p5
- punpcklbw xmm4, xmm0 ; mm4 = p-1..p2
- paddusw xmm3, xmm4 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = p0..p3
- psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4
- psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- paddusw xmm3, RD42 ; mm3 += round value
- psraw xmm3, 3 ; mm3 /= 8
-
- pand xmm1, xmm7 ; mm1 select vals > thresh from source
- pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ; combination
-
- packuswb xmm1, xmm0 ; pack to bytes
- movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes
- movdq2q mm0, xmm1
-
- add rdx, 8
- cmp edx, dword arg(5) ;cols
- jl .acrossnextcol;
-
- ; last 8 pixels
- movq QWORD PTR [rdi+rdx-8], mm0
-
- ; done with this rwo
- add rsi,rax ; next line
- mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
- add rdi,rax ; next destination
- mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- add rsp,16
- pop rsp
-%endif
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef RD42
-
-
-;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_xmm) PRIVATE
-sym(vp9_mbpost_proc_down_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 128+16
-
- ; unsigned char d[16][8] at [rsp]
- ; create flimit2 at [rsp+128]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp+128], eax
- mov [rsp+128+4], eax
- mov [rsp+128+8], eax
- mov [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vp9_rv))]
-%endif
-
- ;rows +=8;
- add dword arg(2), 8
-
- ;for(c=0; c<cols; c+=8)
-.loop_col:
- mov rsi, arg(0) ; s
- pxor xmm0, xmm0 ;
-
- movsxd rax, dword ptr arg(1) ;pitch ;
- neg rax ; rax = -pitch
-
- lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
- neg rax
-
-
- pxor xmm5, xmm5
- pxor xmm6, xmm6 ;
-
- pxor xmm7, xmm7 ;
- mov rdi, rsi
-
- mov rcx, 15 ;
-
-.loop_initvar:
- movq xmm1, QWORD PTR [rdi];
- punpcklbw xmm1, xmm0 ;
-
- paddw xmm5, xmm1 ;
- pmullw xmm1, xmm1 ;
-
- movdqa xmm2, xmm1 ;
- punpcklwd xmm1, xmm0 ;
-
- punpckhwd xmm2, xmm0 ;
- paddd xmm6, xmm1 ;
-
- paddd xmm7, xmm2 ;
- lea rdi, [rdi+rax] ;
-
- dec rcx
- jne .loop_initvar
- ;save the var and sum
- xor rdx, rdx
-.loop_row:
- movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
- movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- paddw xmm5, xmm2
- psubw xmm5, xmm1
-
- pmullw xmm2, xmm2
- movdqa xmm4, xmm2
-
- punpcklwd xmm2, xmm0
- punpckhwd xmm4, xmm0
-
- paddd xmm6, xmm2
- paddd xmm7, xmm4
-
- pmullw xmm1, xmm1
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm0
- psubd xmm6, xmm1
-
- punpckhwd xmm2, xmm0
- psubd xmm7, xmm2
-
-
- movdqa xmm3, xmm6
- pslld xmm3, 4
-
- psubd xmm3, xmm6
- movdqa xmm1, xmm5
-
- movdqa xmm4, xmm5
- pmullw xmm1, xmm1
-
- pmulhw xmm4, xmm4
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm4
- punpckhwd xmm2, xmm4
-
- movdqa xmm4, xmm7
- pslld xmm4, 4
-
- psubd xmm4, xmm7
-
- psubd xmm3, xmm1
- psubd xmm4, xmm2
-
- psubd xmm3, flimit4
- psubd xmm4, flimit4
-
- psrad xmm3, 31
- psrad xmm4, 31
-
- packssdw xmm3, xmm4
- packsswb xmm3, xmm0
-
- movq xmm1, QWORD PTR [rsi+rax*8]
-
- movq xmm2, xmm1
- punpcklbw xmm1, xmm0
-
- paddw xmm1, xmm5
- mov rcx, rdx
-
- and rcx, 127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- push rax
- lea rax, [GLOBAL(sym(vp9_rv))]
- movdqu xmm4, [rax + rcx*2] ;vp9_rv[rcx*2]
- pop rax
-%elif ABI_IS_32BIT=0
- movdqu xmm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
- movdqu xmm4, [sym(vp9_rv) + rcx*2]
-%endif
-
- paddw xmm1, xmm4
- ;paddw xmm1, eight8s
- psraw xmm1, 4
-
- packuswb xmm1, xmm0
- pand xmm1, xmm3
-
- pandn xmm3, xmm2
- por xmm1, xmm3
-
- and rcx, 15
- movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
-
- mov rcx, rdx
- sub rcx, 8
-
- and rcx, 15
- movq mm0, [rsp + rcx*8] ;d[rcx*8]
-
- movq [rsi], mm0
- lea rsi, [rsi+rax]
-
- lea rdi, [rdi+rax]
- add rdx, 1
-
- cmp edx, dword arg(2) ;rows
- jl .loop_row
-
- add dword arg(0), 8 ; s += 8
- sub dword arg(3), 8 ; cols -= 8
- cmp dword arg(3), 0
- jg .loop_col
-
- add rsp, 128+16
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit4
-
-
-;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
-; int pitch, int rows, int cols,int flimit)
-global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
-sym(vp9_mbpost_proc_across_ip_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16
-
- ; create flimit4 at [rsp]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp], eax
- mov [rsp+4], eax
- mov [rsp+8], eax
- mov [rsp+12], eax
-%define flimit4 [rsp]
-
-
- ;for(r=0;r<rows;r++)
-.ip_row_loop:
-
- xor rdx, rdx ;sumsq=0;
- xor rcx, rcx ;sum=0;
- mov rsi, arg(0); s
- mov rdi, -8
-.ip_var_loop:
- ;for(i=-8;i<=6;i++)
- ;{
- ; sumsq += s[i]*s[i];
- ; sum += s[i];
- ;}
- movzx eax, byte [rsi+rdi]
- add ecx, eax
- mul al
- add edx, eax
- add rdi, 1
- cmp rdi, 6
- jle .ip_var_loop
-
-
- ;mov rax, sumsq
- ;movd xmm7, rax
- movd xmm7, edx
-
- ;mov rax, sum
- ;movd xmm6, rax
- movd xmm6, ecx
-
- mov rsi, arg(0) ;s
- xor rcx, rcx
-
- movsxd rdx, dword arg(3) ;cols
- add rdx, 8
- pxor mm0, mm0
- pxor mm1, mm1
-
- pxor xmm0, xmm0
-.nextcol4:
-
- movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
- movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
-
- punpcklbw xmm1, xmm0 ; expanding
- punpcklbw xmm2, xmm0 ; expanding
-
- punpcklwd xmm1, xmm0 ; expanding to dwords
- punpcklwd xmm2, xmm0 ; expanding to dwords
-
- psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
- paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
-
- paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
- pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
-
- paddd xmm6, xmm2
- paddd xmm7, xmm1
-
- pshufd xmm6, xmm6, 0 ; duplicate the last ones
- pshufd xmm7, xmm7, 0 ; duplicate the last ones
-
- psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
- psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
-
- pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
- pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
-
- paddd xmm6, xmm4
- paddd xmm7, xmm3
-
- pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
- pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
- pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- movdqa xmm3, xmm6
- pmaddwd xmm3, xmm3
-
- movdqa xmm5, xmm7
- pslld xmm5, 4
-
- psubd xmm5, xmm7
- psubd xmm5, xmm3
-
- psubd xmm5, flimit4
- psrad xmm5, 31
-
- packssdw xmm5, xmm0
- packsswb xmm5, xmm0
-
- movd xmm1, DWORD PTR [rsi+rcx]
- movq xmm2, xmm1
-
- punpcklbw xmm1, xmm0
- punpcklwd xmm1, xmm0
-
- paddd xmm1, xmm6
- paddd xmm1, [GLOBAL(four8s)]
-
- psrad xmm1, 4
- packssdw xmm1, xmm0
-
- packuswb xmm1, xmm0
- pand xmm1, xmm5
-
- pandn xmm5, xmm2
- por xmm5, xmm1
-
- movd [rsi+rcx-8], mm0
- movq mm0, mm1
-
- movdq2q mm1, xmm5
- psrldq xmm7, 12
-
- psrldq xmm6, 12
- add rcx, 4
-
- cmp rcx, rdx
- jl .nextcol4
-
- ;s+=pitch;
- movsxd rax, dword arg(1)
- add arg(0), rax
-
- sub dword arg(2), 1 ;rows-=1
- cmp dword arg(2), 0
- jg .ip_row_loop
-
- add rsp, 16
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit4
-
-
-SECTION_RODATA
-align 16
-rd42:
- times 8 dw 0x04
-four8s:
- times 4 dd 8
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 9413a436f..f113d240f 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3114,7 +3114,11 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
l = 150;
break;
}
- vp9_denoise(cpi->Source, cpi->Source, l);
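+ /* Allocate one deblocking-limit byte per pixel column on first use. */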
+ if (!cpi->common.postproc_state.limits) {
+ cpi->common.postproc_state.limits = vpx_calloc(
+ cpi->common.width, sizeof(*cpi->common.postproc_state.limits));
+ }
+ vp9_denoise(cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits);
}
#endif // CONFIG_VP9_POSTPROC
}
@@ -4914,7 +4918,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
vp9_deblock(cm->frame_to_show, pp,
- cm->lf.filter_level * 10 / 6);
+ cm->lf.filter_level * 10 / 6, cm->postproc_state.limits);
#endif
vpx_clear_system_state();
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index a7871d6ec..2fd42960e 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -67,7 +67,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
new file mode 100644
index 000000000..411bc7754
--- /dev/null
+++ b/vpx_dsp/deblock.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "vpx/vpx_integer.h"
+
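+/* Pseudo-random rounding offsets (0..14), indexed modulo 128, used by
+ * vpx_mbpost_proc_down() to dither the smoothed values. */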
+const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3,
+ 14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+ 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13,
+ 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10,
+ 8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5,
+ 4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3,
+ 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0,
+ 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7,
+ 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+ 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3,
+ 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10,
+ 8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2,
+ 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13,
+ 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11,
+ 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14,
+ 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+ 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6,
+ 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13,
+ 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12,
+ 13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, };
+
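+/* Each output pixel is blended with the rounded average of its four
+ * vertical (down pass) or horizontal (across pass) neighbors, but only
+ * when every neighbor differs from it by less than the per-column limit
+ * f[col], so blocking artifacts are smoothed while real edges survive. */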
+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line, int cols,
+ unsigned char *f, int size) {
+ unsigned char *p_src, *p_dst;
+ int row;
+ int col;
+ unsigned char v;
+ unsigned char d[4];
+
+ for (row = 0; row < size; row++) {
+ /* post_proc_down for one row */
+ p_src = src_ptr;
+ p_dst = dst_ptr;
+
+ for (col = 0; col < cols; col++) {
+ unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
+ unsigned char p_above1 = p_src[col - src_pixels_per_line];
+ unsigned char p_below1 = p_src[col + src_pixels_per_line];
+ unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+
+ v = p_src[col];
+
+ if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
+ && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
+ unsigned char k1, k2, k3;
+ k1 = (p_above2 + p_above1 + 1) >> 1;
+ k2 = (p_below2 + p_below1 + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ p_dst[col] = v;
+ }
+
+ /* now post_proc_across */
+ p_src = dst_ptr;
+ p_dst = dst_ptr;
+
+ p_src[-2] = p_src[-1] = p_src[0];
+ p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
+
+ for (col = 0; col < cols; col++) {
+ v = p_src[col];
+
+ if ((abs(v - p_src[col - 2]) < f[col])
+ && (abs(v - p_src[col - 1]) < f[col])
+ && (abs(v - p_src[col + 1]) < f[col])
+ && (abs(v - p_src[col + 2]) < f[col])) {
+ unsigned char k1, k2, k3;
+ k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
+ k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ d[col & 3] = v;
+
+ if (col >= 2)
+ p_dst[col - 2] = d[(col - 2) & 3];
+ }
+
+ /* handle the last two pixels */
+ p_dst[col - 2] = d[(col - 2) & 3];
+ p_dst[col - 1] = d[(col - 1) & 3];
+
+ /* next row */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += dst_pixels_per_line;
+ }
+}
+
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
+ int cols, int flimit) {
+ int r, c, i;
+
+ unsigned char *s = src;
+ unsigned char d[16];
+
+ for (r = 0; r < rows; r++) {
+ int sumsq = 0;
+ int sum = 0;
+
+ for (i = -8; i < 0; i++)
+ s[i] = s[0];
+
+ /* Pad 17 pixels past the end so the 15-tap window never reads
+ * uninitialized memory (this also keeps valgrind quiet); filtered
+ * values are buffered in d[] and written back 8 pixels behind.
+ */
+ for (i = 0; i < 17; i++)
+ s[i + cols] = s[cols - 1];
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i] * s[i];
+ sum += s[i];
+ d[i + 8] = 0;
+ }
+
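+ /* Slide a 15-tap window along the row, updating sum and sumsq
+ * incrementally; 15 * sumsq - sum * sum equals 225 times the window
+ * variance, so only flat regions below flimit are smoothed. */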
+ for (c = 0; c < cols + 8; c++) {
+ int x = s[c + 7] - s[c - 8];
+ int y = s[c + 7] + s[c - 8];
+
+ sum += x;
+ sumsq += x * y;
+
+ d[c & 15] = s[c];
+
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[c & 15] = (8 + sum + s[c]) >> 4;
+ }
+
+ s[c - 8] = d[(c - 8) & 15];
+ }
+
+ s += pitch;
+ }
+}
+
+void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
+ int flimit) {
+ int r, c, i;
+ unsigned int seed = 1; /* rand_r() needs an initialized seed */
+ const int16_t *rv3 = &vpx_rv[63 & rand_r(&seed)];
+
+ for (c = 0; c < cols; c++) {
+ unsigned char *s = &dst[c];
+ int sumsq = 0;
+ int sum = 0;
+ unsigned char d[16];
+ const int16_t *rv2 = rv3 + ((c * 17) & 127);
+
+ for (i = -8; i < 0; i++)
+ s[i * pitch] = s[0];
+
+ /* Pad 17 rows below so the sliding window never reads uninitialized
+ * memory (this also keeps valgrind quiet); filtered values are
+ * buffered in d[] and written back 8 rows behind.
+ */
+ for (i = 0; i < 17; i++)
+ s[(i + rows) * pitch] = s[(rows - 1) * pitch];
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i * pitch] * s[i * pitch];
+ sum += s[i * pitch];
+ }
+
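+ /* Same variance-gated 15-tap smoothing as the across pass, but down
+ * each column, with a vpx_rv dither offset in place of the fixed
+ * rounding constant of 8. */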
+ for (r = 0; r < rows + 8; r++) {
+ sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+ sum += s[7 * pitch] - s[-8 * pitch];
+ d[r & 15] = s[0];
+
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+ }
+ if (r >= 8)
+ s[-8 * pitch] = d[(r - 8) & 15];
+ s += pitch;
+ }
+ }
+}
+
+#if CONFIG_POSTPROC
+static void vpx_de_mblock(YV12_BUFFER_CONFIG *post, int q) {
+ vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+ vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+}
+
+#endif
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
new file mode 100644
index 000000000..616721d8e
--- /dev/null
+++ b/vpx_dsp/mips/deblock_msa.c
@@ -0,0 +1,683 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+extern const int16_t vpx_rv[];
+
+#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7, \
+ out8, out9, out10, out11, \
+ out12, out13, out14, out15) \
+{ \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
+ temp0, temp1, temp2, temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
+ ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
+ temp0, temp1, temp2, temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out8, out10); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out12, out14); \
+ out0 = (v16u8)temp6; \
+ out2 = (v16u8)temp7; \
+ out4 = (v16u8)temp8; \
+ out6 = (v16u8)temp9; \
+ out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
+ out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
+ out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
+ out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
+ out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+ out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
+ out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
+}
+
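+/* Vector form of the scalar threshold test: out gets the 5-tap average,
+ * then __msa_bmz_v restores src_in in every byte lane where any neighbor
+ * difference reached ref. */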
+#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, \
+ below1_in, below2_in, ref, out) \
+{ \
+ v16u8 temp0, temp1; \
+ \
+ temp1 = __msa_aver_u_b(above2_in, above1_in); \
+ temp0 = __msa_aver_u_b(below2_in, below1_in); \
+ temp1 = __msa_aver_u_b(temp1, temp0); \
+ out = __msa_aver_u_b(src_in, temp1); \
+ temp0 = __msa_asub_u_b(src_in, above2_in); \
+ temp1 = __msa_asub_u_b(src_in, above1_in); \
+ temp0 = (temp0 < ref); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below1_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below2_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ out = __msa_bmz_v(out, src_in, temp0); \
+}
+
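+/* 12x16 byte transpose used by the luma across pass; the transposed rows
+ * land in in0..in11. */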
+#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15) \
+{ \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
+ ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
+ ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
+ ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
+ ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
+ ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \
+ temp2, temp3, temp4, temp5); \
+ ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \
+ temp6, temp7, temp8, temp9); \
+ ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
+ in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
+ ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
+ in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
+}
+
+#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \
+ in6, in7, in8, in9, in10, in11) \
+{ \
+ v8i16 temp0, temp1, temp2, temp3; \
+ v8i16 temp4, temp5, temp6, temp7; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
+ temp4 = __msa_ilvr_h(temp5, temp4); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
+ temp5 = __msa_ilvr_h(temp7, temp6); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ in0 = (v16u8)temp0; \
+ in2 = (v16u8)temp1; \
+ in4 = (v16u8)temp2; \
+ in6 = (v16u8)temp3; \
+ in8 = (v16u8)temp6; \
+ in10 = (v16u8)temp7; \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
+}
+
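+/* Chroma (8-row) macroblock row: filter down 16 columns per iteration,
+ * then transpose and reuse the same kernel for the across pass. */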
+static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f) {
+ uint8_t *p_src = src_ptr;
+ uint8_t *p_dst = dst_ptr;
+ uint8_t *f_orig = f;
+ uint8_t *p_dst_st = dst_ptr;
+ uint16_t col;
+ uint64_t out0, out1, out2, out3;
+ v16u8 above2, above1, below2, below1, src, ref, ref_temp;
+ v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
+ v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
+
+ for (col = (cols / 16); col--;) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+ p_dst, dst_stride);
+
+ p_dst += 16;
+ p_src += 16;
+ f += 16;
+ }
+
+ if (0 != (cols % 16)) { /* filter the remaining 8 columns */
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ out0 = __msa_copy_u_d((v2i64) inter0, 0);
+ out1 = __msa_copy_u_d((v2i64) inter1, 0);
+ out2 = __msa_copy_u_d((v2i64) inter2, 0);
+ out3 = __msa_copy_u_d((v2i64) inter3, 0);
+ SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64) inter4, 0);
+ out1 = __msa_copy_u_d((v2i64) inter5, 0);
+ out2 = __msa_copy_u_d((v2i64) inter6, 0);
+ out3 = __msa_copy_u_d((v2i64) inter7, 0);
+ SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+ }
+
+ f = f_orig;
+ p_dst = dst_ptr - 2;
+ LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7);
+
+ for (col = 0; col < (cols / 8); ++col) {
+ ref = LD_UB(f);
+ f += 8;
+ VPX_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7, inter8, inter9, inter10, inter11);
+ if (0 == col) {
+ above2 = inter2;
+ above1 = inter2;
+ } else {
+ above2 = inter0;
+ above1 = inter1;
+ }
+ src = inter2;
+ below1 = inter3;
+ below2 = inter4;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+ above2 = inter5;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+ above1 = inter6;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+ src = inter7;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+ below1 = inter8;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+ below2 = inter9;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+ if (col == (cols / 8 - 1)) {
+ above2 = inter9;
+ } else {
+ above2 = inter10;
+ }
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+ if (col == (cols / 8 - 1)) {
+ above1 = inter9;
+ } else {
+ above1 = inter11;
+ }
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+ TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
+ inter9, inter2, inter3, inter4, inter5, inter6, inter7,
+ inter8, inter9);
+ p_dst += 8;
+ LD_UB2(p_dst, dst_stride, inter0, inter1);
+ ST8x1_UB(inter2, p_dst_st);
+ ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+ LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+ ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+ ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+ LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+ ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+ ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+ LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+ ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+ ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+ p_dst_st += 8;
+ }
+}
+
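+/* Luma (16-row) variant of the chroma routine above. */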
+static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f) {
+ uint8_t *p_src = src_ptr;
+ uint8_t *p_dst = dst_ptr;
+ uint8_t *p_dst_st = dst_ptr;
+ uint8_t *f_orig = f;
+ uint16_t col;
+ v16u8 above2, above1, below2, below1;
+ v16u8 src, ref, ref_temp;
+ v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
+ v16u8 inter7, inter8, inter9, inter10, inter11;
+ v16u8 inter12, inter13, inter14, inter15;
+
+ for (col = (cols / 16); col--;) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ src = LD_UB(p_src + 10 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+ below1 = LD_UB(p_src + 11 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+ below2 = LD_UB(p_src + 12 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+ above2 = LD_UB(p_src + 13 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+ above1 = LD_UB(p_src + 14 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+ src = LD_UB(p_src + 15 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+ below1 = LD_UB(p_src + 16 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+ below2 = LD_UB(p_src + 17 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+ ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+ p_dst, dst_stride);
+ ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
+ p_dst + 8 * dst_stride, dst_stride);
+ p_src += 16;
+ p_dst += 16;
+ f += 16;
+ }
+
+ f = f_orig;
+ p_dst = dst_ptr - 2;
+ LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7);
+ LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
+ inter12, inter13, inter14, inter15);
+
+ for (col = 0; col < cols / 8; ++col) {
+ ref = LD_UB(f);
+ f += 8;
+ TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
+ inter7, inter8, inter9, inter10, inter11, inter12, inter13,
+ inter14, inter15);
+ if (0 == col) {
+ above2 = inter2;
+ above1 = inter2;
+ } else {
+ above2 = inter0;
+ above1 = inter1;
+ }
+
+ src = inter2;
+ below1 = inter3;
+ below2 = inter4;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+ above2 = inter5;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+ above1 = inter6;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+ src = inter7;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+ below1 = inter8;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+ below2 = inter9;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+ if (col == (cols / 8 - 1)) {
+ above2 = inter9;
+ } else {
+ above2 = inter10;
+ }
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+ if (col == (cols / 8 - 1)) {
+ above1 = inter9;
+ } else {
+ above1 = inter11;
+ }
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+ VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
+ inter8, inter9, inter2, inter3, inter4, inter5,
+ inter6, inter7, inter8, inter9, inter10, inter11,
+ inter12, inter13, inter14, inter15, above2, above1);
+
+ p_dst += 8;
+ LD_UB2(p_dst, dst_stride, inter0, inter1);
+ ST8x1_UB(inter2, p_dst_st);
+ ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+ LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+ ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+ ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+ LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+ ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+ ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+ LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+ ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+ ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+ LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
+ ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
+ ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
+ LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
+ ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
+ ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
+ LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
+ ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
+ ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
+ LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
+ ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
+ ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
+ p_dst_st += 8;
+ }
+}
+
+void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f, int32_t size) {
+ if (8 == size) {
+ postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f);
+ } else if (16 == size) {
+ postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
+ }
+}
+
+void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
+ int32_t rows, int32_t cols, int32_t flimit) {
+ int32_t row, col, cnt;
+ uint8_t *src_dup = src_ptr;
+ v16u8 src0, src, tmp_orig;
+ v16u8 tmp = {0};
+ v16i8 zero = {0};
+ v8u16 sum_h, src_r_h, src_l_h;
+ v4u32 src_r_w, src_l_w;
+ v4i32 flimit_vec;
+
+ flimit_vec = __msa_fill_w(flimit);
+ for (row = rows; row--;) {
+ int32_t sum_sq = 0;
+ int32_t sum = 0;
+ src0 = (v16u8) __msa_fill_b(src_dup[0]);
+ ST8x1_UB(src0, (src_dup - 8));
+
+ src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]);
+ ST_UB(src0, src_dup + cols);
+ src_dup[cols + 16] = src_dup[cols - 1];
+ tmp_orig = (v16u8) __msa_ldi_b(0);
+ tmp_orig[15] = tmp[15];
+ src = LD_UB(src_dup - 8);
+ src[15] = 0;
+ ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+ src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
+ src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
+ sum_sq = HADD_SW_S32(src_r_w);
+ sum_sq += HADD_SW_S32(src_l_w);
+ sum_h = __msa_hadd_u_h(src, src);
+ sum = HADD_UH_U32(sum_h);
+ {
+ v16u8 src7, src8, src_r, src_l;
+ v16i8 mask;
+ v8u16 add_r, add_l;
+ v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
+ v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
+ v4i32 sub0, sub1, sub2, sub3;
+ v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+ v4i32 mul0, mul1, mul2, mul3;
+ v4i32 total0, total1, total2, total3;
+ v8i16 const8 = __msa_fill_h(8);
+
+ src7 = LD_UB(src_dup + 7);
+ src8 = LD_UB(src_dup - 8);
+ for (col = 0; col < (cols >> 4); ++col) {
+ ILVRL_B2_UB(src7, src8, src_r, src_l);
+ HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
+
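+ /* The scalar running sums are inherently serial, so per-lane prefix
+ * sums of the s[c + 7] - s[c - 8] differences rebuild all 16 column
+ * sums at once. */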
+ sum_r[0] = sum + sub_r[0];
+ for (cnt = 0; cnt < 7; ++cnt) {
+ sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
+ }
+ sum_l[0] = sum_r[7] + sub_l[0];
+ for (cnt = 0; cnt < 7; ++cnt) {
+ sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
+ }
+ sum = sum_l[7];
+ src = LD_UB(src_dup + 16 * col);
+ ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+ src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4);
+ src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4);
+ tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7);
+
+ HADD_UB2_UH(src_r, src_l, add_r, add_l);
+ UNPCK_SH_SW(sub_r, sub0, sub1);
+ UNPCK_SH_SW(sub_l, sub2, sub3);
+ ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
+ ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
+ MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1,
+ mul2, mul3);
+ sum_sq0[0] = sum_sq + mul0[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
+ }
+ sum_sq1[0] = sum_sq0[3] + mul1[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
+ }
+ sum_sq2[0] = sum_sq1[3] + mul2[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
+ }
+ sum_sq3[0] = sum_sq2[3] + mul3[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
+ }
+ sum_sq = sum_sq3[3];
+
+ UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
+ UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
+ total0 = sum_sq0 * __msa_ldi_w(15);
+ total0 -= sum0_w * sum0_w;
+ total1 = sum_sq1 * __msa_ldi_w(15);
+ total1 -= sum1_w * sum1_w;
+ total2 = sum_sq2 * __msa_ldi_w(15);
+ total2 -= sum2_w * sum2_w;
+ total3 = sum_sq3 * __msa_ldi_w(15);
+ total3 -= sum3_w * sum3_w;
+ total0 = (total0 < flimit_vec);
+ total1 = (total1 < flimit_vec);
+ total2 = (total2 < flimit_vec);
+ total3 = (total3 < flimit_vec);
+ PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+ mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
+ tmp = __msa_bmz_v(tmp, src, (v16u8) mask);
+
+ if (col == 0) {
+ uint64_t src_d;
+
+ src_d = __msa_copy_u_d((v2i64) tmp_orig, 1);
+ SD(src_d, (src_dup - 8));
+ }
+
+ src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
+ src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
+ ST_UB(tmp, (src_dup + (16 * col)));
+ }
+
+ src_dup += pitch;
+ }
+ }
+}
+
+void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
+ int32_t cols, int32_t flimit) {
+ int32_t row, col, cnt, i;
+ unsigned int seed = 1; /* rand_r() needs an initialized seed */
+ const int16_t *rv3 = &vpx_rv[63 & rand_r(&seed)];
+ v4i32 flimit_vec;
+ v16u8 dst7, dst8, dst_r_b, dst_l_b;
+ v16i8 mask;
+ v8u16 add_r, add_l;
+ v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
+ v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
+
+ flimit_vec = __msa_fill_w(flimit);
+
+ for (col = 0; col < (cols >> 4); ++col) {
+ uint8_t *dst_tmp = &dst_ptr[col << 4];
+ v16u8 dst;
+ v16i8 zero = {0};
+ v16u8 tmp[16];
+ v8i16 mult0, mult1, rv2_0, rv2_1;
+ v8i16 sum0_h = {0};
+ v8i16 sum1_h = {0};
+ v4i32 mul0 = {0};
+ v4i32 mul1 = {0};
+ v4i32 mul2 = {0};
+ v4i32 mul3 = {0};
+ v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+ v4i32 add0, add1, add2, add3;
+ const int16_t *rv2[16];
+
+ dst = LD_UB(dst_tmp);
+ for (cnt = (col << 4), i = 0; i < 16; ++cnt, ++i) {
+ rv2[i] = rv3 + ((cnt * 17) & 127);
+ }
+ for (cnt = -8; cnt < 0; ++cnt) {
+ ST_UB(dst, dst_tmp + cnt * pitch);
+ }
+
+ dst = LD_UB((dst_tmp + (rows - 1) * pitch));
+ for (cnt = rows; cnt < rows + 17; ++cnt) {
+ ST_UB(dst, dst_tmp + cnt * pitch);
+ }
+ for (cnt = -8; cnt <= 6; ++cnt) {
+ dst = LD_UB(dst_tmp + (cnt * pitch));
+ UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
+ MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
+ mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0);
+ mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0);
+ mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1);
+ mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1);
+ ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
+ }
+
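+ /* tmp[] is a 16-entry ring buffer: each result row is written back
+ * 8 rows late, matching the d[] delay in the scalar version. */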
+ for (row = 0; row < (rows + 8); ++row) {
+ for (i = 0; i < 8; ++i) {
+ rv2_0[i] = *(rv2[i] + (row & 127));
+ rv2_1[i] = *(rv2[i + 8] + (row & 127));
+ }
+ dst7 = LD_UB(dst_tmp + (7 * pitch));
+ dst8 = LD_UB(dst_tmp - (8 * pitch));
+ ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
+
+ HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
+ UNPCK_SH_SW(sub_r, sub0, sub1);
+ UNPCK_SH_SW(sub_l, sub2, sub3);
+ sum0_h += sub_r;
+ sum1_h += sub_l;
+
+ HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
+
+ ILVRL_H2_SW(zero, add_r, add0, add1);
+ ILVRL_H2_SW(zero, add_l, add2, add3);
+ mul0 += add0 * sub0;
+ mul1 += add1 * sub1;
+ mul2 += add2 * sub2;
+ mul3 += add3 * sub3;
+ dst = LD_UB(dst_tmp);
+ ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
+ dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
+ dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
+ tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7);
+
+ UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
+ UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
+ total0 = mul0 * __msa_ldi_w(15);
+ total0 -= sum0_w * sum0_w;
+ total1 = mul1 * __msa_ldi_w(15);
+ total1 -= sum1_w * sum1_w;
+ total2 = mul2 * __msa_ldi_w(15);
+ total2 -= sum2_w * sum2_w;
+ total3 = mul3 * __msa_ldi_w(15);
+ total3 -= sum3_w * sum3_w;
+ total0 = (total0 < flimit_vec);
+ total1 = (total1 < flimit_vec);
+ total2 = (total2 < flimit_vec);
+ total3 = (total3 < flimit_vec);
+ PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+ mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
+ tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask);
+
+ if (row >= 8) {
+ ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
+ }
+
+ dst_tmp += pitch;
+ }
+ }
+}
diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h
index 91e3615cf..ea59eafe9 100644
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -1060,6 +1060,7 @@
ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
/* Description : Interleave left half of halfword elements from vectors
@@ -1074,6 +1075,7 @@
out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
/* Description : Interleave left half of word elements from vectors
Arguments : Inputs - in0, in1, in2, in3
@@ -1137,6 +1139,7 @@
out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) { \
@@ -1215,6 +1218,7 @@
out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
}
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index c73692a37..bd6d9382e 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -52,8 +52,11 @@ endif # CONFIG_VP9_HIGHBITDEPTH
ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += add_noise.c
+DSP_SRCS-yes += deblock.c
DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c
DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
endif # CONFIG_POSTPROC
DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 66d466a7f..8736e4698 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1894,6 +1894,18 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
specialize qw/vpx_plane_add_noise sse2 msa/;
+
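+ # post-processing deblock filters shared by the vp8 and vp9 code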
+ add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols, int flimit";
+ specialize qw/vpx_mbpost_proc_down sse2 msa/;
+ $vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm;
+
+ add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols, int flimit";
+ specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
+ $vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm;
+
+ add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
+ specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/;
+
}
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/vp8/common/x86/postproc_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm
index 508b5e887..6df360df4 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vpx_dsp/x86/deblock_sse2.asm
@@ -83,7 +83,7 @@
add rbx, 16
%endmacro
-;void vp8_post_proc_down_and_across_mb_row_sse2
+;void vpx_post_proc_down_and_across_mb_row_sse2
;(
; unsigned char *src_ptr,
; unsigned char *dst_ptr,
@@ -93,8 +93,8 @@
; int *flimits,
; int size
;)
-global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
-sym(vp8_post_proc_down_and_across_mb_row_sse2):
+global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vpx_post_proc_down_and_across_mb_row_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -230,11 +230,11 @@ sym(vp8_post_proc_down_and_across_mb_row_sse2):
ret
%undef flimit
-;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
+;void vpx_mbpost_proc_down_xmm(unsigned char *dst,
; int pitch, int rows, int cols,int flimit)
-extern sym(vp8_rv)
-global sym(vp8_mbpost_proc_down_xmm) PRIVATE
-sym(vp8_mbpost_proc_down_xmm):
+extern sym(vpx_rv)
+global sym(vpx_mbpost_proc_down_xmm) PRIVATE
+sym(vpx_mbpost_proc_down_xmm):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -257,7 +257,7 @@ sym(vp8_mbpost_proc_down_xmm):
%define flimit4 [rsp+128]
%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vp8_rv))]
+ lea r8, [GLOBAL(sym(vpx_rv))]
%endif
;rows +=8;
@@ -403,13 +403,13 @@ sym(vp8_mbpost_proc_down_xmm):
and rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
push rax
- lea rax, [GLOBAL(sym(vp8_rv))]
- movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+ lea rax, [GLOBAL(sym(vpx_rv))]
+ movdqu xmm4, [rax + rcx*2] ;vpx_rv[rcx*2]
pop rax
%elif ABI_IS_32BIT=0
- movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+ movdqu xmm4, [r8 + rcx*2] ;vpx_rv[rcx*2]
%else
- movdqu xmm4, [sym(vp8_rv) + rcx*2]
+ movdqu xmm4, [sym(vpx_rv) + rcx*2]
%endif
paddw xmm1, xmm4
@@ -462,10 +462,10 @@ sym(vp8_mbpost_proc_down_xmm):
%undef flimit4
-;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
+;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src,
; int pitch, int rows, int cols,int flimit)
-global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
-sym(vp8_mbpost_proc_across_ip_xmm):
+global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vpx_mbpost_proc_across_ip_xmm):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5