summary refs log tree commit diff
diff options
context:
space:
mode:
author Shiyou Yin <yinshiyou-hf@loongson.cn> 2017-10-12 00:33:17 +0000
committer Gerrit Code Review <noreply-gerritcodereview@google.com> 2017-10-12 00:33:17 +0000
commit bc4098a8e969f1ab3281a801a4ba3b78d788cec3 (patch)
tree c8461ee0c1381fdaa24bbaf73be176fc78682427
parent 72c69e14ad7b5375fed113c025dca70d20fcaf0a (diff)
parent e8ed2bb76258f5d0d2def256b43ed6938b6d6a45 (diff)
download libvpx-bc4098a8e969f1ab3281a801a4ba3b78d788cec3.tar
libvpx-bc4098a8e969f1ab3281a801a4ba3b78d788cec3.tar.gz
libvpx-bc4098a8e969f1ab3281a801a4ba3b78d788cec3.tar.bz2
libvpx-bc4098a8e969f1ab3281a801a4ba3b78d788cec3.zip
Merge "vp8: [loongson] optimize quantize with mmi"
-rw-r--r-- test/quantize_test.cc 8
-rw-r--r-- vp8/common/rtcd_defs.pl 4
-rw-r--r-- vp8/encoder/mips/mmi/vp8_quantize_mmi.c 237
-rw-r--r-- vp8/vp8cx.mk 2
4 files changed, 249 insertions, 2 deletions
diff --git a/test/quantize_test.cc b/test/quantize_test.cc
index 69da8994c..40bb2642e 100644
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -200,4 +200,12 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),
make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));
#endif // HAVE_MSA
+
+// Instantiates QuantizeTest for the Loongson MMI quantizers, pairing each
+// one with its plain C counterpart (same shape as the MSA instantiation).
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(
+ MMI, QuantizeTest,
+ ::testing::Values(
+ make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c),
+ make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c)));
+#endif // HAVE_MMI
} // namespace
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 7a04ef17f..3bcfdc0d6 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -188,10 +188,10 @@ specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
# Quantizer
#
add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/;
add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/;
#
# Block subtraction
diff --git a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
new file mode 100644
index 000000000..22b12bbab
--- /dev/null
+++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/quant_common.h"
+
+/*
+ * Quantizes the coefficient stored at index `rc` and, when the quantized
+ * magnitude is non-zero, records scan position `i` in `eob`.
+ *
+ * The macro expands in the caller's scope and uses these caller locals by
+ * name: x, y, z, sz, eob, b, coeff_ptr, zbin_ptr, zbin_boost_ptr,
+ * zbin_oq_value, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ * dqcoeff_ptr and dequant_ptr.
+ *
+ * Behaviour per expansion:
+ *  - sz is the sign mask of the coefficient (0 or -1; relies on `>>` being an
+ *    arithmetic shift for negative ints) and x its absolute value.
+ *  - zbin_boost_ptr is always advanced by one entry; it is rewound to
+ *    b->zrun_zbin_boost whenever a non-zero value is produced.
+ *  - If x is below zbin + boost + zbin_oq_value nothing is written, so the
+ *    caller must have cleared qcoeff_ptr/dqcoeff_ptr beforehand.
+ *
+ * The body is wrapped in do { } while (0) so that an expansion followed by
+ * ';' is exactly one statement and is safe inside an unbraced if/else.
+ */
+#define REGULAR_SELECT_EOB(i, rc)                                           \
+  do {                                                                      \
+    z = coeff_ptr[rc];                                                      \
+    sz = (z >> 31);                                                         \
+    x = (z ^ sz) - sz;                                                      \
+    if (x >= (zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value)) {        \
+      x += round_ptr[rc];                                                   \
+      y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16;  \
+      x = (y ^ sz) - sz;                                                    \
+      qcoeff_ptr[rc] = x;                                                   \
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];                                \
+      if (y) {                                                              \
+        eob = (i);                                                          \
+        zbin_boost_ptr = b->zrun_zbin_boost;                                \
+      }                                                                     \
+    }                                                                       \
+  } while (0)
+
+/*
+ * Fast quantizer for one block of 16 int16 coefficients, written with
+ * Loongson MMI inline assembly.
+ *
+ * For every coefficient z (processed four 16-bit lanes per register):
+ *   x       = |z|
+ *   y       = high 16 bits of the unsigned product (x + round) * quant_fast
+ *   qcoeff  = y with the sign of z restored
+ *   dqcoeff = low 16 bits of qcoeff * dequant
+ * The value written to *d->eob is the largest vp8_default_inv_zig_zag entry
+ * among the lanes whose y is non-zero, or 0 when every y is zero.
+ *
+ * b supplies coeff, round and quant_fast; d supplies dequant and receives
+ * qcoeff, dqcoeff and eob. All arrays are accessed with unaligned 8-byte
+ * loads/stores at byte offsets 0x00..0x1f.
+ */
+void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+  const int16_t *coeff_ptr = b->coeff;
+  const int16_t *round_ptr = b->round;
+  const int16_t *quant_ptr = b->quant_fast;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  const int16_t *dequant_ptr = d->dequant;
+  const int16_t *inv_zig_zag = vp8_default_inv_zig_zag;
+
+  double ftmp[12];
+  uint64_t tmp[1];
+  DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL };
+  // The asm finishes with a gssdlc1/gssdrc1 pair that writes a whole 64-bit
+  // register through &eob, so the destination has to be 8 bytes wide. A
+  // 4-byte int here would let the store overrun into neighbouring stack.
+  int64_t eob = 0;
+
+  __asm__ volatile(
+      // loop 0 ~ 7
+      // ftmp0 = 0 (compare operand); ftmp9 = 15 (shift count for sign mask).
+      "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+      "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t"
+      "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t"
+      "li %[tmp0], 0x0f \n\t"
+      "mtc1 %[tmp0], %[ftmp9] \n\t"
+      "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t"
+      "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t"
+
+      // ftmp3/ftmp4 = per-lane sign masks; ftmp1/ftmp2 = |coeff|.
+      "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t"
+      "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+      "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+      "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t"
+      "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t"
+      "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+
+      // y = ((|coeff| + round) * quant) >> 16, kept in ftmp5/ftmp6.
+      "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t"
+      "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t"
+      "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t"
+      "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t"
+      "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+      "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+      "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t"
+      "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t"
+      "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t"
+      "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t"
+      "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+      "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+
+      // Restore the sign and store the quantized coefficients.
+      "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t"
+      "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t"
+      "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+      "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+      "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t"
+      "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t"
+      "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t"
+      "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t"
+
+      // Build a (y != 0) mask, keep the inv_zig_zag entry of each non-zero
+      // lane, and fold the eight lanes into a per-lane maximum in ftmp10.
+      "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t"
+      "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t"
+      "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t"
+      "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t"
+      "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+      "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+      "xor %[ftmp5], %[ftmp5], %[ones] \n\t"
+      "xor %[ftmp6], %[ftmp6], %[ones] \n\t"
+      "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+      "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+      "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t"
+
+      // dqcoeff = qcoeff * dequant (low 16 bits of each product).
+      "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t"
+      "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t"
+      "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t"
+      "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t"
+      "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+      "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+      "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t"
+      "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t"
+      "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t"
+      "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t"
+
+      // loop 8 ~ 15
+      // Same sequence for byte offsets 0x10..0x1f; ftmp9 still holds 15 and
+      // the per-lane maximum for this half goes to ftmp11.
+      "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t"
+      "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t"
+      "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t"
+      "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t"
+
+      "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t"
+      "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+      "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+      "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t"
+      "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t"
+      "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+
+      "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t"
+      "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t"
+      "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t"
+      "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t"
+      "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+      "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+      "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t"
+      "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t"
+      "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t"
+      "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t"
+      "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+      "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+
+      "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t"
+      "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t"
+      "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+      "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+      "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t"
+      "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t"
+      "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t"
+      "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t"
+
+      "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t"
+      "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t"
+      "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t"
+      "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t"
+      "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+      "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+      "xor %[ftmp5], %[ftmp5], %[ones] \n\t"
+      "xor %[ftmp6], %[ftmp6], %[ones] \n\t"
+      "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+      "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+      "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+
+      "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t"
+      "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t"
+      "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t"
+      "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t"
+      "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+      "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+      "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t"
+      "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t"
+      "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t"
+      "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t"
+
+      // Horizontal maximum of the four 16-bit lanes: merge both halves,
+      // fold odd lanes onto even lanes (32-bit shift by 16), broadcast lane 2
+      // (pshufh 0xaa) and take the maximum again so lane 0 holds the result.
+      "li %[tmp0], 0x10 \n\t"
+      "mtc1 %[tmp0], %[ftmp9] \n\t"
+
+      "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+      "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
+      "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+      "li %[tmp0], 0xaa \n\t"
+      "mtc1 %[tmp0], %[ftmp9] \n\t"
+      "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
+      "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+      // Mask down to lane 0 and write the 64-bit register to eob.
+      "li %[tmp0], 0xffff \n\t"
+      "mtc1 %[tmp0], %[ftmp9] \n\t"
+      "and %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+      "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t"
+      "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t"
+      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+        [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+        [ftmp11] "=&f"(ftmp[11]), [tmp0] "=&r"(tmp[0])
+      : [coeff_ptr] "r"((mips_reg)coeff_ptr),
+        [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
+        [dequant_ptr] "r"((mips_reg)dequant_ptr),
+        [round_ptr] "r"((mips_reg)round_ptr),
+        [quant_ptr] "r"((mips_reg)quant_ptr),
+        [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
+        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob),
+        [ones] "f"(ones)
+      : "memory");
+
+  // Only the low byte is meaningful to the destination; narrow explicitly,
+  // as vp8_regular_quantize_b_mmi does.
+  *d->eob = (char)eob;
+}
+
+/*
+ * Regular quantizer for one block of 16 coefficients. Despite the _mmi
+ * suffix this function contains no inline assembly: it is a fully unrolled
+ * sequence of REGULAR_SELECT_EOB expansions.
+ *
+ * b supplies coeff, zbin, zrun_zbin_boost, zbin_extra, round, quant and
+ * quant_shift; d supplies dequant and receives qcoeff, dqcoeff and eob.
+ * *d->eob is set to one past the last scan position that produced a
+ * non-zero quantized value (0 when there is none).
+ */
+void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+  // The local names below are fixed: REGULAR_SELECT_EOB refers to them.
+  const int16_t *coeff_ptr = b->coeff;
+  const int16_t *zbin_ptr = b->zbin;
+  const int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+  const int16_t zbin_oq_value = b->zbin_extra;
+  const int16_t *round_ptr = b->round;
+  const int16_t *quant_ptr = b->quant;
+  const int16_t *quant_shift_ptr = b->quant_shift;
+  const int16_t *dequant_ptr = d->dequant;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  int x, y, z, sz;
+  int eob = -1;  // -1 means "no non-zero coefficient seen yet".
+
+  // The macro writes outputs only for coefficients that pass the zero-bin
+  // test, so both 16-entry output arrays are cleared first.
+  memset(qcoeff_ptr, 0, 16 * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, 16 * sizeof(*dqcoeff_ptr));
+
+  // REGULAR_SELECT_EOB(i, rc): i is the scan position, rc the coefficient
+  // index visited at that position.
+  REGULAR_SELECT_EOB(0, 0);
+  REGULAR_SELECT_EOB(1, 1);
+  REGULAR_SELECT_EOB(2, 4);
+  REGULAR_SELECT_EOB(3, 8);
+  REGULAR_SELECT_EOB(4, 5);
+  REGULAR_SELECT_EOB(5, 2);
+  REGULAR_SELECT_EOB(6, 3);
+  REGULAR_SELECT_EOB(7, 6);
+  REGULAR_SELECT_EOB(8, 9);
+  REGULAR_SELECT_EOB(9, 12);
+  REGULAR_SELECT_EOB(10, 13);
+  REGULAR_SELECT_EOB(11, 10);
+  REGULAR_SELECT_EOB(12, 7);
+  REGULAR_SELECT_EOB(13, 11);
+  REGULAR_SELECT_EOB(14, 14);
+  REGULAR_SELECT_EOB(15, 15);
+
+  // Convert "last non-zero position" into a count.
+  *d->eob = (char)(eob + 1);
+}
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 9111a2257..23d65d416 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -110,6 +110,8 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
+VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c
+
ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c
endif