diff options
-rw-r--r-- | test/blockiness_test.cc | 4 | ||||
-rw-r--r-- | test/consistency_test.cc | 4 | ||||
-rw-r--r-- | test/convolve_test.cc | 4 | ||||
-rw-r--r-- | test/cq_test.cc | 4 | ||||
-rw-r--r-- | vp8/common/mips/mmi/idctllm_mmi.c | 43 | ||||
-rw-r--r-- | vp8/common/mips/mmi/loopfilter_filters_mmi.c | 282 | ||||
-rw-r--r-- | vp8/common/mips/mmi/sixtap_filter_mmi.c | 45 | ||||
-rw-r--r-- | vp8/encoder/mips/mmi/dct_mmi.c | 53 | ||||
-rw-r--r-- | vp8/encoder/mips/mmi/vp8_quantize_mmi.c | 27 | ||||
-rw-r--r-- | vpx_dsp/mips/sad_mmi.c | 35 | ||||
-rw-r--r-- | vpx_dsp/mips/variance_mmi.c | 146 |
11 files changed, 422 insertions, 225 deletions
diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc index 75aa2938e..1ad444a04 100644 --- a/test/blockiness_test.cc +++ b/test/blockiness_test.cc @@ -35,14 +35,14 @@ class BlockinessTestBase : public ::testing::Test { public: BlockinessTestBase(int width, int height) : width_(width), height_(height) {} - static void SetUpTestCase() { + static void SetUpTestSuite() { source_data_ = reinterpret_cast<uint8_t *>( vpx_memalign(kDataAlignment, kDataBufferSize)); reference_data_ = reinterpret_cast<uint8_t *>( vpx_memalign(kDataAlignment, kDataBufferSize)); } - static void TearDownTestCase() { + static void TearDownTestSuite() { vpx_free(source_data_); source_data_ = NULL; vpx_free(reference_data_); diff --git a/test/consistency_test.cc b/test/consistency_test.cc index 69ebaf70c..f31fd8c92 100644 --- a/test/consistency_test.cc +++ b/test/consistency_test.cc @@ -39,7 +39,7 @@ class ConsistencyTestBase : public ::testing::Test { public: ConsistencyTestBase(int width, int height) : width_(width), height_(height) {} - static void SetUpTestCase() { + static void SetUpTestSuite() { source_data_[0] = reinterpret_cast<uint8_t *>( vpx_memalign(kDataAlignment, kDataBufferSize)); reference_data_[0] = reinterpret_cast<uint8_t *>( @@ -52,7 +52,7 @@ class ConsistencyTestBase : public ::testing::Test { } static void ClearSsim() { memset(ssim_array_, 0, kDataBufferSize / 16); } - static void TearDownTestCase() { + static void TearDownTestSuite() { vpx_free(source_data_[0]); source_data_[0] = NULL; vpx_free(reference_data_[0]); diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 7330e97db..6eef26f93 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -341,7 +341,7 @@ void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { public: - static void SetUpTestCase() { + static void SetUpTestSuite() { // Force input_ to be unaligned, output to be 16 byte aligned. input_ = reinterpret_cast<uint8_t *>( vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + @@ -363,7 +363,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { virtual void TearDown() { libvpx_test::ClearSystemState(); } - static void TearDownTestCase() { + static void TearDownTestSuite() { vpx_free(input_ - 1); input_ = NULL; vpx_free(output_); diff --git a/test/cq_test.cc b/test/cq_test.cc index 3126f3b4e..292adb0d0 100644 --- a/test/cq_test.cc +++ b/test/cq_test.cc @@ -29,9 +29,9 @@ class CQTest : public ::libvpx_test::EncoderTest, // maps the cqlevel to the bitrate produced. typedef std::map<int, uint32_t> BitrateMap; - static void SetUpTestCase() { bitrates_.clear(); } + static void SetUpTestSuite() { bitrates_.clear(); } - static void TearDownTestCase() { + static void TearDownTestSuite() { ASSERT_TRUE(!HasFailure()) << "skipping bitrate validation due to earlier failure."; uint32_t prev_actual_bitrate = kCQTargetBitrate; diff --git a/vp8/common/mips/mmi/idctllm_mmi.c b/vp8/common/mips/mmi/idctllm_mmi.c index 4fad1d347..a35689dd3 100644 --- a/vp8/common/mips/mmi/idctllm_mmi.c +++ b/vp8/common/mips/mmi/idctllm_mmi.c @@ -41,14 +41,18 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { double ftmp[12]; - uint32_t tmp[0]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL }; + uint64_t tmp[1]; + double ff_ph_04, ff_ph_4e7b, ff_ph_22a3; __asm__ volatile ( + "dli %[tmp0], 0x0004000400040004 \n\t" + "dmtc1 %[tmp0], %[ff_ph_04] \n\t" + "dli %[tmp0], 0x4e7b4e7b4e7b4e7b \n\t" + "dmtc1 %[tmp0], %[ff_ph_4e7b] \n\t" + "dli %[tmp0], 0x22a322a322a322a3 \n\t" + "dmtc1 %[tmp0], %[ff_ph_22a3] \n\t" MMI_LI(%[tmp0], 0x02) - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" @@ -186,9 +190,10 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), - [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr) - : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3), - [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr), + [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04), + [ff_ph_22a3]"=&f"(ff_ph_22a3) + : [ip]"r"(input), [pred_stride]"r"((mips_reg)pred_stride), [dst_stride]"r"((mips_reg)dst_stride) : "memory" @@ -198,12 +203,13 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { - int a1 = ((input_dc + 4) >> 3); - double ftmp[5]; + int a0 = ((input_dc + 4) >> 3); + double a1, ftmp[5]; int low32; __asm__ volatile ( "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[a0], %[a1] \n\t" "pshufh %[a1], %[a1], %[ftmp0] \n\t" "ulw %[low32], 0x00(%[pred_ptr]) \n\t" "mtc1 %[low32], %[ftmp1] \n\t" @@ -244,9 +250,9 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), - [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr) + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1) : [dst_stride]"r"((mips_reg)dst_stride), - [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1) + [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0) : "memory" ); } @@ -254,14 +260,15 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { int i; int16_t output[16]; - double ftmp[12]; - uint32_t tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL }; + double ff_ph_03, ftmp[12]; + uint64_t tmp[1]; __asm__ volatile ( + "dli %[tmp0], 0x0003000300030003 \n\t" + "dmtc1 %[tmp0], %[ff_ph_03] \n\t" MMI_LI(%[tmp0], 0x03) "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" @@ -317,8 +324,8 @@ void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), - [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]) - : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03) + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03) + : [ip]"r"(input), [op]"r"(output) : "memory" ); diff --git a/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/vp8/common/mips/mmi/loopfilter_filters_mmi.c index fc1240cc2..a07a7e3b4 100644 --- a/vp8/common/mips/mmi/loopfilter_filters_mmi.c +++ b/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -13,28 +13,25 @@ #include "vp8/common/onyxc_int.h" #include "vpx_ports/asmdefs_mmi.h" -DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; -DECLARE_ALIGNED(8, static const uint64_t, - ff_ph_003f) = { 0x003f003f003f003fULL }; -DECLARE_ALIGNED(8, static const uint64_t, - ff_ph_0900) = { 0x0900090009000900ULL }; -DECLARE_ALIGNED(8, static const uint64_t, - ff_ph_1200) = { 0x1200120012001200ULL }; -DECLARE_ALIGNED(8, static const uint64_t, - ff_ph_1b00) = { 0x1b001b001b001b00ULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL }; - void vp8_loop_filter_horizontal_edge_mmi( unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh, int count) { - uint32_t tmp[1]; + uint64_t tmp[1]; mips_reg addr[2]; double ftmp[12]; + double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" "1: \n\t" "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t" "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t" @@ -91,9 +88,9 @@ void vp8_loop_filter_horizontal_edge_mmi( "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" - "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" + "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t" @@ -134,8 +131,8 @@ void vp8_loop_filter_horizontal_edge_mmi( "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" @@ -149,8 +146,8 @@ void vp8_loop_filter_horizontal_edge_mmi( "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t" @@ -188,17 +185,18 @@ void vp8_loop_filter_horizontal_edge_mmi( [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_03]"=&f"(ff_pb_03) : [limit]"r"(limit), [blimit]"r"(blimit), [thresh]"r"(thresh), [src_pixel_step]"r"((mips_reg)src_pixel_step), [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), - [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), - [ff_ph_01]"f"(ff_ph_01), [ff_pb_fe]"f"(ff_pb_fe), - [ff_pb_80]"f"(ff_pb_80), [ff_pb_04]"f"(ff_pb_04), - [ff_pb_03]"f"(ff_pb_03) + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)) : "memory" ); + /* clang-format on */ } void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, @@ -206,11 +204,23 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh, int count) { - uint32_t tmp[1]; + uint64_t tmp[1]; mips_reg addr[2]; double ftmp[13]; + double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) @@ -315,8 +325,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, /* abs (p1-q1) */ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp1] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp1] \n\t" "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" @@ -354,8 +364,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" @@ -379,8 +389,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" @@ -450,15 +460,16 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_fe]"=&f"(ff_pb_fe) : [limit]"r"(limit), [blimit]"r"(blimit), [thresh]"r"(thresh), - [src_pixel_step]"r"((mips_reg)src_pixel_step), - [ff_ph_01]"f"(ff_ph_01), [ff_pb_03]"f"(ff_pb_03), - [ff_pb_04]"f"(ff_pb_04), [ff_pb_80]"f"(ff_pb_80), - [ff_pb_fe]"f"(ff_pb_fe) + [src_pixel_step]"r"((mips_reg)src_pixel_step) : "memory" ); + /* clang-format on */ } /* clang-format off */ @@ -484,10 +495,29 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, void vp8_mbloop_filter_horizontal_edge_mmi( unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh, int count) { - uint32_t tmp[1]; + uint64_t tmp[1]; double ftmp[13]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900, + ff_ph_1200, ff_ph_1b00; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0x1200120012001200 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1200] \n\t" + "dli %[tmp0], 0x1b001b001b001b00 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t" MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) "1: \n\t" @@ -550,8 +580,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" @@ -584,8 +614,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" VP8_MBLOOP_HPSRAB "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" @@ -593,8 +623,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi( VP8_MBLOOP_HPSRAB "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" - "li %[tmp0], 0x07 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) @@ -649,18 +679,20 @@ void vp8_mbloop_filter_horizontal_edge_mmi( [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03), + [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00), + [ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f) : [limit]"r"(limit), [blimit]"r"(blimit), [thresh]"r"(thresh), - [src_pixel_step]"r"((mips_reg)src_pixel_step), - [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), - [ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03), - [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00), - [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f) + [src_pixel_step]"r"((mips_reg)src_pixel_step) : "memory" ); + /* clang-format on */ } +/* clang-format off */ #define VP8_MBLOOP_VPSRAB_ADDH \ "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ @@ -673,15 +705,30 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" +/* clang-format on */ void vp8_mbloop_filter_vertical_edge_mmi( unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh, int count) { mips_reg tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, srct[1]); + DECLARE_ALIGNED(8, const uint64_t, srct[2]); double ftmp[14]; + double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) "1: \n\t" @@ -783,8 +830,8 @@ void vp8_mbloop_filter_vertical_edge_mmi( /* abs (p1-q1) / 2 */ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" @@ -824,8 +871,8 @@ void vp8_mbloop_filter_vertical_edge_mmi( "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp12] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" @@ -842,8 +889,8 @@ void vp8_mbloop_filter_vertical_edge_mmi( /* ftmp6: ps0 */ "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" - "li %[tmp0], 0x07 \n\t" - "mtc1 %[tmp0], %[ftmp12] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" VP8_MBLOOP_VPSRAB_ADDH "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" @@ -948,17 +995,19 @@ void vp8_mbloop_filter_vertical_edge_mmi( [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), - [count]"+&r"(count) + [count]"+&r"(count), + [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900), + [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe) : [limit]"r"(limit), [blimit]"r"(blimit), [srct]"r"(srct), [thresh]"r"(thresh), - [src_pixel_step]"r"((mips_reg)src_pixel_step), - [ff_ph_003f]"f"(ff_ph_003f), [ff_ph_0900]"f"(ff_ph_0900), - [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04), - [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe) + [src_pixel_step]"r"((mips_reg)src_pixel_step) : "memory" ); + /* clang-format on */ } +/* clang-format off */ #define VP8_SIMPLE_HPSRAB \ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ @@ -966,23 +1015,38 @@ void vp8_mbloop_filter_vertical_edge_mmi( "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" +/* clang-format on */ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit) { - uint32_t tmp[1], count = 2; + uint64_t tmp[1], count = 2; mips_reg addr[2]; double ftmp[12]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + /* clang-format off */ __asm__ volatile ( - "li %[tmp0], 0x08 \n\t" - "mtc1 %[tmp0], %[ftmp8] \n\t" - "li %[tmp0], 0x03 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" "1: \n\t" "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" @@ -996,7 +1060,7 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" - "and %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) @@ -1020,7 +1084,7 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" - "and %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t" "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" VP8_SIMPLE_HPSRAB @@ -1048,30 +1112,43 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) : [blimit]"r"(blimit), [src_pixel_step]"r"((mips_reg)src_pixel_step), - [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), - [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), - [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)) : "memory" ); + /* clang-format on */ } void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit) { - uint32_t tmp[1], count = 2; + uint64_t tmp[1], count = 2; mips_reg addr[2]; - DECLARE_ALIGNED(8, const uint64_t, srct[1]); - double ftmp[12]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + /* clang-format off */ __asm__ volatile ( - "li %[tmp0], 0x08 \n\t" - "mtc1 %[tmp0], %[ftmp8] \n\t" - "li %[tmp0], 0x20 \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" - + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) @@ -1118,8 +1195,8 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" "pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" @@ -1149,14 +1226,14 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t" "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" - "li %[tmp0], 0x03 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" @@ -1164,14 +1241,14 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" - "li %[tmp0], 0x03 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" "por %[ftmp0], %[ftmp0], %[ftmp5] \n\t" @@ -1235,16 +1312,17 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) : [blimit]"r"(blimit), [srct]"r"(srct), [src_pixel_step]"r"((mips_reg)src_pixel_step), [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), - [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)), - [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), - [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)) : "memory" ); + /* clang-format on */ } /* Horizontal MB filtering */ diff --git a/vp8/common/mips/mmi/sixtap_filter_mmi.c b/vp8/common/mips/mmi/sixtap_filter_mmi.c index dbe35d09f..b85f73fdf 100644 --- a/vp8/common/mips/mmi/sixtap_filter_mmi.c +++ b/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -70,9 +70,8 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, unsigned int output_height, unsigned int output_width, const int16_t *vp8_filter) { - uint32_t tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - + uint64_t tmp[1]; + double ff_ph_40; #if _MIPS_SIM == _ABIO32 register double fzero asm("$f0"); register double ftmp0 asm("$f2"); @@ -103,7 +102,10 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, register double ftmp11 asm("$f12"); #endif // _MIPS_SIM == _ABIO32 + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" @@ -111,10 +113,10 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" "pxor %[fzero], %[fzero], %[fzero] \n\t" - "li %[tmp0], 0x07 \n\t" - "mtc1 %[tmp0], %[ftmp7] \n\t" - "li %[tmp0], 0x08 \n\t" - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "1: \n\t" "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" @@ -166,21 +168,22 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), - [src_ptr]"+&r"(src_ptr) + [src_ptr]"+&r"(src_ptr), [ff_ph_40]"=&f"(ff_ph_40) : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), - [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width), - [ff_ph_40]"f"(ff_ph_40) + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width) : "memory" ); + /* clang-format on */ } /* Horizontal filter: pixel_step is always W */ static INLINE void vp8_filter_block1dc_v6_mmi( uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - uint32_t tmp[1]; + double ff_ph_40; + uint64_t tmp[1]; mips_reg addr[1]; + #if _MIPS_SIM == _ABIO32 register double fzero asm("$f0"); register double ftmp0 asm("$f2"); @@ -215,7 +218,10 @@ static INLINE void vp8_filter_block1dc_v6_mmi( register double ftmp13 asm("$f14"); #endif // _MIPS_SIM == _ABIO32 + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" @@ -223,8 +229,8 @@ static INLINE void vp8_filter_block1dc_v6_mmi( "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" "pxor %[fzero], %[fzero], %[fzero] \n\t" - "li %[tmp0], 0x07 \n\t" - "mtc1 %[tmp0], %[ftmp13] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp13] \n\t" /* In order to make full use of memory load delay slot, * Operation of memory loading and calculating has been rearranged. @@ -285,15 +291,16 @@ static INLINE void vp8_filter_block1dc_v6_mmi( [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), - [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [ff_ph_40]"=&f"(ff_ph_40) : [pixels_per_line]"r"((mips_reg)pixels_per_line), [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), [vp8_filter]"r"(vp8_filter), - [output_pitch]"r"((mips_reg)output_pitch), - [ff_ph_40]"f"(ff_ph_40) + [output_pitch]"r"((mips_reg)output_pitch) : "memory" ); + /* clang-format on */ } /* When xoffset == 0, vp8_filter= {0,0,128,0,0,0}, @@ -313,6 +320,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi( register double ftmp1 asm("$f2"); #endif // _MIPS_SIM == _ABIO32 + /* clang-format off */ __asm__ volatile ( "pxor %[fzero], %[fzero], %[fzero] \n\t" @@ -335,6 +343,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi( [output_width]"r"(output_width) : "memory" ); + /* clang-format on */ } static INLINE void vp8_filter_block1dc_v6_filter0_mmi( @@ -350,6 +359,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi( register double ftmp1 asm("$f2"); #endif // _MIPS_SIM == _ABIO32 + /* clang-format on */ __asm__ volatile ( "pxor %[fzero], %[fzero], %[fzero] \n\t" @@ -371,6 +381,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi( [output_pitch]"r"((mips_reg)output_pitch) : "memory" ); + /* clang-format on */ } #define sixtapNxM(n, m) \ diff --git a/vp8/encoder/mips/mmi/dct_mmi.c b/vp8/encoder/mips/mmi/dct_mmi.c index b5ecf0f1c..0fd25fcda 100644 --- a/vp8/encoder/mips/mmi/dct_mmi.c +++ b/vp8/encoder/mips/mmi/dct_mmi.c @@ -46,6 +46,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { uint64_t tmp[1]; int16_t *ip = input; + double ff_ph_op1, ff_ph_op3; #if _MIPS_SIM == _ABIO32 register double ftmp0 asm("$f0"); @@ -83,13 +84,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x14e808a914e808a9 \n\t" + "dmtc1 %[tmp0], %[ff_ph_op1] \n\t" + "dli %[tmp0], 0xeb1808a9eb1808a9 \n\t" + "dmtc1 %[tmp0], %[ff_ph_op3] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" @@ -129,7 +133,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 MMI_LI(%[tmp0], 0x0c) - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" @@ -169,7 +173,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" MMI_LI(%[tmp0], 0x04) - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" @@ -211,15 +215,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8), [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), - [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip) + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip), + [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3) : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), - [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3), [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) : "memory" ); + /* clang-format on */ } void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { @@ -228,17 +233,22 @@ void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { } void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { - double ftmp[13]; - uint32_t tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL }; + double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask; + uint64_t tmp[1]; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0x0000000100000001 \n\t" + "dmtc1 %[tmp0], %[ff_pw_01] \n\t" + "dli %[tmp0], 0x0000000300000003 \n\t" + "dmtc1 %[tmp0], %[ff_pw_03] \n\t" + "dli %[tmp0], 0x0001000000010000 \n\t" + "dmtc1 %[tmp0], %[ff_pw_mask] \n\t" MMI_LI(%[tmp0], 0x02) "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" @@ -337,7 +347,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t" MMI_LI(%[tmp0], 0x03) - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" @@ -393,7 +403,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" MMI_LI(%[tmp0], 0x72) - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" @@ -413,13 +423,12 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), - [ftmp12]"=&f"(ftmp[12]), - [tmp0]"=&r"(tmp[0]), - [ip]"+&r"(input) - : [op]"r"(output), - [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch), - [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask), - [ff_ph_01]"f"(ff_ph_01) + [ftmp12]"=&f"(ftmp[12]), [ff_pw_mask]"=&f"(ff_pw_mask), + [tmp0]"=&r"(tmp[0]), [ff_pw_01]"=&f"(ff_pw_01), + [ip]"+&r"(input), [ff_pw_03]"=&f"(ff_pw_03), + [ff_ph_01]"=&f"(ff_ph_01) + : [op]"r"(output), [pitch]"r"((mips_reg)pitch) : "memory" ); + /* clang-format on */ } diff --git a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c index 69a9e5e01..1986444aa 100644 --- a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c +++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -42,16 +42,17 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { double ftmp[13]; uint64_t tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL }; - int eob = 0; + int64_t eob = 0; + double ones; __asm__ volatile( // loop 0 ~ 7 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pcmpeqh %[ones], %[ones], %[ones] \n\t" "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t" - "li %[tmp0], 0x0f \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0f \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t" "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t" @@ -165,18 +166,18 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t" "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t" - "li %[tmp0], 0x10 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x10 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t" "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" - "li %[tmp0], 0xaa \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0xaa \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t" "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" - "li %[tmp0], 0xffff \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0xffff \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pand %[ftmp10], %[ftmp10], %[ftmp9] \n\t" "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t" "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t" @@ -184,15 +185,15 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), - [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones) : [coeff_ptr] "r"((mips_reg)coeff_ptr), [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr), [dequant_ptr] "r"((mips_reg)dequant_ptr), [round_ptr] "r"((mips_reg)round_ptr), [quant_ptr] "r"((mips_reg)quant_ptr), [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr), - [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob), - [ones] "f"(ones) + [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob) : "memory"); *d->eob = eob; diff --git a/vpx_dsp/mips/sad_mmi.c b/vpx_dsp/mips/sad_mmi.c index 5dee3164b..eaca4773f 100644 --- a/vpx_dsp/mips/sad_mmi.c +++ b/vpx_dsp/mips/sad_mmi.c @@ -364,6 +364,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -383,6 +384,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -405,7 +407,9 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -424,11 +428,12 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -450,6 +455,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -469,6 +475,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -493,7 +500,9 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -512,11 +521,12 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -539,6 +549,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -558,6 +569,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -586,7 +598,9 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -605,11 +619,12 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -632,6 +647,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" @@ -651,6 +667,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -679,7 +696,9 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" @@ -697,11 +716,12 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, "mfc1 %[sad], %[ftmp3] \n\t" : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -724,6 +744,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" @@ -743,6 +764,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -767,7 +789,9 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" @@ -785,11 +809,12 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, "mfc1 %[sad], %[ftmp3] \n\t" : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } diff --git a/vpx_dsp/mips/variance_mmi.c b/vpx_dsp/mips/variance_mmi.c index 29e52a1a8..c2adcfa01 100644 --- a/vpx_dsp/mips/variance_mmi.c +++ b/vpx_dsp/mips/variance_mmi.c @@ -414,6 +414,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -496,6 +497,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (64 * high)); } @@ -519,6 +521,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -577,6 +580,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, [sse]"r"(sse) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / 2048); } @@ -590,6 +594,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -653,6 +658,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (32 * high)); } @@ -676,6 +682,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -729,6 +736,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (16 * high)); } @@ -753,6 +761,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -801,6 +810,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (8 * high)); } @@ -825,6 +835,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp10] \n\t" @@ -872,6 +883,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (4 * high)); } @@ -894,6 +906,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -925,6 +938,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse) : "memory" ); + /* clang-format on */ return *sse; } @@ -947,6 +961,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -978,6 +993,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse) : "memory" ); + /* clang-format on */ return *sse; } @@ -1021,22 +1037,39 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[15]; + double ff_ph_40, mask; + double filter_x0, filter_x1, filter_y0, filter_y1; mips_reg tmp[2]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + uint64_t x0, x1, y0, y1, all; const uint8_t *filter_x = bilinear_filters[x_offset]; const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp14]) + "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" MMI_LI(%[tmp0], 0x07) MMI_MTC1(%[tmp0], %[ftmp14]) - "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" - "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" - "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" - "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" - + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) // fdata3: fdata3[0] ~ fdata3[15] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A @@ -1072,15 +1105,13 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), - [counter]"+&r"(l_counter) - : [filter_x0] "f"((uint64_t)filter_x[0]), - [filter_x1] "f"((uint64_t)filter_x[1]), - [filter_y0] "f"((uint64_t)filter_y[0]), - [filter_y1] "f"((uint64_t)filter_y[1]), - [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), - [mask] "f"(mask) + [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) : "memory" ); + /* clang-format on */ } #define SUBPIX_VAR16XN(H) \ @@ -1105,19 +1136,38 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, mips_reg l_counter = counter; double ftmp[15]; mips_reg tmp[2]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + double ff_ph_40, mask; + uint64_t x0, x1, y0, y1, all; + double filter_x0, filter_x1, filter_y0, filter_y1; const uint8_t *filter_x = bilinear_filters[x_offset]; const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp14]) + "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" MMI_LI(%[tmp0], 0x07) MMI_MTC1(%[tmp0], %[ftmp14]) - "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" - "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" - "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" - "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) // fdata3: fdata3[0] ~ fdata3[7] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A @@ -1154,15 +1204,13 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), - [counter]"+&r"(l_counter) - : [filter_x0] "f"((uint64_t)filter_x[0]), - [filter_x1] "f"((uint64_t)filter_x[1]), - [filter_y0] "f"((uint64_t)filter_y[0]), - [filter_y1] "f"((uint64_t)filter_y[1]), - [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), - [mask] "f"(mask) + [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) : "memory" ); + /* clang-format on */ } #define SUBPIX_VAR8XN(H) \ @@ -1188,19 +1236,38 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, mips_reg l_counter = counter; double ftmp[7]; mips_reg tmp[2]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + double ff_ph_40, mask; + uint64_t x0, x1, y0, y1, all; + double filter_x0, filter_x1, filter_y0, filter_y1; const uint8_t *filter_x = bilinear_filters[x_offset]; const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp6]) + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" MMI_LI(%[tmp0], 0x07) MMI_MTC1(%[tmp0], %[ftmp6]) - "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" - "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" - "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" - "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) // fdata3: fdata3[0] ~ fdata3[3] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A @@ -1232,15 +1299,14 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), - [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) - : [filter_x0] "f"((uint64_t)filter_x[0]), - [filter_x1] "f"((uint64_t)filter_x[1]), - [filter_y0] "f"((uint64_t)filter_y[0]), - [filter_y1] "f"((uint64_t)filter_y[1]), - [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), - [mask] "f"(mask) + [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter), + [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) : "memory" ); + /* clang-format on */ } #define SUBPIX_VAR4XN(H) \ |