diff options
28 files changed, 283 insertions, 2364 deletions
@@ -50,6 +50,9 @@ CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS)) include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS)) +include $(SRC_PATH_BARE)/vpx_thread/vpx_thread.mk +CODEC_SRCS-yes += $(addprefix vpx_thread/,$(call enabled,THREAD_SRCS)) + ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),) VP8_PREFIX=vp8/ include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk diff --git a/test/variance_test.cc b/test/variance_test.cc index 614c4d999..220170048 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -2058,84 +2058,5 @@ INSTANTIATE_TEST_CASE_P( make_tuple(3, 2, variance8x4_msa, 0), make_tuple(2, 3, variance4x8_msa, 0), make_tuple(2, 2, variance4x4_msa, 0))); - -#if CONFIG_VP9_ENCODER -const SubpixVarMxNFunc subpel_variance4x4_msa = vp9_sub_pixel_variance4x4_msa; -const SubpixVarMxNFunc subpel_variance4x8_msa = vp9_sub_pixel_variance4x8_msa; -const SubpixVarMxNFunc subpel_variance8x4_msa = vp9_sub_pixel_variance8x4_msa; -const SubpixVarMxNFunc subpel_variance8x8_msa = vp9_sub_pixel_variance8x8_msa; -const SubpixVarMxNFunc subpel_variance8x16_msa = vp9_sub_pixel_variance8x16_msa; -const SubpixVarMxNFunc subpel_variance16x8_msa = vp9_sub_pixel_variance16x8_msa; -const SubpixVarMxNFunc subpel_variance16x16_msa = - vp9_sub_pixel_variance16x16_msa; -const SubpixVarMxNFunc subpel_variance16x32_msa = - vp9_sub_pixel_variance16x32_msa; -const SubpixVarMxNFunc subpel_variance32x16_msa = - vp9_sub_pixel_variance32x16_msa; -const SubpixVarMxNFunc subpel_variance32x32_msa = - vp9_sub_pixel_variance32x32_msa; -const SubpixVarMxNFunc subpel_variance32x64_msa = - vp9_sub_pixel_variance32x64_msa; -const SubpixVarMxNFunc subpel_variance64x32_msa = - vp9_sub_pixel_variance64x32_msa; -const SubpixVarMxNFunc subpel_variance64x64_msa = - vp9_sub_pixel_variance64x64_msa; -INSTANTIATE_TEST_CASE_P( - MSA, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_variance4x4_msa, 0), - make_tuple(2, 3, subpel_variance4x8_msa, 0), - make_tuple(3, 2, subpel_variance8x4_msa, 0), - make_tuple(3, 3, subpel_variance8x8_msa, 0), - make_tuple(3, 4, subpel_variance8x16_msa, 0), - make_tuple(4, 3, subpel_variance16x8_msa, 0), - make_tuple(4, 4, subpel_variance16x16_msa, 0), - make_tuple(4, 5, subpel_variance16x32_msa, 0), - make_tuple(5, 4, subpel_variance32x16_msa, 0), - make_tuple(5, 5, subpel_variance32x32_msa, 0), - make_tuple(5, 6, subpel_variance32x64_msa, 0), - make_tuple(6, 5, subpel_variance64x32_msa, 0), - make_tuple(6, 6, subpel_variance64x64_msa, 0))); -const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_msa = - vp9_sub_pixel_avg_variance4x4_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_msa = - vp9_sub_pixel_avg_variance4x8_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_msa = - vp9_sub_pixel_avg_variance8x4_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_msa = - vp9_sub_pixel_avg_variance8x8_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_msa = - vp9_sub_pixel_avg_variance8x16_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_msa = - vp9_sub_pixel_avg_variance16x8_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_msa = - vp9_sub_pixel_avg_variance16x16_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_msa = - vp9_sub_pixel_avg_variance16x32_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_msa = - vp9_sub_pixel_avg_variance32x16_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_msa = - vp9_sub_pixel_avg_variance32x32_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_msa = - vp9_sub_pixel_avg_variance32x64_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_msa = - vp9_sub_pixel_avg_variance64x32_msa; -const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_msa = - vp9_sub_pixel_avg_variance64x64_msa; -INSTANTIATE_TEST_CASE_P( - MSA, VP9SubpelAvgVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_msa, 0), - make_tuple(2, 3, subpel_avg_variance4x8_msa, 0), - make_tuple(3, 2, subpel_avg_variance8x4_msa, 0), - make_tuple(3, 3, subpel_avg_variance8x8_msa, 0), - make_tuple(3, 4, subpel_avg_variance8x16_msa, 0), - make_tuple(4, 3, subpel_avg_variance16x8_msa, 0), - make_tuple(4, 4, subpel_avg_variance16x16_msa, 0), - make_tuple(4, 5, subpel_avg_variance16x32_msa, 0), - make_tuple(5, 4, subpel_avg_variance32x16_msa, 0), - make_tuple(5, 5, subpel_avg_variance32x32_msa, 0), - make_tuple(5, 6, subpel_avg_variance32x64_msa, 0), - make_tuple(6, 5, subpel_avg_variance64x32_msa, 0), - make_tuple(6, 6, subpel_avg_variance64x64_msa, 0))); -#endif // CONFIG_VP9_ENCODER #endif // HAVE_MSA } // namespace diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 902a6fc35..bde00ea04 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -18,33 +18,33 @@ #if CONFIG_WEBM_IO #include "test/webm_video_source.h" #endif -#include "vp9/common/vp9_thread.h" +#include "vpx_thread/vpx_thread.h" namespace { using std::string; -class VP9WorkerThreadTest : public ::testing::TestWithParam<bool> { +class VPxWorkerThreadTest : public ::testing::TestWithParam<bool> { protected: - virtual ~VP9WorkerThreadTest() {} + virtual ~VPxWorkerThreadTest() {} virtual void SetUp() { - vp9_get_worker_interface()->init(&worker_); + vpx_get_worker_interface()->init(&worker_); } virtual void TearDown() { - vp9_get_worker_interface()->end(&worker_); + vpx_get_worker_interface()->end(&worker_); } - void Run(VP9Worker* worker) { + void Run(VPxWorker* worker) { const bool synchronous = GetParam(); if (synchronous) { - vp9_get_worker_interface()->execute(worker); + vpx_get_worker_interface()->execute(worker); } else { - vp9_get_worker_interface()->launch(worker); + vpx_get_worker_interface()->launch(worker); } } - VP9Worker worker_; + VPxWorker worker_; }; int ThreadHook(void* data, void* return_value) { @@ -53,12 +53,12 @@ int ThreadHook(void* data, void* return_value) { return *reinterpret_cast<int*>(return_value); } -TEST_P(VP9WorkerThreadTest, HookSuccess) { +TEST_P(VPxWorkerThreadTest, HookSuccess) { // should be a no-op. - EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0); + EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0); for (int i = 0; i < 2; ++i) { - EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0); + EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0); int hook_data = 0; int return_value = 1; // return successfully from the hook @@ -67,17 +67,17 @@ TEST_P(VP9WorkerThreadTest, HookSuccess) { worker_.data2 = &return_value; Run(&worker_); - EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0); + EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0); EXPECT_FALSE(worker_.had_error); EXPECT_EQ(5, hook_data); // should be a no-op. - EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0); + EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0); } } -TEST_P(VP9WorkerThreadTest, HookFailure) { - EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0); +TEST_P(VPxWorkerThreadTest, HookFailure) { + EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0); int hook_data = 0; int return_value = 0; // return failure from the hook @@ -86,29 +86,29 @@ TEST_P(VP9WorkerThreadTest, HookFailure) { worker_.data2 = &return_value; Run(&worker_); - EXPECT_FALSE(vp9_get_worker_interface()->sync(&worker_)); + EXPECT_FALSE(vpx_get_worker_interface()->sync(&worker_)); EXPECT_EQ(1, worker_.had_error); // Ensure _reset() clears the error and _launch() can be called again. return_value = 1; - EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0); + EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0); EXPECT_FALSE(worker_.had_error); - vp9_get_worker_interface()->launch(&worker_); - EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0); + vpx_get_worker_interface()->launch(&worker_); + EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0); EXPECT_FALSE(worker_.had_error); } -TEST_P(VP9WorkerThreadTest, EndWithoutSync) { +TEST_P(VPxWorkerThreadTest, EndWithoutSync) { // Create a large number of threads to increase the chances of detecting a // race. Doing more work in the hook is no guarantee as any race would occur // post hook execution in the main thread loop driver. static const int kNumWorkers = 64; - VP9Worker workers[kNumWorkers]; + VPxWorker workers[kNumWorkers]; int hook_data[kNumWorkers]; int return_value[kNumWorkers]; for (int n = 0; n < kNumWorkers; ++n) { - vp9_get_worker_interface()->init(&workers[n]); + vpx_get_worker_interface()->init(&workers[n]); return_value[n] = 1; // return successfully from the hook workers[n].hook = ThreadHook; workers[n].data1 = &hook_data[n]; @@ -117,7 +117,7 @@ TEST_P(VP9WorkerThreadTest, EndWithoutSync) { for (int i = 0; i < 2; ++i) { for (int n = 0; n < kNumWorkers; ++n) { - EXPECT_NE(vp9_get_worker_interface()->reset(&workers[n]), 0); + EXPECT_NE(vpx_get_worker_interface()->reset(&workers[n]), 0); hook_data[n] = 0; } @@ -126,16 +126,16 @@ TEST_P(VP9WorkerThreadTest, EndWithoutSync) { } for (int n = kNumWorkers - 1; n >= 0; --n) { - vp9_get_worker_interface()->end(&workers[n]); + vpx_get_worker_interface()->end(&workers[n]); } } } -TEST(VP9WorkerThreadTest, TestInterfaceAPI) { - EXPECT_EQ(0, vp9_set_worker_interface(NULL)); - EXPECT_TRUE(vp9_get_worker_interface() != NULL); +TEST(VPxWorkerThreadTest, TestInterfaceAPI) { + EXPECT_EQ(0, vpx_set_worker_interface(NULL)); + EXPECT_TRUE(vpx_get_worker_interface() != NULL); for (int i = 0; i < 6; ++i) { - VP9WorkerInterface winterface = *vp9_get_worker_interface(); + VPxWorkerInterface winterface = *vpx_get_worker_interface(); switch (i) { default: case 0: winterface.init = NULL; break; @@ -145,7 +145,7 @@ TEST(VP9WorkerThreadTest, TestInterfaceAPI) { case 4: winterface.execute = NULL; break; case 5: winterface.end = NULL; break; } - EXPECT_EQ(0, vp9_set_worker_interface(&winterface)); + EXPECT_EQ(0, vpx_set_worker_interface(&winterface)); } } @@ -202,21 +202,21 @@ void DecodeFiles(const FileList files[]) { // hang. namespace impl { -void Init(VP9Worker *const worker) { memset(worker, 0, sizeof(*worker)); } -int Reset(VP9Worker *const /*worker*/) { return 1; } -int Sync(VP9Worker *const worker) { return !worker->had_error; } +void Init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); } +int Reset(VPxWorker *const /*worker*/) { return 1; } +int Sync(VPxWorker *const worker) { return !worker->had_error; } -void Execute(VP9Worker *const worker) { +void Execute(VPxWorker *const worker) { worker->had_error |= !worker->hook(worker->data1, worker->data2); } -void Launch(VP9Worker *const worker) { Execute(worker); } -void End(VP9Worker *const /*worker*/) {} +void Launch(VPxWorker *const worker) { Execute(worker); } +void End(VPxWorker *const /*worker*/) {} } // namespace impl -TEST(VP9WorkerThreadTest, TestSerialInterface) { - static const VP9WorkerInterface serial_interface = { +TEST(VPxWorkerThreadTest, TestSerialInterface) { + static const VPxWorkerInterface serial_interface = { impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End }; // TODO(jzern): Avoid using a file that will use the row-based thread @@ -225,13 +225,13 @@ TEST(VP9WorkerThreadTest, TestSerialInterface) { // progress in the row above before proceeding. static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc"; static const char filename[] = "vp90-2-03-size-226x226.webm"; - VP9WorkerInterface default_interface = *vp9_get_worker_interface(); + VPxWorkerInterface default_interface = *vpx_get_worker_interface(); - EXPECT_NE(vp9_set_worker_interface(&serial_interface), 0); + EXPECT_NE(vpx_set_worker_interface(&serial_interface), 0); EXPECT_EQ(expected_md5, DecodeFile(filename, 2)); // Reset the interface. - EXPECT_NE(vp9_set_worker_interface(&default_interface), 0); + EXPECT_NE(vpx_set_worker_interface(&default_interface), 0); EXPECT_EQ(expected_md5, DecodeFile(filename, 2)); } @@ -309,6 +309,6 @@ TEST(VP9DecodeMultiThreadedTest, Decode3) { } #endif // CONFIG_WEBM_IO -INSTANTIATE_TEST_CASE_P(Synchronous, VP9WorkerThreadTest, ::testing::Bool()); +INSTANTIATE_TEST_CASE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool()); } // namespace diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 1811d76df..bacbd8a22 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" +#include "vpx_thread/vpx_thread.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" @@ -21,7 +22,6 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_frame_buffers.h" #include "vp9/common/vp9_quant_common.h" -#include "vp9/common/vp9_thread.h" #include "vp9/common/vp9_tile_common.h" #if CONFIG_VP9_POSTPROC @@ -80,7 +80,7 @@ typedef struct { // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means // that no FrameWorker owns, or is decoding, this buffer. - VP9Worker *frame_worker_owner; + VPxWorker *frame_worker_owner; // row and col indicate which position frame has been decoded to in real // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index 0aac4a9e6..1f1632573 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -9,8 +9,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <limits.h> - #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" @@ -339,43 +337,3 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; } -// Returns a context number for the given MB prediction signal -// The mode info data structure has a one element border above and to the -// left of the entries corresponding to real blocks. -// The prediction flags in these dummy entries are initialized to 0. -int vp9_get_tx_size_context(const MACROBLOCKD *xd) { - const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type]; - const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; - const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; - const int has_above = xd->up_available; - const int has_left = xd->left_available; - int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size - : max_tx_size; - int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size - : max_tx_size; - if (!has_left) - left_ctx = above_ctx; - - if (!has_above) - above_ctx = left_ctx; - - return (above_ctx + left_ctx) > max_tx_size; -} - -int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = num_8x8_blocks_wide_lookup[bsize]; - const int bh = num_8x8_blocks_high_lookup[bsize]; - const int xmis = MIN(cm->mi_cols - mi_col, bw); - const int ymis = MIN(cm->mi_rows - mi_row, bh); - int x, y, segment_id = INT_MAX; - - for (y = 0; y < ymis; y++) - for (x = 0; x < xmis; x++) - segment_id = MIN(segment_id, - segment_ids[mi_offset + y * cm->mi_cols + x]); - - assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); - return segment_id; -} diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index bc19d28b9..76161444b 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -18,8 +18,24 @@ extern "C" { #endif -int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, - BLOCK_SIZE bsize, int mi_row, int mi_col); +static INLINE int get_segment_id(const VP9_COMMON *cm, + const uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = MIN(cm->mi_cols - mi_col, bw); + const int ymis = MIN(cm->mi_rows - mi_row, bh); + int x, y, segment_id = MAX_SEGMENTS; + + for (y = 0; y < ymis; ++y) + for (x = 0; x < xmis; ++x) + segment_id = MIN(segment_id, + segment_ids[mi_offset + y * cm->mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + return segment_id; +} static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { const MODE_INFO *const above_mi = xd->above_mi; @@ -88,7 +104,28 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; } -int vp9_get_tx_size_context(const MACROBLOCKD *xd); +// Returns a context number for the given MB prediction signal +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real blocks. +// The prediction flags in these dummy entries are initialized to 0. +static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { + const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type]; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size + : max_tx_size; + int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size + : max_tx_size; + if (!has_left) + left_ctx = above_ctx; + + if (!has_above) + above_ctx = left_ctx; + + return (above_ctx + left_ctx) > max_tx_size; +} static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, const struct tx_probs *tx_probs) { @@ -108,7 +145,7 @@ static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd, const struct tx_probs *tx_probs) { - return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs); + return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs); } static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index af2307eb0..d8c7e1447 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -805,84 +805,84 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { # variance add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance32x32 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x16 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance8x8 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance8x8 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance4x8/, "$sse_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; #vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p"; specialize qw/vp9_avg_8x8 sse2 neon msa/; diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index cba57ff41..6b11c93cc 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -157,9 +157,9 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only, - VP9Worker *workers, int nworkers, + VPxWorker *workers, int nworkers, VP9LfSync *lf_sync) { - const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; // Decoder may allocate more threads than number of tiles based on user's @@ -186,10 +186,10 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, // because of contention. If the multithreading code changes in the future // then the number of workers used by the loopfilter should be revisited. for (i = 0; i < num_workers; ++i) { - VP9Worker *const worker = &workers[i]; + VPxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - worker->hook = (VP9WorkerHook)loop_filter_row_worker; + worker->hook = (VPxWorkerHook)loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; @@ -218,7 +218,7 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct macroblockd_plane planes[MAX_MB_PLANE], int frame_filter_level, int y_only, int partial_frame, - VP9Worker *workers, int num_workers, + VPxWorker *workers, int num_workers, VP9LfSync *lf_sync) { int start_mi_row, end_mi_row, mi_rows_to_filter; diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h index 3b3a6996a..94e6dfe7f 100644 --- a/vp9/common/vp9_thread_common.h +++ b/vp9/common/vp9_thread_common.h @@ -12,7 +12,7 @@ #define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ #include "./vpx_config.h" #include "vp9/common/vp9_loopfilter.h" -#include "vp9/common/vp9_thread.h" +#include "vpx_thread/vpx_thread.h" struct VP9Common; struct FRAME_COUNTS; @@ -48,7 +48,7 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct macroblockd_plane planes[MAX_MB_PLANE], int frame_filter_level, int y_only, int partial_frame, - VP9Worker *workers, int num_workers, + VPxWorker *workers, int num_workers, VP9LfSync *lf_sync); void vp9_accumulate_frame_counts(struct VP9Common *cm, diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 4fc10b4e7..a7e9fdaa6 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -18,6 +18,7 @@ #include "vpx_ports/mem.h" #include "vpx_ports/mem_ops.h" #include "vpx_scale/vpx_scale.h" +#include "vpx_thread/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_common.h" @@ -30,7 +31,6 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_thread.h" #include "vp9/common/vp9_tile_common.h" #include "vp9/decoder/vp9_decodeframe.h" @@ -1259,7 +1259,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end) { VP9_COMMON *const cm = &pbi->common; - const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; @@ -1272,7 +1272,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, pbi->lf_worker.data1 == NULL) { CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); - pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; + pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Loop filter thread creation failed"); @@ -1434,7 +1434,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end) { VP9_COMMON *const cm = &pbi->common; - const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); const uint8_t *bit_reader_end = NULL; const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int tile_cols = 1 << cm->log2_tile_cols; @@ -1464,7 +1464,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, CHECK_MEM_ERROR(cm, pbi->tile_worker_info, vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info))); for (i = 0; i < num_threads; ++i) { - VP9Worker *const worker = &pbi->tile_workers[i]; + VPxWorker *const worker = &pbi->tile_workers[i]; ++pbi->num_tile_workers; winterface->init(worker); @@ -1477,9 +1477,9 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { - VP9Worker *const worker = &pbi->tile_workers[n]; + VPxWorker *const worker = &pbi->tile_workers[n]; winterface->sync(worker); - worker->hook = (VP9WorkerHook)tile_worker_hook; + worker->hook = (VPxWorkerHook)tile_worker_hook; worker->data1 = &pbi->tile_worker_data[n]; worker->data2 = &pbi->tile_worker_info[n]; } @@ -1529,7 +1529,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, while (n < tile_cols) { int i; for (i = 0; i < num_workers && n < tile_cols; ++i) { - VP9Worker *const worker = &pbi->tile_workers[i]; + VPxWorker *const worker = &pbi->tile_workers[i]; TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; TileInfo *const tile = (TileInfo*)worker->data2; TileBuffer *const buf = &tile_buffers[0][n]; @@ -1561,7 +1561,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, } for (; i > 0; --i) { - VP9Worker *const worker = &pbi->tile_workers[i - 1]; + VPxWorker *const worker = &pbi->tile_workers[i - 1]; // TODO(jzern): The tile may have specific error data associated with // its vpx_internal_error_info which could be propagated to the main info // in cm. Additionally once the threads have been synced and an error is @@ -2020,7 +2020,7 @@ void vp9_decode_frame(VP9Decoder *pbi, // If encoded in frame parallel mode, frame context is ready after decoding // the frame header. if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) { - VP9Worker *const worker = pbi->frame_worker_owner; + VPxWorker *const worker = pbi->frame_worker_owner; FrameWorkerData *const frame_worker_data = worker->data1; if (cm->refresh_frame_context) { context_updated = 1; diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index cd20c84cf..d587c8fa3 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -66,7 +66,7 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_SIZE max_tx_size, vp9_reader *r) { FRAME_COUNTS *counts = xd->counts; - const int ctx = vp9_get_tx_size_context(xd); + const int ctx = get_tx_size_context(xd); const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs); int tx_size = vp9_read(r, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { @@ -155,7 +155,7 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, return 0; // Default for disabled segmentation predicted_segment_id = cm->last_frame_seg_map ? - vp9_get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0; + get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0; if (!seg->update_map) { copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map, diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 7991a39e6..cbb3266d3 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -20,6 +20,7 @@ #include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #include "vpx_scale/vpx_scale.h" +#include "vpx_thread/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" @@ -30,7 +31,6 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_systemdependent.h" -#include "vp9/common/vp9_thread.h" #include "vp9/decoder/vp9_decodeframe.h" #include "vp9/decoder/vp9_decoder.h" @@ -118,7 +118,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { cm->error.setjmp = 0; - vp9_get_worker_interface()->init(&pbi->lf_worker); + vpx_get_worker_interface()->init(&pbi->lf_worker); return pbi; } @@ -126,12 +126,12 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { void vp9_decoder_remove(VP9Decoder *pbi) { int i; - vp9_get_worker_interface()->end(&pbi->lf_worker); + vpx_get_worker_interface()->end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); vpx_free(pbi->tile_data); for (i = 0; i < pbi->num_tile_workers; ++i) { - VP9Worker *const worker = &pbi->tile_workers[i]; - vp9_get_worker_interface()->end(worker); + VPxWorker *const worker = &pbi->tile_workers[i]; + vpx_get_worker_interface()->end(worker); } vpx_free(pbi->tile_worker_data); vpx_free(pbi->tile_worker_info); @@ -311,7 +311,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, pbi->hold_ref_buf = 0; if (pbi->frame_parallel_decode) { - VP9Worker *const worker = pbi->frame_worker_owner; + VPxWorker *const worker = pbi->frame_worker_owner; vp9_frameworker_lock_stats(worker); frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; // Reset decoding progress. @@ -325,7 +325,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, if (setjmp(cm->error.jmp)) { - const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); int i; cm->error.setjmp = 0; @@ -387,7 +387,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, if (pbi->frame_parallel_decode) { // Need to lock the mutex here as another thread may // be accessing this buffer. - VP9Worker *const worker = pbi->frame_worker_owner; + VPxWorker *const worker = pbi->frame_worker_owner; FrameWorkerData *const frame_worker_data = worker->data1; vp9_frameworker_lock_stats(worker); diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index c19f0ac3b..50c03c18e 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -15,10 +15,11 @@ #include "vpx/vpx_codec.h" #include "vpx_scale/yv12config.h" +#include "vpx_thread/vpx_thread.h" + #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" -#include "vp9/common/vp9_thread.h" #include "vp9/decoder/vp9_dthread.h" #include "vp9/decoder/vp9_reader.h" @@ -56,9 +57,9 @@ typedef struct VP9Decoder { // the same. RefCntBuffer *cur_buf; // Current decoding frame buffer. - VP9Worker *frame_worker_owner; // frame_worker that owns this pbi. - VP9Worker lf_worker; - VP9Worker *tile_workers; + VPxWorker *frame_worker_owner; // frame_worker that owns this pbi. + VPxWorker lf_worker; + VPxWorker *tile_workers; TileWorkerData *tile_worker_data; TileInfo *tile_worker_info; int num_tile_workers; diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c index c22617edb..b33c3b718 100644 --- a/vp9/decoder/vp9_dsubexp.c +++ b/vp9/decoder/vp9_dsubexp.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> + #include "vp9/common/vp9_entropy.h" #include "vp9/decoder/vp9_dsubexp.h" @@ -16,7 +18,7 @@ static int inv_recenter_nonneg(int v, int m) { if (v > 2 * m) return v; - return v % 2 ? m - (v + 1) / 2 : m + v / 2; + return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1); } static int decode_uniform(vp9_reader *r) { @@ -27,34 +29,32 @@ static int decode_uniform(vp9_reader *r) { } static int inv_remap_prob(int v, int m) { - static int inv_map_table[MAX_PROB - 1] = { - 6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188, - 201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, - 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, - 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, - 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, - 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, - 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, - 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, - 141, 142, 143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156, - 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, - 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, - 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205, - 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221, - 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, - 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252 + static int inv_map_table[MAX_PROB] = { + 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189, + 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, + 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, + 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, + 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, + 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, + 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, + 174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, + 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222, + 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, + 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 253 }; - // The clamp is not necessary for conforming VP9 stream, it is added to - // prevent out of bound access for bad input data - v = clamp(v, 0, 253); + assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0]))); v = inv_map_table[v]; m--; if ((m << 1) <= MAX_PROB) { - return 1 + inv_recenter_nonneg(v + 1, m); + return 1 + inv_recenter_nonneg(v, m); } else { - return MAX_PROB - inv_recenter_nonneg(v + 1, MAX_PROB - 1 - m); + return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m); } } diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index 96a63bd9e..14a71448f 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -17,7 +17,7 @@ // #define DEBUG_THREAD // TODO(hkuang): Clean up all the #ifdef in this file. -void vp9_frameworker_lock_stats(VP9Worker *const worker) { +void vp9_frameworker_lock_stats(VPxWorker *const worker) { #if CONFIG_MULTITHREAD FrameWorkerData *const worker_data = worker->data1; pthread_mutex_lock(&worker_data->stats_mutex); @@ -26,7 +26,7 @@ void vp9_frameworker_lock_stats(VP9Worker *const worker) { #endif } -void vp9_frameworker_unlock_stats(VP9Worker *const worker) { +void vp9_frameworker_unlock_stats(VPxWorker *const worker) { #if CONFIG_MULTITHREAD FrameWorkerData *const worker_data = worker->data1; pthread_mutex_unlock(&worker_data->stats_mutex); @@ -35,7 +35,7 @@ void vp9_frameworker_unlock_stats(VP9Worker *const worker) { #endif } -void vp9_frameworker_signal_stats(VP9Worker *const worker) { +void vp9_frameworker_signal_stats(VPxWorker *const worker) { #if CONFIG_MULTITHREAD FrameWorkerData *const worker_data = worker->data1; @@ -59,7 +59,7 @@ void vp9_frameworker_signal_stats(VP9Worker *const worker) { #endif // TODO(hkuang): Remove worker parameter as it is only used in debug code. -void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf, +void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, int row) { #if CONFIG_MULTITHREAD if (!ref_buf) @@ -74,7 +74,7 @@ void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf, { // Find the worker thread that owns the reference frame. If the reference // frame has been fully decoded, it may not have owner. - VP9Worker *const ref_worker = ref_buf->frame_worker_owner; + VPxWorker *const ref_worker = ref_buf->frame_worker_owner; FrameWorkerData *const ref_worker_data = (FrameWorkerData *)ref_worker->data1; const VP9Decoder *const pbi = ref_worker_data->pbi; @@ -114,7 +114,7 @@ void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf, void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) { #if CONFIG_MULTITHREAD - VP9Worker *worker = buf->frame_worker_owner; + VPxWorker *worker = buf->frame_worker_owner; #ifdef DEBUG_THREAD { @@ -134,8 +134,8 @@ void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) { #endif // CONFIG_MULTITHREAD } -void vp9_frameworker_copy_context(VP9Worker *const dst_worker, - VP9Worker *const src_worker) { +void vp9_frameworker_copy_context(VPxWorker *const dst_worker, + VPxWorker *const src_worker) { #if CONFIG_MULTITHREAD FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1; FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1; diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h index 979cb3d8b..862880167 100644 --- a/vp9/decoder/vp9_dthread.h +++ b/vp9/decoder/vp9_dthread.h @@ -12,7 +12,7 @@ #define VP9_DECODER_VP9_DTHREAD_H_ #include "./vpx_config.h" -#include "vp9/common/vp9_thread.h" +#include "vpx_thread/vpx_thread.h" #include "vpx/internal/vpx_codec_internal.h" struct VP9Common; @@ -44,15 +44,15 @@ typedef struct FrameWorkerData { int frame_decoded; // Finished decoding current frame. } FrameWorkerData; -void vp9_frameworker_lock_stats(VP9Worker *const worker); -void vp9_frameworker_unlock_stats(VP9Worker *const worker); -void vp9_frameworker_signal_stats(VP9Worker *const worker); +void vp9_frameworker_lock_stats(VPxWorker *const worker); +void vp9_frameworker_unlock_stats(VPxWorker *const worker); +void vp9_frameworker_signal_stats(VPxWorker *const worker); // Wait until ref_buf has been decoded to row in real pixel unit. // Note: worker may already finish decoding ref_buf and release it in order to // start decoding next frame. So need to check whether worker is still decoding // ref_buf. -void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf, +void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, int row); // FrameWorker broadcasts its decoding progress so other workers that are @@ -60,7 +60,7 @@ void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf, void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row); // Copy necessary decoding context from src worker to dst worker. -void vp9_frameworker_copy_context(VP9Worker *const dst_worker, - VP9Worker *const src_worker); +void vp9_frameworker_copy_context(VPxWorker *const dst_worker, + VPxWorker *const src_worker); #endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/vp9/encoder/mips/msa/vp9_variance_msa.c b/vp9/encoder/mips/msa/vp9_variance_msa.c deleted file mode 100644 index 33fb4966b..000000000 --- a/vp9/encoder/mips/msa/vp9_variance_msa.c +++ /dev/null @@ -1,2011 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp9_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" -#include "vp9/common/vp9_filter.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" - -DECLARE_ALIGNED(256, static const int8_t, vp9_bilinear_filters_msa[15][2]) = { - { 120, 8 }, - { 112, 16 }, - { 104, 24 }, - { 96, 32 }, - { 88, 40 }, - { 80, 48 }, - { 72, 56 }, - { 64, 64 }, - { 56, 72 }, - { 48, 80 }, - { 40, 88 }, - { 32, 96 }, - { 24, 104 }, - { 16, 112 }, - { 8, 120 } -}; - -#define CALC_MSE_AVG_B(src, ref, var, sub) { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - \ - sub += res_l0_m + res_l1_m; \ -} - -#define VARIANCE_WxH(sse, diff, shift) \ - sse - (((uint32_t)diff * diff) >> shift) - -#define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) - -static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, - int32_t *diff) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - uint32_t ref0, ref1, ref2, ref3; - v16u8 pred, src = { 0 }; - v16u8 ref = { 0 }; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src, ref, pred; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1, pred0, pred1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1, pred0, pred1; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v8i16 avg2 = { 0 }; - v8i16 avg3 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 32; ht_cnt--;) { - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src2, ref2, var, avg2); - CALC_MSE_AVG_B(src3, ref3, var, avg3); - - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src2, ref2, var, avg2); - CALC_MSE_AVG_B(src3, ref3, var, avg3); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - vec += __msa_hadd_s_w(avg2, avg2); - vec += __msa_hadd_s_w(avg3, avg3); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 filt0, ref = { 0 }; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8u16 const255; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - const255 = (v8u16)__msa_ldi_h(255); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); - ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); - src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); - CALC_MSE_AVG_B(src0, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 filt0, out, ref0, ref1, ref2, ref3; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3, const255; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - const255 = (v8u16)__msa_ldi_h(255); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); - out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); - CALC_MSE_AVG_B(out, ref0, var, avg); - out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); - CALC_MSE_AVG_B(out, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v16u8 dst0, dst1, dst2, dst3, filt0; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7; - v8u16 const255; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - const255 = (v8u16)__msa_ldi_h(255); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - dst += (4 * dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - out0, out1, out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, - out4, out5, out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - MIN_UH4_UH(out0, out1, out2, out3, const255); - MIN_UH4_UH(out4, out5, out6, out7, const255); - PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, - src0, src1, src2, src3); - CALC_MSE_AVG_B(src0, dst0, var, avg); - CALC_MSE_AVG_B(src1, dst1, var, avg); - CALC_MSE_AVG_B(src2, dst2, var, avg); - CALC_MSE_AVG_B(src3, dst3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4, out; - v16u8 src10_r, src32_r, src21_r, src43_r; - v16u8 ref = { 0 }; - v16u8 src2110, src4332; - v16u8 filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - v8u16 tmp0, tmp1; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, - src10_r, src21_r, src32_r, src43_r); - ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 vec0, vec1, vec2, vec3; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, - vec0, vec1, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out0, out1, out2, out3; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - SAT_UH2_UH(tmp2, tmp3, 7); - out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - SAT_UH2_UH(tmp2, tmp3, 7); - out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - src0 = src4; - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - CALC_MSE_AVG_B(out2, ref2, var, avg); - CALC_MSE_AVG_B(out3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter_horiz, - const int8_t *filter_vert, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out, ref = { 0 }; - v16u8 filt_vt, filt_hz, vec0, vec1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4; - v8u16 tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter_horiz, - const int8_t *filter_vert, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out0, out1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt_vt, filt_hz, vec0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vt); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - SAT_UH2_UH(tmp2, tmp3, 7); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter_horiz, - const int8_t *filter_vert, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 ref0, ref1, ref2, ref3; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3; - v8u16 tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - LD_UB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - LD_UB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - CALC_MSE_AVG_B(src2, ref2, var, avg); - CALC_MSE_AVG_B(src3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter_horiz, - const int8_t *filter_vert, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height, - &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const int8_t *filter_horiz, - const int8_t *filter_vert, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height, - &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 out, pred, filt0, ref = { 0 }; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8u16 const255; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - const255 = (v8u16)__msa_ldi_h(255); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); - ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); - out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 out, pred, filt0; - v16u8 ref0, ref1, ref2, ref3; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8u16 const255; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - const255 = (v8u16)__msa_ldi_h(255); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); - out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); - - pred = LD_UB(sec_pred); - sec_pred += 16; - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref0, var, avg); - out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); - pred = LD_UB(sec_pred); - sec_pred += 16; - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff, - int32_t width) { - int16_t filtval; - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v16u8 dst0, dst1, dst2, dst3; - v16u8 tmp0, tmp1, tmp2, tmp3; - v16u8 pred0, pred1, pred2, pred3, filt0; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7; - v8u16 const255; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - const255 = (v8u16)__msa_ldi_h(255); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - dst += (4 * dst_stride); - LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); - sec_pred += (4 * width); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - out0, out1, out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, - out4, out5, out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - MIN_UH4_UH(out0, out1, out2, out3, const255); - MIN_UH4_UH(out4, out5, out6, out7, const255); - PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, - tmp0, tmp1, tmp2, tmp3); - AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, - tmp0, tmp1, tmp2, tmp3); - - CALC_MSE_AVG_B(tmp0, dst0, var, avg); - CALC_MSE_AVG_B(tmp1, dst1, var, avg); - CALC_MSE_AVG_B(tmp2, dst2, var, avg); - CALC_MSE_AVG_B(tmp3, dst3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, diff, 16); -} - -static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 32); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 64); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 src10_r, src32_r, src21_r, src43_r; - v16u8 out, pred, ref = { 0 }; - v16u8 src2110, src4332, filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - v8u16 tmp0, tmp1; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, - src10_r, src21_r, src32_r, src43_r); - ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, filt0; - v8u16 vec0, vec1, vec2, vec3; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, - vec0, vec1, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff, - int32_t width) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out0, out1, out2, out3, filt0; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); - sec_pred += (4 * width); - - ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - SAT_UH2_UH(tmp2, tmp3, 7); - out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - SAT_UH2_UH(tmp2, tmp3, 7); - out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - src0 = src4; - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, - out0, out1, out2, out3); - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - CALC_MSE_AVG_B(out2, ref2, var, avg); - CALC_MSE_AVG_B(out3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, diff, 16); -} - -static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 32); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter, - int32_t height, - int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 64); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter_horiz, const int8_t *filter_vert, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 out, pred, ref = { 0 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter_horiz, const int8_t *filter_vert, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 pred0, pred1, out0, out1; - v16u8 filt_hz, filt_vt, vec0; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vt); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - SAT_UH2_UH(tmp2, tmp3, 7); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter_horiz, - const int8_t *filter_vert, - int32_t height, - int32_t *diff, - int32_t width) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v16u8 out0, out1, out2, out3; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - LD_UB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - LD_UB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); - sec_pred += (4 * width); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - SAT_UH2_UH(tmp0, tmp1, 7); - out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, - out0, out1, out2, out3); - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - CALC_MSE_AVG_B(out2, ref2, var, avg); - CALC_MSE_AVG_B(out3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter_horiz, const int8_t *filter_vert, - int32_t height, int32_t *diff) { - return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, - sec_pred, filter_horiz, filter_vert, - height, diff, 16); -} - -static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter_horiz, const int8_t *filter_vert, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, - sec_pred, filter_horiz, filter_vert, - height, &diff0[loop_cnt], 32); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const int8_t *filter_horiz, const int8_t *filter_vert, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, - sec_pred, filter_horiz, filter_vert, - height, &diff0[loop_cnt], 64); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); -#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); -#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); -#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); -#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); -#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); -#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); - -#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); -#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); -#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); -#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); -#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); -#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); - -#define VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ -uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \ - int32_t src_stride, \ - int32_t xoffset, \ - int32_t yoffset, \ - const uint8_t *ref, \ - int32_t ref_stride, \ - uint32_t *sse) { \ - int32_t diff; \ - uint32_t var; \ - const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1]; \ - const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \ - ref, ref_stride, \ - h_filter, v_filter, \ - ht, &diff); \ - } else { \ - *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \ - ref, ref_stride, \ - v_filter, ht, &diff); \ - } \ - \ - var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \ - ref, ref_stride, \ - h_filter, ht, &diff); \ - \ - var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - var = vpx_variance##wd##x##ht##_msa(src, src_stride, \ - ref, ref_stride, sse); \ - } \ - } \ - \ - return var; \ -} - -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8); - -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16); - -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32); - -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64); - -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); - -#define VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ -uint32_t vp9_sub_pixel_avg_variance##wd##x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, \ - int32_t xoffset, int32_t yoffset, \ - const uint8_t *ref_ptr, int32_t ref_stride, \ - uint32_t *sse, const uint8_t *sec_pred) { \ - int32_t diff; \ - const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1]; \ - const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, v_filter, \ - ht, &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - ht, &diff); \ - } else { \ - *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, ht, &diff); \ - } \ - } \ - \ - return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ -} - -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4); -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8); - -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4); -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8); -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16); - -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8); -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16); -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32); - -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16); -VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32); - -uint32_t vp9_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, - int32_t src_stride, - int32_t xoffset, - int32_t yoffset, - const uint8_t *ref_ptr, - int32_t ref_stride, - uint32_t *sse, - const uint8_t *sec_pred) { - int32_t diff; - const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1]; - const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1]; - - if (yoffset) { - if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride, - ref_ptr, ref_stride, - sec_pred, h_filter, - v_filter, 64, &diff); - } else { - *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, - ref_ptr, ref_stride, - sec_pred, v_filter, - 64, &diff); - } - } else { - if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, - ref_ptr, ref_stride, - sec_pred, h_filter, - 64, &diff); - } else { - *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, - sec_pred, &diff); - } - } - - return VARIANCE_32Wx64H(*sse, diff); -} - -#define VP9_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ -uint32_t vp9_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr, \ - int32_t src_stride, \ - int32_t xoffset, \ - int32_t yoffset, \ - const uint8_t *ref_ptr, \ - int32_t ref_stride, \ - uint32_t *sse, \ - const uint8_t *sec_pred) { \ - int32_t diff; \ - const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1]; \ - const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, v_filter, \ - ht, &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - ht, &diff); \ - } else { \ - *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, &diff); \ - } \ - } \ - \ - return VARIANCE_64Wx##ht##H(*sse, diff); \ -} - -VP9_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32); -VP9_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index cb99af781..e94d43b14 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -224,7 +224,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->oxcf.aq_mode != VARIANCE_AQ) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); } vp9_init_plane_quantizers(cpi, x); @@ -678,7 +678,7 @@ static int choose_partitioning(VP9_COMP *cpi, if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); if (cyclic_refresh_segment_id_boosted(segment_id)) { int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); @@ -1002,7 +1002,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; mi_addr->mbmi.segment_id = - vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + get_segment_id(cm, map, bsize, mi_row, mi_col); } // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. @@ -1237,7 +1237,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, } else { const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); } x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); } else if (aq_mode == COMPLEXITY_AQ) { @@ -1247,7 +1247,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, : cm->last_frame_seg_map; // If segment is boosted, use rdmult for that segment. if (cyclic_refresh_segment_id_boosted( - vp9_get_segment_id(cm, map, bsize, mi_row, mi_col))) + get_segment_id(cm, map, bsize, mi_row, mi_col))) x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); } @@ -1698,7 +1698,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, cpi->oxcf.aq_mode == VARIANCE_AQ ) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); } else { // Setting segmentation map for cyclic_refresh. vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize, @@ -2799,7 +2799,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, if (seg->enabled) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); } @@ -3568,7 +3568,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, if (seg->enabled) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); if (seg_skip) { partition_search_type = FIXED_PARTITION; @@ -4198,7 +4198,7 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) { - ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd), + ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd), &td->counts->tx)[mbmi->tx_size]; } else { int x, y; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 45b5df4d1..b9b11064d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2013,11 +2013,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #endif for (t = 0; t < cpi->num_workers; ++t) { - VP9Worker *const worker = &cpi->workers[t]; + VPxWorker *const worker = &cpi->workers[t]; EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; // Deallocate allocated threads. - vp9_get_worker_interface()->end(worker); + vpx_get_worker_interface()->end(worker); // Deallocate allocated thread data. if (t < cpi->num_workers - 1) { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index afe3ae9da..8b324a77b 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -16,13 +16,13 @@ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vp8cx.h" +#include "vpx_thread/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_ppflags.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/vp9_thread.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_context_tree.h" @@ -497,7 +497,7 @@ typedef struct VP9_COMP { // Multi-threading int num_workers; - VP9Worker *workers; + VPxWorker *workers; struct EncWorkerData *tile_thr_data; VP9LfSync lf_row_sync; } VP9_COMP; diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 4ae3fbc54..b4132d815 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -69,7 +69,7 @@ static int get_max_tile_cols(VP9_COMP *cpi) { void vp9_encode_tiles_mt(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; - const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols); int i; @@ -94,7 +94,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { sizeof(*cpi->tile_thr_data))); for (i = 0; i < allocated_workers; i++) { - VP9Worker *const worker = &cpi->workers[i]; + VPxWorker *const worker = &cpi->workers[i]; EncWorkerData *thread_data = &cpi->tile_thr_data[i]; ++cpi->num_workers; @@ -132,10 +132,10 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { } for (i = 0; i < num_workers; i++) { - VP9Worker *const worker = &cpi->workers[i]; + VPxWorker *const worker = &cpi->workers[i]; EncWorkerData *thread_data; - worker->hook = (VP9WorkerHook)enc_worker_hook; + worker->hook = (VPxWorkerHook)enc_worker_hook; worker->data1 = &cpi->tile_thr_data[i]; worker->data2 = NULL; thread_data = (EncWorkerData*)worker->data1; @@ -170,7 +170,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { // Encode a frame for (i = 0; i < num_workers; i++) { - VP9Worker *const worker = &cpi->workers[i]; + VPxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; // Set the starting tile for each thread. @@ -184,12 +184,12 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { // Encoding ends. for (i = 0; i < num_workers; i++) { - VP9Worker *const worker = &cpi->workers[i]; + VPxWorker *const worker = &cpi->workers[i]; winterface->sync(worker); } for (i = 0; i < num_workers; i++) { - VP9Worker *const worker = &cpi->workers[i]; + VPxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; // Accumulate counters. diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 9b15072e9..1f0d4dfee 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -129,8 +129,8 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd, if (cm->frame_type != KEY_FRAME) { const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; // Test to see if the segment id matches the predicted value. - const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, - bsize, mi_row, mi_col); + const int pred_segment_id = get_segment_id(cm, cm->last_frame_seg_map, + bsize, mi_row, mi_col); const int pred_flag = pred_segment_id == segment_id; const int pred_context = vp9_get_pred_context_seg_id(xd); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 8a180387f..c257d91b4 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -51,8 +51,6 @@ VP9_COMMON_SRCS-yes += common/vp9_seg_common.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.c VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h VP9_COMMON_SRCS-yes += common/vp9_textblit.h -VP9_COMMON_SRCS-yes += common/vp9_thread.h -VP9_COMMON_SRCS-yes += common/vp9_thread.c VP9_COMMON_SRCS-yes += common/vp9_tile_common.h VP9_COMMON_SRCS-yes += common/vp9_tile_common.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 4080d64c1..ab5bcba1c 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -17,10 +17,10 @@ #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vp8dx.h" #include "vpx/vpx_decoder.h" +#include "vpx_thread/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_frame_buffers.h" -#include "vp9/common/vp9_thread.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_decodeframe.h" @@ -59,7 +59,7 @@ struct vpx_codec_alg_priv { // Frame parallel related. int frame_parallel_decode; // frame-based threading. - VP9Worker *frame_workers; + VPxWorker *frame_workers; int num_frame_workers; int next_submit_worker_id; int last_submit_worker_id; @@ -112,10 +112,10 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { if (ctx->frame_workers != NULL) { int i; for (i = 0; i < ctx->num_frame_workers; ++i) { - VP9Worker *const worker = &ctx->frame_workers[i]; + VPxWorker *const worker = &ctx->frame_workers[i]; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - vp9_get_worker_interface()->end(worker); + vpx_get_worker_interface()->end(worker); vp9_remove_common(&frame_worker_data->pbi->common); #if CONFIG_VP9_POSTPROC vp9_free_postproc_buffers(&frame_worker_data->pbi->common); @@ -279,7 +279,7 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { int i; for (i = 0; i < ctx->num_frame_workers; ++i) { - VP9Worker *const worker = &ctx->frame_workers[i]; + VPxWorker *const worker = &ctx->frame_workers[i]; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; VP9_COMMON *const cm = &frame_worker_data->pbi->common; BufferPool *const pool = cm->buffer_pool; @@ -336,7 +336,7 @@ static int frame_worker_hook(void *arg1, void *arg2) { // the compressed data. if (frame_worker_data->result != 0 || frame_worker_data->data + frame_worker_data->data_size - 1 > data) { - VP9Worker *const worker = frame_worker_data->pbi->frame_worker_owner; + VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner; BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool; // Signal all the other threads that are waiting for this frame. vp9_frameworker_lock_stats(worker); @@ -359,7 +359,7 @@ static int frame_worker_hook(void *arg1, void *arg2) { static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { int i; - const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); ctx->last_show_frame = -1; ctx->next_submit_worker_id = 0; @@ -387,7 +387,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { } #endif - ctx->frame_workers = (VP9Worker *) + ctx->frame_workers = (VPxWorker *) vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers)); if (ctx->frame_workers == NULL) { set_error_detail(ctx, "Failed to allocate frame_workers"); @@ -395,7 +395,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { } for (i = 0; i < ctx->num_frame_workers; ++i) { - VP9Worker *const worker = &ctx->frame_workers[i]; + VPxWorker *const worker = &ctx->frame_workers[i]; FrameWorkerData *frame_worker_data = NULL; winterface->init(worker); worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData)); @@ -435,7 +435,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode; frame_worker_data->pbi->common.frame_parallel_decode = ctx->frame_parallel_decode; - worker->hook = (VP9WorkerHook)frame_worker_hook; + worker->hook = (VPxWorkerHook)frame_worker_hook; if (!winterface->reset(worker)) { set_error_detail(ctx, "Frame Worker thread creation failed"); return VPX_CODEC_MEM_ERROR; @@ -464,7 +464,7 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, void *user_priv, int64_t deadline) { - const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); (void)deadline; // Determine the stream parameters. Note that we rely on peek_si to @@ -483,7 +483,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, } if (!ctx->frame_parallel_decode) { - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; frame_worker_data->data = *data; frame_worker_data->data_size = data_sz; @@ -506,7 +506,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, check_resync(ctx, frame_worker_data->pbi); } else { - VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; + VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; // Copy context from last worker thread to next worker thread. if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) @@ -554,8 +554,8 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) { YV12_BUFFER_CONFIG sd; vp9_ppflags_t flags = {0, 0, 0}; - const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); - VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; ctx->next_output_worker_id = (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; @@ -746,8 +746,8 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, do { YV12_BUFFER_CONFIG sd; vp9_ppflags_t flags = {0, 0, 0}; - const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); - VP9Worker *const worker = + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; @@ -819,7 +819,7 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, if (data) { vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); return vp9_set_reference_dec(&frame_worker_data->pbi->common, @@ -842,7 +842,7 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data; YV12_BUFFER_CONFIG sd; - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); return vp9_copy_reference_dec(frame_worker_data->pbi, @@ -864,7 +864,7 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, if (data) { YV12_BUFFER_CONFIG* fb; - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); if (fb == NULL) return VPX_CODEC_ERROR; @@ -913,7 +913,7 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, if (update_info) { if (ctx->frame_workers) { - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; *update_info = frame_worker_data->pbi->refresh_frame_flags; @@ -932,7 +932,7 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, if (corrupted) { if (ctx->frame_workers) { - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; RefCntBuffer *const frame_bufs = @@ -962,7 +962,7 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx, if (frame_size) { if (ctx->frame_workers) { - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const VP9_COMMON *const cm = &frame_worker_data->pbi->common; @@ -989,7 +989,7 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx, if (display_size) { if (ctx->frame_workers) { - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const VP9_COMMON *const cm = &frame_worker_data->pbi->common; @@ -1007,7 +1007,7 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, va_list args) { unsigned int *const bit_depth = va_arg(args, unsigned int *); - VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; + VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; if (bit_depth) { if (worker) { @@ -1053,7 +1053,7 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, ctx->byte_alignment = byte_alignment; if (ctx->frame_workers) { - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; frame_worker_data->pbi->common.byte_alignment = byte_alignment; @@ -1066,7 +1066,7 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, ctx->skip_loop_filter = va_arg(args, int); if (ctx->frame_workers) { - VP9Worker *const worker = ctx->frame_workers; + VPxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter; } diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index dca9a6cbd..5a039accc 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -163,6 +163,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_variance_msa.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vp9/common/vp9_thread.c b/vpx_thread/vpx_thread.c index 1c6aec032..0bb0125bd 100644 --- a/vp9/common/vp9_thread.c +++ b/vpx_thread/vpx_thread.c @@ -15,12 +15,12 @@ #include <assert.h> #include <string.h> // for memset() -#include "./vp9_thread.h" +#include "./vpx_thread.h" #include "vpx_mem/vpx_mem.h" #if CONFIG_MULTITHREAD -struct VP9WorkerImpl { +struct VPxWorkerImpl { pthread_mutex_t mutex_; pthread_cond_t condition_; pthread_t thread_; @@ -28,10 +28,10 @@ struct VP9WorkerImpl { //------------------------------------------------------------------------------ -static void execute(VP9Worker *const worker); // Forward declaration. +static void execute(VPxWorker *const worker); // Forward declaration. static THREADFN thread_loop(void *ptr) { - VP9Worker *const worker = (VP9Worker*)ptr; + VPxWorker *const worker = (VPxWorker*)ptr; int done = 0; while (!done) { pthread_mutex_lock(&worker->impl_->mutex_); @@ -52,8 +52,8 @@ static THREADFN thread_loop(void *ptr) { } // main thread state control -static void change_state(VP9Worker *const worker, - VP9WorkerStatus new_status) { +static void change_state(VPxWorker *const worker, + VPxWorkerStatus new_status) { // No-op when attempting to change state on a thread that didn't come up. // Checking status_ without acquiring the lock first would result in a data // race. @@ -78,12 +78,12 @@ static void change_state(VP9Worker *const worker, //------------------------------------------------------------------------------ -static void init(VP9Worker *const worker) { +static void init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); worker->status_ = NOT_OK; } -static int sync(VP9Worker *const worker) { +static int sync(VPxWorker *const worker) { #if CONFIG_MULTITHREAD change_state(worker, OK); #endif @@ -91,12 +91,12 @@ static int sync(VP9Worker *const worker) { return !worker->had_error; } -static int reset(VP9Worker *const worker) { +static int reset(VPxWorker *const worker) { int ok = 1; worker->had_error = 0; if (worker->status_ < OK) { #if CONFIG_MULTITHREAD - worker->impl_ = (VP9WorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_)); + worker->impl_ = (VPxWorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_)); if (worker->impl_ == NULL) { return 0; } @@ -129,13 +129,13 @@ static int reset(VP9Worker *const worker) { return ok; } -static void execute(VP9Worker *const worker) { +static void execute(VPxWorker *const worker) { if (worker->hook != NULL) { worker->had_error |= !worker->hook(worker->data1, worker->data2); } } -static void launch(VP9Worker *const worker) { +static void launch(VPxWorker *const worker) { #if CONFIG_MULTITHREAD change_state(worker, WORK); #else @@ -143,7 +143,7 @@ static void launch(VP9Worker *const worker) { #endif } -static void end(VP9Worker *const worker) { +static void end(VPxWorker *const worker) { #if CONFIG_MULTITHREAD if (worker->impl_ != NULL) { change_state(worker, NOT_OK); @@ -162,11 +162,11 @@ static void end(VP9Worker *const worker) { //------------------------------------------------------------------------------ -static VP9WorkerInterface g_worker_interface = { +static VPxWorkerInterface g_worker_interface = { init, reset, sync, launch, execute, end }; -int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) { +int vpx_set_worker_interface(const VPxWorkerInterface* const winterface) { if (winterface == NULL || winterface->init == NULL || winterface->reset == NULL || winterface->sync == NULL || winterface->launch == NULL || @@ -177,7 +177,7 @@ int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) { return 1; } -const VP9WorkerInterface *vp9_get_worker_interface(void) { +const VPxWorkerInterface *vpx_get_worker_interface(void) { return &g_worker_interface; } diff --git a/vp9/common/vp9_thread.h b/vpx_thread/vpx_thread.h index 12848fede..de63c4da0 100644 --- a/vp9/common/vp9_thread.h +++ b/vpx_thread/vpx_thread.h @@ -13,8 +13,8 @@ // http://git.chromium.org/webm/libwebp.git // 100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38 src/utils/thread.h -#ifndef VP9_DECODER_VP9_THREAD_H_ -#define VP9_DECODER_VP9_THREAD_H_ +#ifndef VPX_THREAD_H_ +#define VPX_THREAD_H_ #include "./vpx_config.h" @@ -160,59 +160,59 @@ typedef enum { NOT_OK = 0, // object is unusable OK, // ready to work WORK // busy finishing the current task -} VP9WorkerStatus; +} VPxWorkerStatus; // Function to be called by the worker thread. Takes two opaque pointers as // arguments (data1 and data2), and should return false in case of error. -typedef int (*VP9WorkerHook)(void*, void*); +typedef int (*VPxWorkerHook)(void*, void*); // Platform-dependent implementation details for the worker. -typedef struct VP9WorkerImpl VP9WorkerImpl; +typedef struct VPxWorkerImpl VPxWorkerImpl; // Synchronization object used to launch job in the worker thread typedef struct { - VP9WorkerImpl *impl_; - VP9WorkerStatus status_; - VP9WorkerHook hook; // hook to call + VPxWorkerImpl *impl_; + VPxWorkerStatus status_; + VPxWorkerHook hook; // hook to call void *data1; // first argument passed to 'hook' void *data2; // second argument passed to 'hook' int had_error; // return value of the last call to 'hook' -} VP9Worker; +} VPxWorker; // The interface for all thread-worker related functions. All these functions // must be implemented. typedef struct { // Must be called first, before any other method. - void (*init)(VP9Worker *const worker); + void (*init)(VPxWorker *const worker); // Must be called to initialize the object and spawn the thread. Re-entrant. // Will potentially launch the thread. Returns false in case of error. - int (*reset)(VP9Worker *const worker); + int (*reset)(VPxWorker *const worker); // Makes sure the previous work is finished. Returns true if worker->had_error // was not set and no error condition was triggered by the working thread. - int (*sync)(VP9Worker *const worker); + int (*sync)(VPxWorker *const worker); // Triggers the thread to call hook() with data1 and data2 arguments. These // hook/data1/data2 values can be changed at any time before calling this // function, but not be changed afterward until the next call to Sync(). - void (*launch)(VP9Worker *const worker); + void (*launch)(VPxWorker *const worker); // This function is similar to launch() except that it calls the // hook directly instead of using a thread. Convenient to bypass the thread - // mechanism while still using the VP9Worker structs. sync() must + // mechanism while still using the VPxWorker structs. sync() must // still be called afterward (for error reporting). - void (*execute)(VP9Worker *const worker); + void (*execute)(VPxWorker *const worker); // Kill the thread and terminate the object. To use the object again, one // must call reset() again. - void (*end)(VP9Worker *const worker); -} VP9WorkerInterface; + void (*end)(VPxWorker *const worker); +} VPxWorkerInterface; // Install a new set of threading functions, overriding the defaults. This // should be done before any workers are started, i.e., before any encoding or // decoding takes place. The contents of the interface struct are copied, it // is safe to free the corresponding memory after this call. This function is // not thread-safe. Return false in case of invalid pointer or methods. -int vp9_set_worker_interface(const VP9WorkerInterface *const winterface); +int vpx_set_worker_interface(const VPxWorkerInterface *const winterface); // Retrieve the currently set thread worker interface. -const VP9WorkerInterface *vp9_get_worker_interface(void); +const VPxWorkerInterface *vpx_get_worker_interface(void); //------------------------------------------------------------------------------ @@ -220,4 +220,4 @@ const VP9WorkerInterface *vp9_get_worker_interface(void); } // extern "C" #endif -#endif // VP9_DECODER_VP9_THREAD_H_ +#endif // VPX_THREAD_H_ diff --git a/vpx_thread/vpx_thread.mk b/vpx_thread/vpx_thread.mk new file mode 100644 index 000000000..0a4a3648a --- /dev/null +++ b/vpx_thread/vpx_thread.mk @@ -0,0 +1,13 @@ +## +## Copyright (c) 2015 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +THREAD_SRCS-yes += vpx_thread.mk +THREAD_SRCS-yes += vpx_thread.c +THREAD_SRCS-yes += vpx_thread.h |