summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libs.mk3
-rw-r--r--test/variance_test.cc79
-rw-r--r--test/vp9_thread_test.cc84
-rw-r--r--vp9/common/vp9_onyxc_int.h4
-rw-r--r--vp9/common/vp9_pred_common.c42
-rw-r--r--vp9/common/vp9_pred_common.h45
-rw-r--r--vp9/common/vp9_rtcd_defs.pl52
-rw-r--r--vp9/common/vp9_thread_common.c10
-rw-r--r--vp9/common/vp9_thread_common.h4
-rw-r--r--vp9/decoder/vp9_decodeframe.c20
-rw-r--r--vp9/decoder/vp9_decodemv.c4
-rw-r--r--vp9/decoder/vp9_decoder.c16
-rw-r--r--vp9/decoder/vp9_decoder.h9
-rw-r--r--vp9/decoder/vp9_dsubexp.c48
-rw-r--r--vp9/decoder/vp9_dthread.c16
-rw-r--r--vp9/decoder/vp9_dthread.h14
-rw-r--r--vp9/encoder/mips/msa/vp9_variance_msa.c2011
-rw-r--r--vp9/encoder/vp9_encodeframe.c18
-rw-r--r--vp9/encoder/vp9_encoder.c4
-rw-r--r--vp9/encoder/vp9_encoder.h4
-rw-r--r--vp9/encoder/vp9_ethread.c14
-rw-r--r--vp9/encoder/vp9_segmentation.c4
-rw-r--r--vp9/vp9_common.mk2
-rw-r--r--vp9/vp9_dx_iface.c54
-rw-r--r--vp9/vp9cx.mk1
-rw-r--r--vpx_thread/vpx_thread.c (renamed from vp9/common/vp9_thread.c)32
-rw-r--r--vpx_thread/vpx_thread.h (renamed from vp9/common/vp9_thread.h)40
-rw-r--r--vpx_thread/vpx_thread.mk13
28 files changed, 283 insertions, 2364 deletions
diff --git a/libs.mk b/libs.mk
index 6215990c9..165002895 100644
--- a/libs.mk
+++ b/libs.mk
@@ -50,6 +50,9 @@ CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk
CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
+include $(SRC_PATH_BARE)/vpx_thread/vpx_thread.mk
+CODEC_SRCS-yes += $(addprefix vpx_thread/,$(call enabled,THREAD_SRCS))
+
ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
VP8_PREFIX=vp8/
include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 614c4d999..220170048 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -2058,84 +2058,5 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(3, 2, variance8x4_msa, 0),
make_tuple(2, 3, variance4x8_msa, 0),
make_tuple(2, 2, variance4x4_msa, 0)));
-
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance4x4_msa = vp9_sub_pixel_variance4x4_msa;
-const SubpixVarMxNFunc subpel_variance4x8_msa = vp9_sub_pixel_variance4x8_msa;
-const SubpixVarMxNFunc subpel_variance8x4_msa = vp9_sub_pixel_variance8x4_msa;
-const SubpixVarMxNFunc subpel_variance8x8_msa = vp9_sub_pixel_variance8x8_msa;
-const SubpixVarMxNFunc subpel_variance8x16_msa = vp9_sub_pixel_variance8x16_msa;
-const SubpixVarMxNFunc subpel_variance16x8_msa = vp9_sub_pixel_variance16x8_msa;
-const SubpixVarMxNFunc subpel_variance16x16_msa =
- vp9_sub_pixel_variance16x16_msa;
-const SubpixVarMxNFunc subpel_variance16x32_msa =
- vp9_sub_pixel_variance16x32_msa;
-const SubpixVarMxNFunc subpel_variance32x16_msa =
- vp9_sub_pixel_variance32x16_msa;
-const SubpixVarMxNFunc subpel_variance32x32_msa =
- vp9_sub_pixel_variance32x32_msa;
-const SubpixVarMxNFunc subpel_variance32x64_msa =
- vp9_sub_pixel_variance32x64_msa;
-const SubpixVarMxNFunc subpel_variance64x32_msa =
- vp9_sub_pixel_variance64x32_msa;
-const SubpixVarMxNFunc subpel_variance64x64_msa =
- vp9_sub_pixel_variance64x64_msa;
-INSTANTIATE_TEST_CASE_P(
- MSA, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_msa, 0),
- make_tuple(2, 3, subpel_variance4x8_msa, 0),
- make_tuple(3, 2, subpel_variance8x4_msa, 0),
- make_tuple(3, 3, subpel_variance8x8_msa, 0),
- make_tuple(3, 4, subpel_variance8x16_msa, 0),
- make_tuple(4, 3, subpel_variance16x8_msa, 0),
- make_tuple(4, 4, subpel_variance16x16_msa, 0),
- make_tuple(4, 5, subpel_variance16x32_msa, 0),
- make_tuple(5, 4, subpel_variance32x16_msa, 0),
- make_tuple(5, 5, subpel_variance32x32_msa, 0),
- make_tuple(5, 6, subpel_variance32x64_msa, 0),
- make_tuple(6, 5, subpel_variance64x32_msa, 0),
- make_tuple(6, 6, subpel_variance64x64_msa, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_msa =
- vp9_sub_pixel_avg_variance4x4_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_msa =
- vp9_sub_pixel_avg_variance4x8_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_msa =
- vp9_sub_pixel_avg_variance8x4_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_msa =
- vp9_sub_pixel_avg_variance8x8_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_msa =
- vp9_sub_pixel_avg_variance8x16_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_msa =
- vp9_sub_pixel_avg_variance16x8_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_msa =
- vp9_sub_pixel_avg_variance16x16_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_msa =
- vp9_sub_pixel_avg_variance16x32_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_msa =
- vp9_sub_pixel_avg_variance32x16_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_msa =
- vp9_sub_pixel_avg_variance32x32_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_msa =
- vp9_sub_pixel_avg_variance32x64_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_msa =
- vp9_sub_pixel_avg_variance64x32_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_msa =
- vp9_sub_pixel_avg_variance64x64_msa;
-INSTANTIATE_TEST_CASE_P(
- MSA, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_msa, 0),
- make_tuple(2, 3, subpel_avg_variance4x8_msa, 0),
- make_tuple(3, 2, subpel_avg_variance8x4_msa, 0),
- make_tuple(3, 3, subpel_avg_variance8x8_msa, 0),
- make_tuple(3, 4, subpel_avg_variance8x16_msa, 0),
- make_tuple(4, 3, subpel_avg_variance16x8_msa, 0),
- make_tuple(4, 4, subpel_avg_variance16x16_msa, 0),
- make_tuple(4, 5, subpel_avg_variance16x32_msa, 0),
- make_tuple(5, 4, subpel_avg_variance32x16_msa, 0),
- make_tuple(5, 5, subpel_avg_variance32x32_msa, 0),
- make_tuple(5, 6, subpel_avg_variance32x64_msa, 0),
- make_tuple(6, 5, subpel_avg_variance64x32_msa, 0),
- make_tuple(6, 6, subpel_avg_variance64x64_msa, 0)));
-#endif // CONFIG_VP9_ENCODER
#endif // HAVE_MSA
} // namespace
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index 902a6fc35..bde00ea04 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -18,33 +18,33 @@
#if CONFIG_WEBM_IO
#include "test/webm_video_source.h"
#endif
-#include "vp9/common/vp9_thread.h"
+#include "vpx_thread/vpx_thread.h"
namespace {
using std::string;
-class VP9WorkerThreadTest : public ::testing::TestWithParam<bool> {
+class VPxWorkerThreadTest : public ::testing::TestWithParam<bool> {
protected:
- virtual ~VP9WorkerThreadTest() {}
+ virtual ~VPxWorkerThreadTest() {}
virtual void SetUp() {
- vp9_get_worker_interface()->init(&worker_);
+ vpx_get_worker_interface()->init(&worker_);
}
virtual void TearDown() {
- vp9_get_worker_interface()->end(&worker_);
+ vpx_get_worker_interface()->end(&worker_);
}
- void Run(VP9Worker* worker) {
+ void Run(VPxWorker* worker) {
const bool synchronous = GetParam();
if (synchronous) {
- vp9_get_worker_interface()->execute(worker);
+ vpx_get_worker_interface()->execute(worker);
} else {
- vp9_get_worker_interface()->launch(worker);
+ vpx_get_worker_interface()->launch(worker);
}
}
- VP9Worker worker_;
+ VPxWorker worker_;
};
int ThreadHook(void* data, void* return_value) {
@@ -53,12 +53,12 @@ int ThreadHook(void* data, void* return_value) {
return *reinterpret_cast<int*>(return_value);
}
-TEST_P(VP9WorkerThreadTest, HookSuccess) {
+TEST_P(VPxWorkerThreadTest, HookSuccess) {
// should be a no-op.
- EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
+ EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0);
for (int i = 0; i < 2; ++i) {
- EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
+ EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0);
int hook_data = 0;
int return_value = 1; // return successfully from the hook
@@ -67,17 +67,17 @@ TEST_P(VP9WorkerThreadTest, HookSuccess) {
worker_.data2 = &return_value;
Run(&worker_);
- EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
+ EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0);
EXPECT_FALSE(worker_.had_error);
EXPECT_EQ(5, hook_data);
// should be a no-op.
- EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
+ EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0);
}
}
-TEST_P(VP9WorkerThreadTest, HookFailure) {
- EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
+TEST_P(VPxWorkerThreadTest, HookFailure) {
+ EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0);
int hook_data = 0;
int return_value = 0; // return failure from the hook
@@ -86,29 +86,29 @@ TEST_P(VP9WorkerThreadTest, HookFailure) {
worker_.data2 = &return_value;
Run(&worker_);
- EXPECT_FALSE(vp9_get_worker_interface()->sync(&worker_));
+ EXPECT_FALSE(vpx_get_worker_interface()->sync(&worker_));
EXPECT_EQ(1, worker_.had_error);
// Ensure _reset() clears the error and _launch() can be called again.
return_value = 1;
- EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
+ EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0);
EXPECT_FALSE(worker_.had_error);
- vp9_get_worker_interface()->launch(&worker_);
- EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
+ vpx_get_worker_interface()->launch(&worker_);
+ EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0);
EXPECT_FALSE(worker_.had_error);
}
-TEST_P(VP9WorkerThreadTest, EndWithoutSync) {
+TEST_P(VPxWorkerThreadTest, EndWithoutSync) {
// Create a large number of threads to increase the chances of detecting a
// race. Doing more work in the hook is no guarantee as any race would occur
// post hook execution in the main thread loop driver.
static const int kNumWorkers = 64;
- VP9Worker workers[kNumWorkers];
+ VPxWorker workers[kNumWorkers];
int hook_data[kNumWorkers];
int return_value[kNumWorkers];
for (int n = 0; n < kNumWorkers; ++n) {
- vp9_get_worker_interface()->init(&workers[n]);
+ vpx_get_worker_interface()->init(&workers[n]);
return_value[n] = 1; // return successfully from the hook
workers[n].hook = ThreadHook;
workers[n].data1 = &hook_data[n];
@@ -117,7 +117,7 @@ TEST_P(VP9WorkerThreadTest, EndWithoutSync) {
for (int i = 0; i < 2; ++i) {
for (int n = 0; n < kNumWorkers; ++n) {
- EXPECT_NE(vp9_get_worker_interface()->reset(&workers[n]), 0);
+ EXPECT_NE(vpx_get_worker_interface()->reset(&workers[n]), 0);
hook_data[n] = 0;
}
@@ -126,16 +126,16 @@ TEST_P(VP9WorkerThreadTest, EndWithoutSync) {
}
for (int n = kNumWorkers - 1; n >= 0; --n) {
- vp9_get_worker_interface()->end(&workers[n]);
+ vpx_get_worker_interface()->end(&workers[n]);
}
}
}
-TEST(VP9WorkerThreadTest, TestInterfaceAPI) {
- EXPECT_EQ(0, vp9_set_worker_interface(NULL));
- EXPECT_TRUE(vp9_get_worker_interface() != NULL);
+TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
+ EXPECT_EQ(0, vpx_set_worker_interface(NULL));
+ EXPECT_TRUE(vpx_get_worker_interface() != NULL);
for (int i = 0; i < 6; ++i) {
- VP9WorkerInterface winterface = *vp9_get_worker_interface();
+ VPxWorkerInterface winterface = *vpx_get_worker_interface();
switch (i) {
default:
case 0: winterface.init = NULL; break;
@@ -145,7 +145,7 @@ TEST(VP9WorkerThreadTest, TestInterfaceAPI) {
case 4: winterface.execute = NULL; break;
case 5: winterface.end = NULL; break;
}
- EXPECT_EQ(0, vp9_set_worker_interface(&winterface));
+ EXPECT_EQ(0, vpx_set_worker_interface(&winterface));
}
}
@@ -202,21 +202,21 @@ void DecodeFiles(const FileList files[]) {
// hang.
namespace impl {
-void Init(VP9Worker *const worker) { memset(worker, 0, sizeof(*worker)); }
-int Reset(VP9Worker *const /*worker*/) { return 1; }
-int Sync(VP9Worker *const worker) { return !worker->had_error; }
+void Init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); }
+int Reset(VPxWorker *const /*worker*/) { return 1; }
+int Sync(VPxWorker *const worker) { return !worker->had_error; }
-void Execute(VP9Worker *const worker) {
+void Execute(VPxWorker *const worker) {
worker->had_error |= !worker->hook(worker->data1, worker->data2);
}
-void Launch(VP9Worker *const worker) { Execute(worker); }
-void End(VP9Worker *const /*worker*/) {}
+void Launch(VPxWorker *const worker) { Execute(worker); }
+void End(VPxWorker *const /*worker*/) {}
} // namespace impl
-TEST(VP9WorkerThreadTest, TestSerialInterface) {
- static const VP9WorkerInterface serial_interface = {
+TEST(VPxWorkerThreadTest, TestSerialInterface) {
+ static const VPxWorkerInterface serial_interface = {
impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
};
// TODO(jzern): Avoid using a file that will use the row-based thread
@@ -225,13 +225,13 @@ TEST(VP9WorkerThreadTest, TestSerialInterface) {
// progress in the row above before proceeding.
static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
static const char filename[] = "vp90-2-03-size-226x226.webm";
- VP9WorkerInterface default_interface = *vp9_get_worker_interface();
+ VPxWorkerInterface default_interface = *vpx_get_worker_interface();
- EXPECT_NE(vp9_set_worker_interface(&serial_interface), 0);
+ EXPECT_NE(vpx_set_worker_interface(&serial_interface), 0);
EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
// Reset the interface.
- EXPECT_NE(vp9_set_worker_interface(&default_interface), 0);
+ EXPECT_NE(vpx_set_worker_interface(&default_interface), 0);
EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
}
@@ -309,6 +309,6 @@ TEST(VP9DecodeMultiThreadedTest, Decode3) {
}
#endif // CONFIG_WEBM_IO
-INSTANTIATE_TEST_CASE_P(Synchronous, VP9WorkerThreadTest, ::testing::Bool());
+INSTANTIATE_TEST_CASE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool());
} // namespace
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 1811d76df..bacbd8a22 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_thread/vpx_thread.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_loopfilter.h"
@@ -21,7 +22,6 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_frame_buffers.h"
#include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_thread.h"
#include "vp9/common/vp9_tile_common.h"
#if CONFIG_VP9_POSTPROC
@@ -80,7 +80,7 @@ typedef struct {
// frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
// that no FrameWorker owns, or is decoding, this buffer.
- VP9Worker *frame_worker_owner;
+ VPxWorker *frame_worker_owner;
// row and col indicate which position frame has been decoded to in real
// pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 0aac4a9e6..1f1632573 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -9,8 +9,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <limits.h>
-
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_seg_common.h"
@@ -339,43 +337,3 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
-// Returns a context number for the given MB prediction signal
-// The mode info data structure has a one element border above and to the
-// left of the entries corresponding to real blocks.
-// The prediction flags in these dummy entries are initialized to 0.
-int vp9_get_tx_size_context(const MACROBLOCKD *xd) {
- const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
- const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
- const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
- const int has_above = xd->up_available;
- const int has_left = xd->left_available;
- int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size
- : max_tx_size;
- int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size
- : max_tx_size;
- if (!has_left)
- left_ctx = above_ctx;
-
- if (!has_above)
- above_ctx = left_ctx;
-
- return (above_ctx + left_ctx) > max_tx_size;
-}
-
-int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
- BLOCK_SIZE bsize, int mi_row, int mi_col) {
- const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = num_8x8_blocks_wide_lookup[bsize];
- const int bh = num_8x8_blocks_high_lookup[bsize];
- const int xmis = MIN(cm->mi_cols - mi_col, bw);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
- int x, y, segment_id = INT_MAX;
-
- for (y = 0; y < ymis; y++)
- for (x = 0; x < xmis; x++)
- segment_id = MIN(segment_id,
- segment_ids[mi_offset + y * cm->mi_cols + x]);
-
- assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
- return segment_id;
-}
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index bc19d28b9..76161444b 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -18,8 +18,24 @@
extern "C" {
#endif
-int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
- BLOCK_SIZE bsize, int mi_row, int mi_col);
+static INLINE int get_segment_id(const VP9_COMMON *cm,
+ const uint8_t *segment_ids,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int xmis = MIN(cm->mi_cols - mi_col, bw);
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ int x, y, segment_id = MAX_SEGMENTS;
+
+ for (y = 0; y < ymis; ++y)
+ for (x = 0; x < xmis; ++x)
+ segment_id = MIN(segment_id,
+ segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+ return segment_id;
+}
static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
const MODE_INFO *const above_mi = xd->above_mi;
@@ -88,7 +104,28 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
}
-int vp9_get_tx_size_context(const MACROBLOCKD *xd);
+// Returns a context number for the given MB prediction signal
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real blocks.
+// The prediction flags in these dummy entries are initialized to 0.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+ const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+ int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size
+ : max_tx_size;
+ int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size
+ : max_tx_size;
+ if (!has_left)
+ left_ctx = above_ctx;
+
+ if (!has_above)
+ above_ctx = left_ctx;
+
+ return (above_ctx + left_ctx) > max_tx_size;
+}
static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
const struct tx_probs *tx_probs) {
@@ -108,7 +145,7 @@ static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size,
const MACROBLOCKD *xd,
const struct tx_probs *tx_probs) {
- return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs);
+ return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);
}
static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index af2307eb0..d8c7e1447 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -805,84 +805,84 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
# variance
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance32x32 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x8 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance8x8 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon msa/;
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index cba57ff41..6b11c93cc 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -157,9 +157,9 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
VP9_COMMON *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only,
- VP9Worker *workers, int nworkers,
+ VPxWorker *workers, int nworkers,
VP9LfSync *lf_sync) {
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
// Number of superblock rows and cols
const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
// Decoder may allocate more threads than number of tiles based on user's
@@ -186,10 +186,10 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
// because of contention. If the multithreading code changes in the future
// then the number of workers used by the loopfilter should be revisited.
for (i = 0; i < num_workers; ++i) {
- VP9Worker *const worker = &workers[i];
+ VPxWorker *const worker = &workers[i];
LFWorkerData *const lf_data = &lf_sync->lfdata[i];
- worker->hook = (VP9WorkerHook)loop_filter_row_worker;
+ worker->hook = (VPxWorkerHook)loop_filter_row_worker;
worker->data1 = lf_sync;
worker->data2 = lf_data;
@@ -218,7 +218,7 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
struct macroblockd_plane planes[MAX_MB_PLANE],
int frame_filter_level,
int y_only, int partial_frame,
- VP9Worker *workers, int num_workers,
+ VPxWorker *workers, int num_workers,
VP9LfSync *lf_sync) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h
index 3b3a6996a..94e6dfe7f 100644
--- a/vp9/common/vp9_thread_common.h
+++ b/vp9/common/vp9_thread_common.h
@@ -12,7 +12,7 @@
#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
#include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_thread.h"
+#include "vpx_thread/vpx_thread.h"
struct VP9Common;
struct FRAME_COUNTS;
@@ -48,7 +48,7 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
struct macroblockd_plane planes[MAX_MB_PLANE],
int frame_filter_level,
int y_only, int partial_frame,
- VP9Worker *workers, int num_workers,
+ VPxWorker *workers, int num_workers,
VP9LfSync *lf_sync);
void vp9_accumulate_frame_counts(struct VP9Common *cm,
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 4fc10b4e7..a7e9fdaa6 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -18,6 +18,7 @@
#include "vpx_ports/mem.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_scale/vpx_scale.h"
+#include "vpx_thread/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_common.h"
@@ -30,7 +31,6 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_thread.h"
#include "vp9/common/vp9_tile_common.h"
#include "vp9/decoder/vp9_decodeframe.h"
@@ -1259,7 +1259,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
const uint8_t *data,
const uint8_t *data_end) {
VP9_COMMON *const cm = &pbi->common;
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
@@ -1272,7 +1272,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
pbi->lf_worker.data1 == NULL) {
CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
vpx_memalign(32, sizeof(LFWorkerData)));
- pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
+ pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker;
if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Loop filter thread creation failed");
@@ -1434,7 +1434,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
const uint8_t *data,
const uint8_t *data_end) {
VP9_COMMON *const cm = &pbi->common;
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
const uint8_t *bit_reader_end = NULL;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
@@ -1464,7 +1464,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
for (i = 0; i < num_threads; ++i) {
- VP9Worker *const worker = &pbi->tile_workers[i];
+ VPxWorker *const worker = &pbi->tile_workers[i];
++pbi->num_tile_workers;
winterface->init(worker);
@@ -1477,9 +1477,9 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
- VP9Worker *const worker = &pbi->tile_workers[n];
+ VPxWorker *const worker = &pbi->tile_workers[n];
winterface->sync(worker);
- worker->hook = (VP9WorkerHook)tile_worker_hook;
+ worker->hook = (VPxWorkerHook)tile_worker_hook;
worker->data1 = &pbi->tile_worker_data[n];
worker->data2 = &pbi->tile_worker_info[n];
}
@@ -1529,7 +1529,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
while (n < tile_cols) {
int i;
for (i = 0; i < num_workers && n < tile_cols; ++i) {
- VP9Worker *const worker = &pbi->tile_workers[i];
+ VPxWorker *const worker = &pbi->tile_workers[i];
TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
TileInfo *const tile = (TileInfo*)worker->data2;
TileBuffer *const buf = &tile_buffers[0][n];
@@ -1561,7 +1561,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
}
for (; i > 0; --i) {
- VP9Worker *const worker = &pbi->tile_workers[i - 1];
+ VPxWorker *const worker = &pbi->tile_workers[i - 1];
// TODO(jzern): The tile may have specific error data associated with
// its vpx_internal_error_info which could be propagated to the main info
// in cm. Additionally once the threads have been synced and an error is
@@ -2020,7 +2020,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
// If encoded in frame parallel mode, frame context is ready after decoding
// the frame header.
if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
- VP9Worker *const worker = pbi->frame_worker_owner;
+ VPxWorker *const worker = pbi->frame_worker_owner;
FrameWorkerData *const frame_worker_data = worker->data1;
if (cm->refresh_frame_context) {
context_updated = 1;
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index cd20c84cf..d587c8fa3 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -66,7 +66,7 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
TX_SIZE max_tx_size, vp9_reader *r) {
FRAME_COUNTS *counts = xd->counts;
- const int ctx = vp9_get_tx_size_context(xd);
+ const int ctx = get_tx_size_context(xd);
const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
int tx_size = vp9_read(r, tx_probs[0]);
if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
@@ -155,7 +155,7 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return 0; // Default for disabled segmentation
predicted_segment_id = cm->last_frame_seg_map ?
- vp9_get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0;
+ get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0;
if (!seg->update_map) {
copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 7991a39e6..cbb3266d3 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -20,6 +20,7 @@
#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
+#include "vpx_thread/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_loopfilter.h"
@@ -30,7 +31,6 @@
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_decoder.h"
@@ -118,7 +118,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
cm->error.setjmp = 0;
- vp9_get_worker_interface()->init(&pbi->lf_worker);
+ vpx_get_worker_interface()->init(&pbi->lf_worker);
return pbi;
}
@@ -126,12 +126,12 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
void vp9_decoder_remove(VP9Decoder *pbi) {
int i;
- vp9_get_worker_interface()->end(&pbi->lf_worker);
+ vpx_get_worker_interface()->end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1);
vpx_free(pbi->tile_data);
for (i = 0; i < pbi->num_tile_workers; ++i) {
- VP9Worker *const worker = &pbi->tile_workers[i];
- vp9_get_worker_interface()->end(worker);
+ VPxWorker *const worker = &pbi->tile_workers[i];
+ vpx_get_worker_interface()->end(worker);
}
vpx_free(pbi->tile_worker_data);
vpx_free(pbi->tile_worker_info);
@@ -311,7 +311,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
pbi->hold_ref_buf = 0;
if (pbi->frame_parallel_decode) {
- VP9Worker *const worker = pbi->frame_worker_owner;
+ VPxWorker *const worker = pbi->frame_worker_owner;
vp9_frameworker_lock_stats(worker);
frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
// Reset decoding progress.
@@ -325,7 +325,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
if (setjmp(cm->error.jmp)) {
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
int i;
cm->error.setjmp = 0;
@@ -387,7 +387,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
if (pbi->frame_parallel_decode) {
// Need to lock the mutex here as another thread may
// be accessing this buffer.
- VP9Worker *const worker = pbi->frame_worker_owner;
+ VPxWorker *const worker = pbi->frame_worker_owner;
FrameWorkerData *const frame_worker_data = worker->data1;
vp9_frameworker_lock_stats(worker);
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index c19f0ac3b..50c03c18e 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -15,10 +15,11 @@
#include "vpx/vpx_codec.h"
#include "vpx_scale/yv12config.h"
+#include "vpx_thread/vpx_thread.h"
+
#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"
-#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_reader.h"
@@ -56,9 +57,9 @@ typedef struct VP9Decoder {
// the same.
RefCntBuffer *cur_buf; // Current decoding frame buffer.
- VP9Worker *frame_worker_owner; // frame_worker that owns this pbi.
- VP9Worker lf_worker;
- VP9Worker *tile_workers;
+ VPxWorker *frame_worker_owner; // frame_worker that owns this pbi.
+ VPxWorker lf_worker;
+ VPxWorker *tile_workers;
TileWorkerData *tile_worker_data;
TileInfo *tile_worker_info;
int num_tile_workers;
diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c
index c22617edb..b33c3b718 100644
--- a/vp9/decoder/vp9_dsubexp.c
+++ b/vp9/decoder/vp9_dsubexp.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
+
#include "vp9/common/vp9_entropy.h"
#include "vp9/decoder/vp9_dsubexp.h"
@@ -16,7 +18,7 @@ static int inv_recenter_nonneg(int v, int m) {
if (v > 2 * m)
return v;
- return v % 2 ? m - (v + 1) / 2 : m + v / 2;
+ return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
}
static int decode_uniform(vp9_reader *r) {
@@ -27,34 +29,32 @@ static int decode_uniform(vp9_reader *r) {
}
static int inv_remap_prob(int v, int m) {
- static int inv_map_table[MAX_PROB - 1] = {
- 6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188,
- 201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10,
- 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26,
- 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
- 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59,
- 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75,
- 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91,
- 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
- 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124,
- 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140,
- 141, 142, 143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156,
- 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
- 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189,
- 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205,
- 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221,
- 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
- 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252
+ static int inv_map_table[MAX_PROB] = {
+ 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189,
+ 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76,
+ 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92,
+ 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
+ 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125,
+ 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141,
+ 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157,
+ 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
+ 174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190,
+ 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
+ 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
+ 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
+ 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 253
};
- // The clamp is not necessary for conforming VP9 stream, it is added to
- // prevent out of bound access for bad input data
- v = clamp(v, 0, 253);
+ assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
v = inv_map_table[v];
m--;
if ((m << 1) <= MAX_PROB) {
- return 1 + inv_recenter_nonneg(v + 1, m);
+ return 1 + inv_recenter_nonneg(v, m);
} else {
- return MAX_PROB - inv_recenter_nonneg(v + 1, MAX_PROB - 1 - m);
+ return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m);
}
}
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index 96a63bd9e..14a71448f 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -17,7 +17,7 @@
// #define DEBUG_THREAD
// TODO(hkuang): Clean up all the #ifdef in this file.
-void vp9_frameworker_lock_stats(VP9Worker *const worker) {
+void vp9_frameworker_lock_stats(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
FrameWorkerData *const worker_data = worker->data1;
pthread_mutex_lock(&worker_data->stats_mutex);
@@ -26,7 +26,7 @@ void vp9_frameworker_lock_stats(VP9Worker *const worker) {
#endif
}
-void vp9_frameworker_unlock_stats(VP9Worker *const worker) {
+void vp9_frameworker_unlock_stats(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
FrameWorkerData *const worker_data = worker->data1;
pthread_mutex_unlock(&worker_data->stats_mutex);
@@ -35,7 +35,7 @@ void vp9_frameworker_unlock_stats(VP9Worker *const worker) {
#endif
}
-void vp9_frameworker_signal_stats(VP9Worker *const worker) {
+void vp9_frameworker_signal_stats(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
FrameWorkerData *const worker_data = worker->data1;
@@ -59,7 +59,7 @@ void vp9_frameworker_signal_stats(VP9Worker *const worker) {
#endif
// TODO(hkuang): Remove worker parameter as it is only used in debug code.
-void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
+void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
int row) {
#if CONFIG_MULTITHREAD
if (!ref_buf)
@@ -74,7 +74,7 @@ void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
{
// Find the worker thread that owns the reference frame. If the reference
// frame has been fully decoded, it may not have owner.
- VP9Worker *const ref_worker = ref_buf->frame_worker_owner;
+ VPxWorker *const ref_worker = ref_buf->frame_worker_owner;
FrameWorkerData *const ref_worker_data =
(FrameWorkerData *)ref_worker->data1;
const VP9Decoder *const pbi = ref_worker_data->pbi;
@@ -114,7 +114,7 @@ void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
#if CONFIG_MULTITHREAD
- VP9Worker *worker = buf->frame_worker_owner;
+ VPxWorker *worker = buf->frame_worker_owner;
#ifdef DEBUG_THREAD
{
@@ -134,8 +134,8 @@ void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
#endif // CONFIG_MULTITHREAD
}
-void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
- VP9Worker *const src_worker) {
+void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
+ VPxWorker *const src_worker) {
#if CONFIG_MULTITHREAD
FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index 979cb3d8b..862880167 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -12,7 +12,7 @@
#define VP9_DECODER_VP9_DTHREAD_H_
#include "./vpx_config.h"
-#include "vp9/common/vp9_thread.h"
+#include "vpx_thread/vpx_thread.h"
#include "vpx/internal/vpx_codec_internal.h"
struct VP9Common;
@@ -44,15 +44,15 @@ typedef struct FrameWorkerData {
int frame_decoded; // Finished decoding current frame.
} FrameWorkerData;
-void vp9_frameworker_lock_stats(VP9Worker *const worker);
-void vp9_frameworker_unlock_stats(VP9Worker *const worker);
-void vp9_frameworker_signal_stats(VP9Worker *const worker);
+void vp9_frameworker_lock_stats(VPxWorker *const worker);
+void vp9_frameworker_unlock_stats(VPxWorker *const worker);
+void vp9_frameworker_signal_stats(VPxWorker *const worker);
// Wait until ref_buf has been decoded to row in real pixel unit.
// Note: worker may already finish decoding ref_buf and release it in order to
// start decoding next frame. So need to check whether worker is still decoding
// ref_buf.
-void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
+void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
int row);
// FrameWorker broadcasts its decoding progress so other workers that are
@@ -60,7 +60,7 @@ void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
// Copy necessary decoding context from src worker to dst worker.
-void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
- VP9Worker *const src_worker);
+void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
+ VPxWorker *const src_worker);
#endif // VP9_DECODER_VP9_DTHREAD_H_
diff --git a/vp9/encoder/mips/msa/vp9_variance_msa.c b/vp9/encoder/mips/msa/vp9_variance_msa.c
deleted file mode 100644
index 33fb4966b..000000000
--- a/vp9/encoder/mips/msa/vp9_variance_msa.c
+++ /dev/null
@@ -1,2011 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
-
-DECLARE_ALIGNED(256, static const int8_t, vp9_bilinear_filters_msa[15][2]) = {
- { 120, 8 },
- { 112, 16 },
- { 104, 24 },
- { 96, 32 },
- { 88, 40 },
- { 80, 48 },
- { 72, 56 },
- { 64, 64 },
- { 56, 72 },
- { 48, 80 },
- { 40, 88 },
- { 32, 96 },
- { 24, 104 },
- { 16, 112 },
- { 8, 120 }
-};
-
-#define CALC_MSE_AVG_B(src, ref, var, sub) { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- \
- sub += res_l0_m + res_l1_m; \
-}
-
-#define VARIANCE_WxH(sse, diff, shift) \
- sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
- sse - (((int64_t)diff * diff) >> shift)
-
-static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height,
- int32_t *diff) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 pred, src = { 0 };
- v16u8 ref = { 0 };
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
- src0, src1, ref0, ref1);
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src, ref, pred;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1, pred0, pred1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1, pred0, pred1;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
- src0, src1, src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
-
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
- src0, src1, src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
-
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v8i16 avg2 = { 0 };
- v8i16 avg3 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 32; ht_cnt--;) {
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
- src0, src1, src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
-
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
- src0, src1, src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- vec += __msa_hadd_s_w(avg2, avg2);
- vec += __msa_hadd_s_w(avg3, avg3);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 filt0, ref = { 0 };
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
- ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
- src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
- CALC_MSE_AVG_B(src0, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 filt0, out, ref0, ref1, ref2, ref3;
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3, const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
- out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
- CALC_MSE_AVG_B(out, ref0, var, avg);
- out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
- CALC_MSE_AVG_B(out, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v16u8 dst0, dst1, dst2, dst3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8u16 const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- out0, out1, out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
- out4, out5, out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
- PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
- src0, src1, src2, src3);
- CALC_MSE_AVG_B(src0, dst0, var, avg);
- CALC_MSE_AVG_B(src1, dst1, var, avg);
- CALC_MSE_AVG_B(src2, dst2, var, avg);
- CALC_MSE_AVG_B(src3, dst3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4, out;
- v16u8 src10_r, src32_r, src21_r, src43_r;
- v16u8 ref = { 0 };
- v16u8 src2110, src4332;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
- v8u16 tmp0, tmp1;
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
- src10_r, src21_r, src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
- vec0, vec1, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1, out2, out3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- src0 = src4;
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter_horiz,
- const int8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out, ref = { 0 };
- v16u8 filt_vt, filt_hz, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter_horiz,
- const int8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt_vt, filt_hz, vec0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter_horiz,
- const int8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- LD_UB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- CALC_MSE_AVG_B(src2, ref2, var, avg);
- CALC_MSE_AVG_B(src3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter_horiz,
- const int8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const int8_t *filter_horiz,
- const int8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 out, pred, filt0, ref = { 0 };
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
- ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
- out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 out, pred, filt0;
- v16u8 ref0, ref1, ref2, ref3;
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
- out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref0, var, avg);
- out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff,
- int32_t width) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v16u8 dst0, dst1, dst2, dst3;
- v16u8 tmp0, tmp1, tmp2, tmp3;
- v16u8 pred0, pred1, pred2, pred3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8u16 const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- dst += (4 * dst_stride);
- LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
- sec_pred += (4 * width);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- out0, out1, out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
- out4, out5, out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
- PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
- tmp0, tmp1, tmp2, tmp3);
- AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
- tmp0, tmp1, tmp2, tmp3);
-
- CALC_MSE_AVG_B(tmp0, dst0, var, avg);
- CALC_MSE_AVG_B(tmp1, dst1, var, avg);
- CALC_MSE_AVG_B(tmp2, dst2, var, avg);
- CALC_MSE_AVG_B(tmp3, dst3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height,
- &diff0[loop_cnt], 32);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height,
- &diff0[loop_cnt], 64);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 src10_r, src32_r, src21_r, src43_r;
- v16u8 out, pred, ref = { 0 };
- v16u8 src2110, src4332, filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
- v8u16 tmp0, tmp1;
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
- src10_r, src21_r, src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
-
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, filt0;
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
- vec0, vec1, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff,
- int32_t width) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1, out2, out3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
- sec_pred += (4 * width);
-
- ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- src0 = src4;
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
- out0, out1, out2, out3);
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height,
- &diff0[loop_cnt], 32);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height,
- &diff0[loop_cnt], 64);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter_horiz, const int8_t *filter_vert,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 out, pred, ref = { 0 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter_horiz, const int8_t *filter_vert,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 pred0, pred1, out0, out1;
- v16u8 filt_hz, filt_vt, vec0;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter_horiz,
- const int8_t *filter_vert,
- int32_t height,
- int32_t *diff,
- int32_t width) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v16u8 out0, out1, out2, out3;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- LD_UB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
- sec_pred += (4 * width);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
- out0, out1, out2, out3);
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter_horiz, const int8_t *filter_vert,
- int32_t height, int32_t *diff) {
- return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter_horiz, filter_vert,
- height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter_horiz, const int8_t *filter_vert,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter_horiz, filter_vert,
- height, &diff0[loop_cnt], 32);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
- const uint8_t *src, int32_t src_stride,
- const uint8_t *dst, int32_t dst_stride,
- const uint8_t *sec_pred,
- const int8_t *filter_horiz, const int8_t *filter_vert,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter_horiz, filter_vert,
- height, &diff0[loop_cnt], 64);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
-uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
- int32_t src_stride, \
- int32_t xoffset, \
- int32_t yoffset, \
- const uint8_t *ref, \
- int32_t ref_stride, \
- uint32_t *sse) { \
- int32_t diff; \
- uint32_t var; \
- const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1]; \
- const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \
- ref, ref_stride, \
- h_filter, v_filter, \
- ht, &diff); \
- } else { \
- *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \
- ref, ref_stride, \
- v_filter, ht, &diff); \
- } \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \
- ref, ref_stride, \
- h_filter, ht, &diff); \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- var = vpx_variance##wd##x##ht##_msa(src, src_stride, \
- ref, ref_stride, sse); \
- } \
- } \
- \
- return var; \
-}
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
-
-#define VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
-uint32_t vp9_sub_pixel_avg_variance##wd##x##ht##_msa( \
- const uint8_t *src_ptr, int32_t src_stride, \
- int32_t xoffset, int32_t yoffset, \
- const uint8_t *ref_ptr, int32_t ref_stride, \
- uint32_t *sse, const uint8_t *sec_pred) { \
- int32_t diff; \
- const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1]; \
- const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, h_filter, \
- v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, v_filter, \
- ht, &diff); \
- } \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, h_filter, \
- ht, &diff); \
- } else { \
- *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, ht, &diff); \
- } \
- } \
- \
- return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
-}
-
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
-
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
-
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
-
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
-
-uint32_t vp9_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- int32_t xoffset,
- int32_t yoffset,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- uint32_t *sse,
- const uint8_t *sec_pred) {
- int32_t diff;
- const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1];
- const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1];
-
- if (yoffset) {
- if (xoffset) {
- *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
- ref_ptr, ref_stride,
- sec_pred, h_filter,
- v_filter, 64, &diff);
- } else {
- *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
- ref_ptr, ref_stride,
- sec_pred, v_filter,
- 64, &diff);
- }
- } else {
- if (xoffset) {
- *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
- ref_ptr, ref_stride,
- sec_pred, h_filter,
- 64, &diff);
- } else {
- *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
- sec_pred, &diff);
- }
- }
-
- return VARIANCE_32Wx64H(*sse, diff);
-}
-
-#define VP9_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
-uint32_t vp9_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr, \
- int32_t src_stride, \
- int32_t xoffset, \
- int32_t yoffset, \
- const uint8_t *ref_ptr, \
- int32_t ref_stride, \
- uint32_t *sse, \
- const uint8_t *sec_pred) { \
- int32_t diff; \
- const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1]; \
- const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, h_filter, \
- v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, v_filter, \
- ht, &diff); \
- } \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, h_filter, \
- ht, &diff); \
- } else { \
- *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, \
- ref_ptr, ref_stride, \
- sec_pred, &diff); \
- } \
- } \
- \
- return VARIANCE_64Wx##ht##H(*sse, diff); \
-}
-
-VP9_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
-VP9_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index cb99af781..e94d43b14 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -224,7 +224,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
const uint8_t *const map = seg->update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
- mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+ mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
}
vp9_init_plane_quantizers(cpi, x);
@@ -678,7 +678,7 @@ static int choose_partitioning(VP9_COMP *cpi,
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
cm->last_frame_seg_map;
- segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
if (cyclic_refresh_segment_id_boosted(segment_id)) {
int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
@@ -1002,7 +1002,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
const uint8_t *const map = seg->update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
mi_addr->mbmi.segment_id =
- vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+ get_segment_id(cm, map, bsize, mi_row, mi_col);
}
// Else for cyclic refresh mode update the segment map, set the segment id
// and then update the quantizer.
@@ -1237,7 +1237,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
} else {
const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
- mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+ mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
}
x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == COMPLEXITY_AQ) {
@@ -1247,7 +1247,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
: cm->last_frame_seg_map;
// If segment is boosted, use rdmult for that segment.
if (cyclic_refresh_segment_id_boosted(
- vp9_get_segment_id(cm, map, bsize, mi_row, mi_col)))
+ get_segment_id(cm, map, bsize, mi_row, mi_col)))
x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
}
@@ -1698,7 +1698,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
cpi->oxcf.aq_mode == VARIANCE_AQ ) {
const uint8_t *const map = seg->update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
- mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+ mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
} else {
// Setting segmentation map for cyclic_refresh.
vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize,
@@ -2799,7 +2799,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi,
if (seg->enabled) {
const uint8_t *const map = seg->update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
- int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
}
@@ -3568,7 +3568,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi,
if (seg->enabled) {
const uint8_t *const map = seg->update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
- int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
if (seg_skip) {
partition_search_type = FIXED_PARTITION;
@@ -4198,7 +4198,7 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
if (cm->tx_mode == TX_MODE_SELECT &&
mbmi->sb_type >= BLOCK_8X8 &&
!(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
- ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
+ ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
&td->counts->tx)[mbmi->tx_size];
} else {
int x, y;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 45b5df4d1..b9b11064d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2013,11 +2013,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
#endif
for (t = 0; t < cpi->num_workers; ++t) {
- VP9Worker *const worker = &cpi->workers[t];
+ VPxWorker *const worker = &cpi->workers[t];
EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
// Deallocate allocated threads.
- vp9_get_worker_interface()->end(worker);
+ vpx_get_worker_interface()->end(worker);
// Deallocate allocated thread data.
if (t < cpi->num_workers - 1) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index afe3ae9da..8b324a77b 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -16,13 +16,13 @@
#include "./vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8cx.h"
+#include "vpx_thread/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_ppflags.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_thread.h"
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_context_tree.h"
@@ -497,7 +497,7 @@ typedef struct VP9_COMP {
// Multi-threading
int num_workers;
- VP9Worker *workers;
+ VPxWorker *workers;
struct EncWorkerData *tile_thr_data;
VP9LfSync lf_row_sync;
} VP9_COMP;
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index 4ae3fbc54..b4132d815 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -69,7 +69,7 @@ static int get_max_tile_cols(VP9_COMP *cpi) {
void vp9_encode_tiles_mt(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols);
int i;
@@ -94,7 +94,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
sizeof(*cpi->tile_thr_data)));
for (i = 0; i < allocated_workers; i++) {
- VP9Worker *const worker = &cpi->workers[i];
+ VPxWorker *const worker = &cpi->workers[i];
EncWorkerData *thread_data = &cpi->tile_thr_data[i];
++cpi->num_workers;
@@ -132,10 +132,10 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
}
for (i = 0; i < num_workers; i++) {
- VP9Worker *const worker = &cpi->workers[i];
+ VPxWorker *const worker = &cpi->workers[i];
EncWorkerData *thread_data;
- worker->hook = (VP9WorkerHook)enc_worker_hook;
+ worker->hook = (VPxWorkerHook)enc_worker_hook;
worker->data1 = &cpi->tile_thr_data[i];
worker->data2 = NULL;
thread_data = (EncWorkerData*)worker->data1;
@@ -170,7 +170,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
// Encode a frame
for (i = 0; i < num_workers; i++) {
- VP9Worker *const worker = &cpi->workers[i];
+ VPxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
// Set the starting tile for each thread.
@@ -184,12 +184,12 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
// Encoding ends.
for (i = 0; i < num_workers; i++) {
- VP9Worker *const worker = &cpi->workers[i];
+ VPxWorker *const worker = &cpi->workers[i];
winterface->sync(worker);
}
for (i = 0; i < num_workers; i++) {
- VP9Worker *const worker = &cpi->workers[i];
+ VPxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
// Accumulate counters.
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 9b15072e9..1f0d4dfee 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -129,8 +129,8 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
if (cm->frame_type != KEY_FRAME) {
const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
// Test to see if the segment id matches the predicted value.
- const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
- bsize, mi_row, mi_col);
+ const int pred_segment_id = get_segment_id(cm, cm->last_frame_seg_map,
+ bsize, mi_row, mi_col);
const int pred_flag = pred_segment_id == segment_id;
const int pred_context = vp9_get_pred_context_seg_id(xd);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 8a180387f..c257d91b4 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -51,8 +51,6 @@ VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
VP9_COMMON_SRCS-yes += common/vp9_textblit.h
-VP9_COMMON_SRCS-yes += common/vp9_thread.h
-VP9_COMMON_SRCS-yes += common/vp9_thread.c
VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 4080d64c1..ab5bcba1c 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -17,10 +17,10 @@
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
+#include "vpx_thread/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_frame_buffers.h"
-#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_decodeframe.h"
@@ -59,7 +59,7 @@ struct vpx_codec_alg_priv {
// Frame parallel related.
int frame_parallel_decode; // frame-based threading.
- VP9Worker *frame_workers;
+ VPxWorker *frame_workers;
int num_frame_workers;
int next_submit_worker_id;
int last_submit_worker_id;
@@ -112,10 +112,10 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
if (ctx->frame_workers != NULL) {
int i;
for (i = 0; i < ctx->num_frame_workers; ++i) {
- VP9Worker *const worker = &ctx->frame_workers[i];
+ VPxWorker *const worker = &ctx->frame_workers[i];
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
- vp9_get_worker_interface()->end(worker);
+ vpx_get_worker_interface()->end(worker);
vp9_remove_common(&frame_worker_data->pbi->common);
#if CONFIG_VP9_POSTPROC
vp9_free_postproc_buffers(&frame_worker_data->pbi->common);
@@ -279,7 +279,7 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
int i;
for (i = 0; i < ctx->num_frame_workers; ++i) {
- VP9Worker *const worker = &ctx->frame_workers[i];
+ VPxWorker *const worker = &ctx->frame_workers[i];
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
VP9_COMMON *const cm = &frame_worker_data->pbi->common;
BufferPool *const pool = cm->buffer_pool;
@@ -336,7 +336,7 @@ static int frame_worker_hook(void *arg1, void *arg2) {
// the compressed data.
if (frame_worker_data->result != 0 ||
frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
- VP9Worker *const worker = frame_worker_data->pbi->frame_worker_owner;
+ VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner;
BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
// Signal all the other threads that are waiting for this frame.
vp9_frameworker_lock_stats(worker);
@@ -359,7 +359,7 @@ static int frame_worker_hook(void *arg1, void *arg2) {
static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
int i;
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
ctx->last_show_frame = -1;
ctx->next_submit_worker_id = 0;
@@ -387,7 +387,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
}
#endif
- ctx->frame_workers = (VP9Worker *)
+ ctx->frame_workers = (VPxWorker *)
vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
if (ctx->frame_workers == NULL) {
set_error_detail(ctx, "Failed to allocate frame_workers");
@@ -395,7 +395,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
}
for (i = 0; i < ctx->num_frame_workers; ++i) {
- VP9Worker *const worker = &ctx->frame_workers[i];
+ VPxWorker *const worker = &ctx->frame_workers[i];
FrameWorkerData *frame_worker_data = NULL;
winterface->init(worker);
worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
@@ -435,7 +435,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
frame_worker_data->pbi->common.frame_parallel_decode =
ctx->frame_parallel_decode;
- worker->hook = (VP9WorkerHook)frame_worker_hook;
+ worker->hook = (VPxWorkerHook)frame_worker_hook;
if (!winterface->reset(worker)) {
set_error_detail(ctx, "Frame Worker thread creation failed");
return VPX_CODEC_MEM_ERROR;
@@ -464,7 +464,7 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
const uint8_t **data, unsigned int data_sz,
void *user_priv, int64_t deadline) {
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
(void)deadline;
// Determine the stream parameters. Note that we rely on peek_si to
@@ -483,7 +483,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
}
if (!ctx->frame_parallel_decode) {
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
frame_worker_data->data = *data;
frame_worker_data->data_size = data_sz;
@@ -506,7 +506,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
check_resync(ctx, frame_worker_data->pbi);
} else {
- VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
+ VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
// Copy context from last worker thread to next worker thread.
if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
@@ -554,8 +554,8 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0, 0, 0};
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
- VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+ VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
ctx->next_output_worker_id =
(ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
@@ -746,8 +746,8 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
do {
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0, 0, 0};
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
- VP9Worker *const worker =
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+ VPxWorker *const worker =
&ctx->frame_workers[ctx->next_output_worker_id];
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
@@ -819,7 +819,7 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
if (data) {
vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
YV12_BUFFER_CONFIG sd;
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
return vp9_set_reference_dec(&frame_worker_data->pbi->common,
@@ -842,7 +842,7 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
if (data) {
vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
YV12_BUFFER_CONFIG sd;
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
return vp9_copy_reference_dec(frame_worker_data->pbi,
@@ -864,7 +864,7 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
if (data) {
YV12_BUFFER_CONFIG* fb;
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
if (fb == NULL) return VPX_CODEC_ERROR;
@@ -913,7 +913,7 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
if (update_info) {
if (ctx->frame_workers) {
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
*update_info = frame_worker_data->pbi->refresh_frame_flags;
@@ -932,7 +932,7 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
if (corrupted) {
if (ctx->frame_workers) {
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
RefCntBuffer *const frame_bufs =
@@ -962,7 +962,7 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx,
if (frame_size) {
if (ctx->frame_workers) {
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
@@ -989,7 +989,7 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
if (display_size) {
if (ctx->frame_workers) {
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
@@ -1007,7 +1007,7 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
va_list args) {
unsigned int *const bit_depth = va_arg(args, unsigned int *);
- VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+ VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
if (bit_depth) {
if (worker) {
@@ -1053,7 +1053,7 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
ctx->byte_alignment = byte_alignment;
if (ctx->frame_workers) {
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
frame_worker_data->pbi->common.byte_alignment = byte_alignment;
@@ -1066,7 +1066,7 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx,
ctx->skip_loop_filter = va_arg(args, int);
if (ctx->frame_workers) {
- VP9Worker *const worker = ctx->frame_workers;
+ VPxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
}
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index dca9a6cbd..5a039accc 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -163,6 +163,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_variance_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vp9/common/vp9_thread.c b/vpx_thread/vpx_thread.c
index 1c6aec032..0bb0125bd 100644
--- a/vp9/common/vp9_thread.c
+++ b/vpx_thread/vpx_thread.c
@@ -15,12 +15,12 @@
#include <assert.h>
#include <string.h> // for memset()
-#include "./vp9_thread.h"
+#include "./vpx_thread.h"
#include "vpx_mem/vpx_mem.h"
#if CONFIG_MULTITHREAD
-struct VP9WorkerImpl {
+struct VPxWorkerImpl {
pthread_mutex_t mutex_;
pthread_cond_t condition_;
pthread_t thread_;
@@ -28,10 +28,10 @@ struct VP9WorkerImpl {
//------------------------------------------------------------------------------
-static void execute(VP9Worker *const worker); // Forward declaration.
+static void execute(VPxWorker *const worker); // Forward declaration.
static THREADFN thread_loop(void *ptr) {
- VP9Worker *const worker = (VP9Worker*)ptr;
+ VPxWorker *const worker = (VPxWorker*)ptr;
int done = 0;
while (!done) {
pthread_mutex_lock(&worker->impl_->mutex_);
@@ -52,8 +52,8 @@ static THREADFN thread_loop(void *ptr) {
}
// main thread state control
-static void change_state(VP9Worker *const worker,
- VP9WorkerStatus new_status) {
+static void change_state(VPxWorker *const worker,
+ VPxWorkerStatus new_status) {
// No-op when attempting to change state on a thread that didn't come up.
// Checking status_ without acquiring the lock first would result in a data
// race.
@@ -78,12 +78,12 @@ static void change_state(VP9Worker *const worker,
//------------------------------------------------------------------------------
-static void init(VP9Worker *const worker) {
+static void init(VPxWorker *const worker) {
memset(worker, 0, sizeof(*worker));
worker->status_ = NOT_OK;
}
-static int sync(VP9Worker *const worker) {
+static int sync(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
change_state(worker, OK);
#endif
@@ -91,12 +91,12 @@ static int sync(VP9Worker *const worker) {
return !worker->had_error;
}
-static int reset(VP9Worker *const worker) {
+static int reset(VPxWorker *const worker) {
int ok = 1;
worker->had_error = 0;
if (worker->status_ < OK) {
#if CONFIG_MULTITHREAD
- worker->impl_ = (VP9WorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
+ worker->impl_ = (VPxWorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
if (worker->impl_ == NULL) {
return 0;
}
@@ -129,13 +129,13 @@ static int reset(VP9Worker *const worker) {
return ok;
}
-static void execute(VP9Worker *const worker) {
+static void execute(VPxWorker *const worker) {
if (worker->hook != NULL) {
worker->had_error |= !worker->hook(worker->data1, worker->data2);
}
}
-static void launch(VP9Worker *const worker) {
+static void launch(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
change_state(worker, WORK);
#else
@@ -143,7 +143,7 @@ static void launch(VP9Worker *const worker) {
#endif
}
-static void end(VP9Worker *const worker) {
+static void end(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
if (worker->impl_ != NULL) {
change_state(worker, NOT_OK);
@@ -162,11 +162,11 @@ static void end(VP9Worker *const worker) {
//------------------------------------------------------------------------------
-static VP9WorkerInterface g_worker_interface = {
+static VPxWorkerInterface g_worker_interface = {
init, reset, sync, launch, execute, end
};
-int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) {
+int vpx_set_worker_interface(const VPxWorkerInterface* const winterface) {
if (winterface == NULL ||
winterface->init == NULL || winterface->reset == NULL ||
winterface->sync == NULL || winterface->launch == NULL ||
@@ -177,7 +177,7 @@ int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) {
return 1;
}
-const VP9WorkerInterface *vp9_get_worker_interface(void) {
+const VPxWorkerInterface *vpx_get_worker_interface(void) {
return &g_worker_interface;
}
diff --git a/vp9/common/vp9_thread.h b/vpx_thread/vpx_thread.h
index 12848fede..de63c4da0 100644
--- a/vp9/common/vp9_thread.h
+++ b/vpx_thread/vpx_thread.h
@@ -13,8 +13,8 @@
// http://git.chromium.org/webm/libwebp.git
// 100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38 src/utils/thread.h
-#ifndef VP9_DECODER_VP9_THREAD_H_
-#define VP9_DECODER_VP9_THREAD_H_
+#ifndef VPX_THREAD_H_
+#define VPX_THREAD_H_
#include "./vpx_config.h"
@@ -160,59 +160,59 @@ typedef enum {
NOT_OK = 0, // object is unusable
OK, // ready to work
WORK // busy finishing the current task
-} VP9WorkerStatus;
+} VPxWorkerStatus;
// Function to be called by the worker thread. Takes two opaque pointers as
// arguments (data1 and data2), and should return false in case of error.
-typedef int (*VP9WorkerHook)(void*, void*);
+typedef int (*VPxWorkerHook)(void*, void*);
// Platform-dependent implementation details for the worker.
-typedef struct VP9WorkerImpl VP9WorkerImpl;
+typedef struct VPxWorkerImpl VPxWorkerImpl;
// Synchronization object used to launch job in the worker thread
typedef struct {
- VP9WorkerImpl *impl_;
- VP9WorkerStatus status_;
- VP9WorkerHook hook; // hook to call
+ VPxWorkerImpl *impl_;
+ VPxWorkerStatus status_;
+ VPxWorkerHook hook; // hook to call
void *data1; // first argument passed to 'hook'
void *data2; // second argument passed to 'hook'
int had_error; // return value of the last call to 'hook'
-} VP9Worker;
+} VPxWorker;
// The interface for all thread-worker related functions. All these functions
// must be implemented.
typedef struct {
// Must be called first, before any other method.
- void (*init)(VP9Worker *const worker);
+ void (*init)(VPxWorker *const worker);
// Must be called to initialize the object and spawn the thread. Re-entrant.
// Will potentially launch the thread. Returns false in case of error.
- int (*reset)(VP9Worker *const worker);
+ int (*reset)(VPxWorker *const worker);
// Makes sure the previous work is finished. Returns true if worker->had_error
// was not set and no error condition was triggered by the working thread.
- int (*sync)(VP9Worker *const worker);
+ int (*sync)(VPxWorker *const worker);
// Triggers the thread to call hook() with data1 and data2 arguments. These
// hook/data1/data2 values can be changed at any time before calling this
// function, but not be changed afterward until the next call to Sync().
- void (*launch)(VP9Worker *const worker);
+ void (*launch)(VPxWorker *const worker);
// This function is similar to launch() except that it calls the
// hook directly instead of using a thread. Convenient to bypass the thread
- // mechanism while still using the VP9Worker structs. sync() must
+ // mechanism while still using the VPxWorker structs. sync() must
// still be called afterward (for error reporting).
- void (*execute)(VP9Worker *const worker);
+ void (*execute)(VPxWorker *const worker);
// Kill the thread and terminate the object. To use the object again, one
// must call reset() again.
- void (*end)(VP9Worker *const worker);
-} VP9WorkerInterface;
+ void (*end)(VPxWorker *const worker);
+} VPxWorkerInterface;
// Install a new set of threading functions, overriding the defaults. This
// should be done before any workers are started, i.e., before any encoding or
// decoding takes place. The contents of the interface struct are copied, it
// is safe to free the corresponding memory after this call. This function is
// not thread-safe. Return false in case of invalid pointer or methods.
-int vp9_set_worker_interface(const VP9WorkerInterface *const winterface);
+int vpx_set_worker_interface(const VPxWorkerInterface *const winterface);
// Retrieve the currently set thread worker interface.
-const VP9WorkerInterface *vp9_get_worker_interface(void);
+const VPxWorkerInterface *vpx_get_worker_interface(void);
//------------------------------------------------------------------------------
@@ -220,4 +220,4 @@ const VP9WorkerInterface *vp9_get_worker_interface(void);
} // extern "C"
#endif
-#endif // VP9_DECODER_VP9_THREAD_H_
+#endif // VPX_THREAD_H_
diff --git a/vpx_thread/vpx_thread.mk b/vpx_thread/vpx_thread.mk
new file mode 100644
index 000000000..0a4a3648a
--- /dev/null
+++ b/vpx_thread/vpx_thread.mk
@@ -0,0 +1,13 @@
+##
+## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+THREAD_SRCS-yes += vpx_thread.mk
+THREAD_SRCS-yes += vpx_thread.c
+THREAD_SRCS-yes += vpx_thread.h