summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohann Koenig <johannkoenig@google.com>2022-03-31 00:45:31 +0000
committerGerrit Code Review <noreply-gerritcodereview@google.com>2022-03-31 00:45:31 +0000
commit6d1844e54d132c6c2078f529b511ab443bc910ac (patch)
treedc74fd5cadbd3164d16f6bd37e1252a02fbe4e6a
parent2200039d33c49a9f7a5c438656df143755b022c4 (diff)
parentafd60bd07d41e5d20a0b11eeeb104846d9517c65 (diff)
downloadlibvpx-6d1844e54d132c6c2078f529b511ab443bc910ac.tar
libvpx-6d1844e54d132c6c2078f529b511ab443bc910ac.tar.gz
libvpx-6d1844e54d132c6c2078f529b511ab443bc910ac.tar.bz2
libvpx-6d1844e54d132c6c2078f529b511ab443bc910ac.zip
Merge "remove sad x3,x8 specializations" into main
-rw-r--r--test/sad_test.cc68
-rw-r--r--vp8/common/rtcd_defs.pl5
-rw-r--r--vp8/encoder/mcomp.c276
-rw-r--r--vp8/encoder/mcomp.h8
-rw-r--r--vp8/encoder/onyx_if.c11
-rw-r--r--vp8/encoder/onyx_int.h1
-rw-r--r--vp8/encoder/rdopt.c4
-rw-r--r--vp9/encoder/vp9_encoder.c50
-rw-r--r--vp9/encoder/vp9_mcomp.c23
-rw-r--r--vp9/encoder/vp9_mcomp.h10
-rw-r--r--vpx_dsp/mips/sad_mmi.c23
-rw-r--r--vpx_dsp/mips/sad_msa.c426
-rw-r--r--vpx_dsp/sad.c41
-rw-r--r--vpx_dsp/variance.h3
-rw-r--r--vpx_dsp/vpx_dsp.mk3
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl38
-rw-r--r--vpx_dsp/x86/sad4d_avx2.c57
-rw-r--r--vpx_dsp/x86/sad_sse3.asm376
-rw-r--r--vpx_dsp/x86/sad_sse4.asm361
-rw-r--r--vpx_dsp/x86/sad_ssse3.asm372
20 files changed, 33 insertions, 2123 deletions
diff --git a/test/sad_test.cc b/test/sad_test.cc
index ee10a4638..560c5f382 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -56,8 +56,6 @@ typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sad_array);
-typedef TestParams<SadMxNx8Func> SadMxNx8Param;
-
using libvpx_test::ACMRandom;
namespace {
@@ -266,30 +264,6 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
ParamType params_;
};
-class SADx8Test : public SADTestBase<SadMxNx8Param> {
- public:
- SADx8Test() : SADTestBase(GetParam()) {}
-
- protected:
- void SADs(unsigned int *results) const {
- const uint8_t *reference = GetReferenceFromOffset(0);
-
- ASM_REGISTER_STATE_CHECK(params_.func(
- source_data_, source_stride_, reference, reference_stride_, results));
- }
-
- void CheckSADs() const {
- uint32_t reference_sad;
- DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[8]);
-
- SADs(exp_sad);
- for (int offset = 0; offset < 8; ++offset) {
- reference_sad = ReferenceSAD(offset);
- EXPECT_EQ(reference_sad, exp_sad[offset]) << "offset " << offset;
- }
- }
-};
-
class SADx4Test : public SADTestBase<SadMxNx4Param> {
public:
SADx4Test() : SADTestBase(GetParam()) {}
@@ -564,13 +538,6 @@ TEST_P(SADx4Test, DISABLED_Speed) {
reference_stride_ = tmp_stride;
}
-TEST_P(SADx8Test, Regular) {
- FillRandomWH(source_data_, source_stride_, params_.width, params_.height);
- FillRandomWH(GetReferenceFromOffset(0), reference_stride_, params_.width + 8,
- params_.height);
- CheckSADs();
-}
-
//------------------------------------------------------------------------------
// C functions
const SadMxNParam c_tests[] = {
@@ -747,24 +714,6 @@ const SadMxNx4Param x4d_c_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
-// TODO(angiebird): implement the marked-down sad functions
-const SadMxNx8Param x8_c_tests[] = {
- // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c),
- // SadMxNx8Param(64, 32, &vpx_sad64x32x8_c),
- // SadMxNx8Param(32, 64, &vpx_sad32x64x8_c),
- SadMxNx8Param(32, 32, &vpx_sad32x32x8_c),
- // SadMxNx8Param(32, 16, &vpx_sad32x16x8_c),
- // SadMxNx8Param(16, 32, &vpx_sad16x32x8_c),
- SadMxNx8Param(16, 16, &vpx_sad16x16x8_c),
- SadMxNx8Param(16, 8, &vpx_sad16x8x8_c),
- SadMxNx8Param(8, 16, &vpx_sad8x16x8_c),
- SadMxNx8Param(8, 8, &vpx_sad8x8x8_c),
- // SadMxNx8Param(8, 4, &vpx_sad8x4x8_c),
- // SadMxNx8Param(4, 8, &vpx_sad4x8x8_c),
- SadMxNx8Param(4, 4, &vpx_sad4x4x8_c),
-};
-INSTANTIATE_TEST_SUITE_P(C, SADx8Test, ::testing::ValuesIn(x8_c_tests));
-
//------------------------------------------------------------------------------
// ARM functions
#if HAVE_NEON
@@ -992,18 +941,6 @@ INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
// Only functions are x3, which do not have tests.
#endif // HAVE_SSSE3
-#if HAVE_SSE4_1
-const SadMxNx8Param x8_sse4_1_tests[] = {
- SadMxNx8Param(16, 16, &vpx_sad16x16x8_sse4_1),
- SadMxNx8Param(16, 8, &vpx_sad16x8x8_sse4_1),
- SadMxNx8Param(8, 16, &vpx_sad8x16x8_sse4_1),
- SadMxNx8Param(8, 8, &vpx_sad8x8x8_sse4_1),
- SadMxNx8Param(4, 4, &vpx_sad4x4x8_sse4_1),
-};
-INSTANTIATE_TEST_SUITE_P(SSE4_1, SADx8Test,
- ::testing::ValuesIn(x8_sse4_1_tests));
-#endif // HAVE_SSE4_1
-
#if HAVE_AVX2
const SadMxNParam avx2_tests[] = {
SadMxNParam(64, 64, &vpx_sad64x64_avx2),
@@ -1029,11 +966,6 @@ const SadMxNx4Param x4d_avx2_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
-const SadMxNx8Param x8_avx2_tests[] = {
- // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c),
- SadMxNx8Param(32, 32, &vpx_sad32x32x8_avx2),
-};
-INSTANTIATE_TEST_SUITE_P(AVX2, SADx8Test, ::testing::ValuesIn(x8_avx2_tests));
#endif // HAVE_AVX2
#if HAVE_AVX512
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 32601b4eb..c7911032f 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -216,11 +216,6 @@ specialize qw/vp8_mbuverror sse2 msa/;
#
# Motion search
#
-add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_full_search_sad sse3 sse4_1/;
-$vp8_full_search_sad_sse3=vp8_full_search_sadx3;
-$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
-
add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
specialize qw/vp8_refining_search_sad sse2 msa/;
$vp8_refining_search_sad_sse2=vp8_refining_search_sadx4;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 4ab6c7b3d..769c2f558 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1280,10 +1280,10 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
}
#endif // HAVE_SSE2 || HAVE_MSA
-int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
- int sad_per_bit, int distance,
- vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
- int_mv *center_mv) {
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv) {
unsigned char *what = (*(b->base_src) + b->src);
int what_stride = b->src_stride;
unsigned char *in_what;
@@ -1327,217 +1327,6 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
- /* Apply further limits to prevent us looking using vectors that
- * stretch beyiond the UMV border
- */
- if (col_min < x->mv_col_min) col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max) col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min) row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max) row_max = x->mv_row_max;
-
- for (r = row_min; r < row_max; ++r) {
- this_mv.as_mv.row = r;
- check_here = r * mv_stride + in_what + col_min;
-
- for (c = col_min; c < col_max; ++c) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
-
- check_here++;
- }
- }
-
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
- return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
- mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-}
-
-#if HAVE_SSSE3
-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
- int sad_per_bit, int distance,
- vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
- int_mv *center_mv) {
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int pre_stride = x->e_mbd.pre.y_stride;
- unsigned char *base_pre = x->e_mbd.pre.y_buffer;
- int in_what_stride = pre_stride;
- int mv_stride = pre_stride;
- unsigned char *bestaddress;
- int_mv *best_mv = &d->bmi.mv;
- int_mv this_mv;
- unsigned int bestsad;
- unsigned int thissad;
- int r, c;
-
- unsigned char *check_here;
-
- int ref_row = ref_mv->as_mv.row;
- int ref_col = ref_mv->as_mv.col;
-
- int row_min = ref_row - distance;
- int row_max = ref_row + distance;
- int col_min = ref_col - distance;
- int col_max = ref_col + distance;
-
- unsigned int sad_array[3];
-
- int *mvsadcost[2];
- int_mv fcenter_mv;
-
- mvsadcost[0] = x->mvsadcost[0];
- mvsadcost[1] = x->mvsadcost[1];
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- /* Work out the mid point for the search */
- in_what = base_pre + d->offset;
- bestaddress = in_what + (ref_row * pre_stride) + ref_col;
-
- best_mv->as_mv.row = ref_row;
- best_mv->as_mv.col = ref_col;
-
- /* Baseline value at the centre */
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
- mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
- /* Apply further limits to prevent us looking using vectors that stretch
- * beyond the UMV border
- */
- if (col_min < x->mv_col_min) col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max) col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min) row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max) row_max = x->mv_row_max;
-
- for (r = row_min; r < row_max; ++r) {
- this_mv.as_mv.row = r;
- check_here = r * mv_stride + in_what + col_min;
- c = col_min;
-
- while ((c + 2) < col_max) {
- int i;
-
- fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
- for (i = 0; i < 3; ++i) {
- thissad = sad_array[i];
-
- if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad +=
- mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
- }
-
- while (c < col_max) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-
- if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad +=
- mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
- }
-
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
- return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
- mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-}
-#endif // HAVE_SSSE3
-
-#if HAVE_SSE4_1
-int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
- int sad_per_bit, int distance,
- vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
- int_mv *center_mv) {
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- int pre_stride = x->e_mbd.pre.y_stride;
- unsigned char *base_pre = x->e_mbd.pre.y_buffer;
- unsigned char *in_what;
- int in_what_stride = pre_stride;
- int mv_stride = pre_stride;
- unsigned char *bestaddress;
- int_mv *best_mv = &d->bmi.mv;
- int_mv this_mv;
- unsigned int bestsad;
- unsigned int thissad;
- int r, c;
-
- unsigned char *check_here;
-
- int ref_row = ref_mv->as_mv.row;
- int ref_col = ref_mv->as_mv.col;
-
- int row_min = ref_row - distance;
- int row_max = ref_row + distance;
- int col_min = ref_col - distance;
- int col_max = ref_col + distance;
-
- DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
- unsigned int sad_array[3];
-
- int *mvsadcost[2];
- int_mv fcenter_mv;
-
- mvsadcost[0] = x->mvsadcost[0];
- mvsadcost[1] = x->mvsadcost[1];
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- /* Work out the mid point for the search */
- in_what = base_pre + d->offset;
- bestaddress = in_what + (ref_row * pre_stride) + ref_col;
-
- best_mv->as_mv.row = ref_row;
- best_mv->as_mv.col = ref_col;
-
- /* Baseline value at the centre */
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
- mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
/* Apply further limits to prevent us looking using vectors that stretch
* beyond the UMV border
*/
@@ -1552,61 +1341,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
for (r = row_min; r < row_max; ++r) {
this_mv.as_mv.row = r;
check_here = r * mv_stride + in_what + col_min;
- c = col_min;
-
- while ((c + 7) < col_max) {
- int i;
-
- fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
-
- for (i = 0; i < 8; ++i) {
- thissad = sad_array8[i];
-
- if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad +=
- mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
- }
-
- while ((c + 2) < col_max) {
- int i;
-
- fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
- for (i = 0; i < 3; ++i) {
- thissad = sad_array[i];
- if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad +=
- mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
- }
-
- while (c < col_max) {
+ for (c = col_min; c < col_max; ++c) {
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad) {
@@ -1623,7 +1359,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
}
check_here++;
- c++;
}
}
@@ -1633,7 +1368,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
}
-#endif // HAVE_SSE4_1
int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int_mv *ref_mv, int error_per_bit,
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 57c18f523..1ee6fe5dd 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -50,10 +50,10 @@ fractional_mv_step_fp vp8_find_best_sub_pixel_step;
fractional_mv_step_fp vp8_find_best_half_pixel_step;
fractional_mv_step_fp vp8_skip_fractional_mv_step;
-typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
- int_mv *ref_mv, int sad_per_bit,
- int distance, vp8_variance_fn_ptr_t *fn_ptr,
- int *mvcost[2], int_mv *center_mv);
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv);
typedef int (*vp8_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int_mv *ref_mv, int sad_per_bit,
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index f09177c7f..ffb3867dd 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2012,36 +2012,26 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
- cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
- cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
- cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
- cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
- cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
- cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
- cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
- cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
- cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
- cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;
#if VPX_ARCH_X86 || VPX_ARCH_X86_64
@@ -2052,7 +2042,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn;
#endif
- cpi->full_search_sad = vp8_full_search_sad;
cpi->diamond_search_sad = vp8_diamond_search_sad;
cpi->refining_search_sad = vp8_refining_search_sad;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 7f8298e44..424f51b18 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -549,7 +549,6 @@ typedef struct VP8_COMP {
unsigned char *partition_d_end[MAX_PARTITIONS];
fractional_mv_step_fp *find_fractional_mv_step;
- vp8_full_search_fn_t full_search_sad;
vp8_refining_search_fn_t refining_search_sad;
vp8_diamond_search_fn_t diamond_search_sad;
vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 79a858e43..5821fc734 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1097,8 +1097,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min,
x->mv_row_max);
- thissme = cpi->full_search_sad(x, c, e, &mvp_full, sadpb, 16,
- v_fn_ptr, x->mvcost, bsi->ref_mv);
+ thissme = vp8_full_search_sad(x, c, e, &mvp_full, sadpb, 16,
+ v_fn_ptr, x->mvcost, bsi->ref_mv);
if (thissme < bestsme) {
bestsme = thissme;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 97805fc16..84ab80fe3 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1569,15 +1569,13 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) {
}
#if CONFIG_VP9_HIGHBITDEPTH
-// TODO(angiebird): make sdx8f available for highbitdepth if needed
#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
cpi->fn_ptr[BT].sdf = SDF; \
cpi->fn_ptr[BT].sdaf = SDAF; \
cpi->fn_ptr[BT].vf = VF; \
cpi->fn_ptr[BT].svf = SVF; \
cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx4df = SDX4DF; \
- cpi->fn_ptr[BT].sdx8f = NULL;
+ cpi->fn_ptr[BT].sdx4df = SDX4DF;
#define MAKE_BFP_SAD_WRAPPER(fnname) \
static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
@@ -2561,67 +2559,61 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff)));
cpi->source_var_thresh = 0;
cpi->frames_till_next_var_check = 0;
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX8F) \
- cpi->fn_ptr[BT].sdf = SDF; \
- cpi->fn_ptr[BT].sdaf = SDAF; \
- cpi->fn_ptr[BT].vf = VF; \
- cpi->fn_ptr[BT].svf = SVF; \
- cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx4df = SDX4DF; \
- cpi->fn_ptr[BT].sdx8f = SDX8F;
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF;
- // TODO(angiebird): make sdx8f available for every block size
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16,
- vpx_sad32x16x4d, NULL)
+ vpx_sad32x16x4d)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32,
- vpx_sad16x32x4d, NULL)
+ vpx_sad16x32x4d)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32,
- vpx_sad64x32x4d, NULL)
+ vpx_sad64x32x4d)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64,
- vpx_sad32x64x4d, NULL)
+ vpx_sad32x64x4d)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
- vpx_sad32x32x4d, vpx_sad32x32x8)
+ vpx_sad32x32x4d)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
- vpx_sad64x64x4d, NULL)
+ vpx_sad64x64x4d)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
- vpx_sad16x16x4d, vpx_sad16x16x8)
+ vpx_sad16x16x4d)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8,
- vpx_sad16x8x4d, vpx_sad16x8x8)
+ vpx_sad16x8x4d)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16,
- vpx_sad8x16x4d, vpx_sad8x16x8)
+ vpx_sad8x16x4d)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
- vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
- vpx_sad8x8x8)
+ vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
- vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
- NULL)
+ vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
- vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
- NULL)
+ vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
- vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
- vpx_sad4x4x8)
+ vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index cd6706420..1f08aa5de 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1796,29 +1796,6 @@ static int64_t exhaustive_mesh_search_single_step(
end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
for (r = start_row; r <= end_row; r += 1) {
c = start_col;
- // sdx8f may not be available some block size
- if (fn_ptr->sdx8f) {
- while (c + 7 <= end_col) {
- unsigned int sads[8];
- const MV mv = { r, c };
- const uint8_t *buf = get_buf_from_mv(pre, &mv);
- fn_ptr->sdx8f(src->buf, src->stride, buf, pre->stride, sads);
-
- for (i = 0; i < 8; ++i) {
- int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
- if (sad < best_sad) {
- const MV mv = { r, c + i };
- sad += lambda *
- vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
- if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
- }
- }
- }
- c += 8;
- }
- }
while (c + 3 <= end_col) {
unsigned int sads[4];
const uint8_t *addrs[4];
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 0c4d8f23c..bdaf2ce77 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -93,16 +93,6 @@ extern fractional_mv_step_fp vp9_skip_sub_pixel_tree;
extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv;
extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv;
-typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv,
- int sad_per_bit, int distance,
- const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv, MV *best_mv);
-
-typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv,
- int sad_per_bit, int distance,
- const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv);
-
typedef int (*vp9_diamond_search_fn_t)(
const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
int search_param, int sad_per_bit, int *num00,
diff --git a/vpx_dsp/mips/sad_mmi.c b/vpx_dsp/mips/sad_mmi.c
index eaca4773f..7f5882bca 100644
--- a/vpx_dsp/mips/sad_mmi.c
+++ b/vpx_dsp/mips/sad_mmi.c
@@ -334,19 +334,6 @@
"paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
#endif /* _MIPS_SIM == _ABIO32 */
-// depending on call sites, pass **ref_array to avoid & in subsequent call and
-// de-dup with 4D below.
-#define sadMxNxK_mmi(m, n, k) \
- void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride, \
- const uint8_t *ref_array, int ref_stride, \
- uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < (k); ++i) \
- sad_array[i] = \
- vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \
- }
-
-// This appears to be equivalent to the above when k == 4 and refs is const
#define sadMxNx4D_mmi(m, n) \
void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[], \
@@ -583,10 +570,6 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
vpx_sad16xN(32);
vpx_sad16xN(16);
vpx_sad16xN(8);
-sadMxNxK_mmi(16, 16, 3);
-sadMxNxK_mmi(16, 16, 8);
-sadMxNxK_mmi(16, 8, 3);
-sadMxNxK_mmi(16, 8, 8);
sadMxNx4D_mmi(16, 32);
sadMxNx4D_mmi(16, 16);
sadMxNx4D_mmi(16, 8);
@@ -681,10 +664,6 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
vpx_sad8xN(16);
vpx_sad8xN(8);
vpx_sad8xN(4);
-sadMxNxK_mmi(8, 16, 3);
-sadMxNxK_mmi(8, 16, 8);
-sadMxNxK_mmi(8, 8, 3);
-sadMxNxK_mmi(8, 8, 8);
sadMxNx4D_mmi(8, 16);
sadMxNx4D_mmi(8, 8);
sadMxNx4D_mmi(8, 4);
@@ -777,8 +756,6 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
vpx_sad4xN(8);
vpx_sad4xN(4);
-sadMxNxK_mmi(4, 4, 3);
-sadMxNxK_mmi(4, 4, 8);
sadMxNx4D_mmi(4, 8);
sadMxNx4D_mmi(4, 4);
diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c
index e3e91c433..b0f8ff1fd 100644
--- a/vpx_dsp/mips/sad_msa.c
+++ b/vpx_dsp/mips/sad_msa.c
@@ -159,380 +159,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
return sad;
}
-static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v16u8 ref0, ref1, ref2, ref3, diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- INSERT_W4_UB(src0, src1, src2, src3, src);
-
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
- ref += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
- ref0, ref1);
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src, ref, ref0, ref1, diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
-
- for (ht_cnt = (height >> 1); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3, diff;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
- v8u16 sad4 = { 0 };
- v8u16 sad5 = { 0 };
- v8u16 sad6 = { 0 };
- v8u16 sad7 = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- INSERT_W4_UB(src0, src1, src2, src3, src);
- src_ptr += (4 * src_stride);
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad3 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad4 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad5 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad6 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad7 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
- sad_array[4] = HADD_UH_U32(sad4);
- sad_array[5] = HADD_UH_U32(sad5);
- sad_array[6] = HADD_UH_U32(sad6);
- sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
- v8u16 sad4 = { 0 };
- v8u16 sad5 = { 0 };
- v8u16 sad6 = { 0 };
- v8u16 sad7 = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
- ref += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
- ref0, ref1);
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
- sad_array[4] = HADD_UH_U32(sad4);
- sad_array[5] = HADD_UH_U32(sad5);
- sad_array[6] = HADD_UH_U32(sad6);
- sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src, ref0, ref1, ref;
- v16u8 diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
- v8u16 sad4 = { 0 };
- v8u16 sad5 = { 0 };
- v8u16 sad6 = { 0 };
- v8u16 sad7 = { 0 };
-
- for (ht_cnt = (height >> 1); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
- diff = __msa_asub_u_b(src, ref);
- sad3 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
- diff = __msa_asub_u_b(src, ref);
- sad4 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
- diff = __msa_asub_u_b(src, ref);
- sad5 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
- diff = __msa_asub_u_b(src, ref);
- sad6 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
- diff = __msa_asub_u_b(src, ref);
- sad7 += __msa_hadd_u_h(diff, diff);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
- diff = __msa_asub_u_b(src, ref);
- sad3 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
- diff = __msa_asub_u_b(src, ref);
- sad4 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
- diff = __msa_asub_u_b(src, ref);
- sad5 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
- diff = __msa_asub_u_b(src, ref);
- sad6 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
- diff = __msa_asub_u_b(src, ref);
- sad7 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
- sad_array[4] = HADD_UH_U32(sad4);
- sad_array[5] = HADD_UH_U32(sad5);
- sad_array[6] = HADD_UH_U32(sad6);
- sad_array[7] = HADD_UH_U32(sad7);
-}
-
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
const uint8_t *const aref_ptr[],
int32_t ref_stride, int32_t height,
@@ -1037,48 +663,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
}
-#define VPX_SAD_4xHEIGHTx3_MSA(height) \
- void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t sads[3]) { \
- sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define VPX_SAD_8xHEIGHTx3_MSA(height) \
- void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t sads[3]) { \
- sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define VPX_SAD_16xHEIGHTx3_MSA(height) \
- void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t sads[3]) { \
- sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define VPX_SAD_4xHEIGHTx8_MSA(height) \
- void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t sads[8]) { \
- sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define VPX_SAD_8xHEIGHTx8_MSA(height) \
- void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t sads[8]) { \
- sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define VPX_SAD_16xHEIGHTx8_MSA(height) \
- void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t sads[8]) { \
- sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
#define VPX_SAD_4xHEIGHTx4D_MSA(height) \
void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *const refs[4], \
@@ -1186,29 +770,21 @@ VPX_AVGSAD_16xHEIGHT_MSA(32);
// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
-VPX_SAD_16xHEIGHTx3_MSA(16);
-VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);
// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
-VPX_SAD_16xHEIGHTx3_MSA(8);
-VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);
// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
-VPX_SAD_8xHEIGHTx3_MSA(16);
-VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);
// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
-VPX_SAD_8xHEIGHTx3_MSA(8);
-VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);
@@ -1224,7 +800,5 @@ VPX_AVGSAD_4xHEIGHT_MSA(8);
// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
-VPX_SAD_4xHEIGHTx3_MSA(4);
-VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 46d513b68..b47c43430 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -45,35 +45,7 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
return sad(src_ptr, src_stride, comp_pred, m, m, n); \
}
-// Compare |src_ptr| to |k| adjacent blocks starting at |ref_ptr|.
-// |k| == {3,8}. Used in vp8 for an exhaustive search.
-// src: ref:
-// 0 1 2 3 0 1 2 3 x x
-// 4 5 6 7 6 7 8 9 x x
-// 8 9 10 11 12 13 14 15 x x
-// 12 13 14 15 18 19 20 21 x x
-//
-// x 1 2 3 4 x
-// x 7 8 9 10 x
-// x 13 14 15 16 x
-// x 19 20 21 22 x
-//
-// x x 2 3 4 5
-// x x 8 9 10 11
-// x x 14 15 16 17
-// x x 20 21 22 23
-//
-#define sadMxNxK(m, n, k) \
- void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- uint32_t sad_array[k]) { \
- int i; \
- for (i = 0; i < k; ++i) \
- sad_array[i] = \
- vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_ptr + i, ref_stride); \
- }
-
-// Compare |src_ptr| to 4 distinct references in |ref_array[]|
+// Compare |src_ptr| to 4 distinct references in |ref_array[4]|
#define sadMxNx4D(m, n) \
void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
const uint8_t *const ref_array[4], \
@@ -99,7 +71,6 @@ sadMxNx4D(32, 64)
// 32x32
sadMxN(32, 32)
-sadMxNxK(32, 32, 8)
sadMxNx4D(32, 32)
// 32x16
@@ -112,26 +83,18 @@ sadMxNx4D(16, 32)
// 16x16
sadMxN(16, 16)
-sadMxNxK(16, 16, 3)
-sadMxNxK(16, 16, 8)
sadMxNx4D(16, 16)
// 16x8
sadMxN(16, 8)
-sadMxNxK(16, 8, 3)
-sadMxNxK(16, 8, 8)
sadMxNx4D(16, 8)
// 8x16
sadMxN(8, 16)
-sadMxNxK(8, 16, 3)
-sadMxNxK(8, 16, 8)
sadMxNx4D(8, 16)
// 8x8
sadMxN(8, 8)
-sadMxNxK(8, 8, 3)
-sadMxNxK(8, 8, 8)
sadMxNx4D(8, 8)
// 8x4
@@ -144,8 +107,6 @@ sadMxNx4D(4, 8)
// 4x4
sadMxN(4, 4)
-sadMxNxK(4, 4, 3)
-sadMxNxK(4, 4, 8)
sadMxNx4D(4, 4)
/* clang-format on */
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index f8b44f03d..755cb907d 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -59,8 +59,6 @@ typedef struct variance_vtable {
vpx_sad_fn_t sdf;
vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
- vpx_sad_multi_fn_t sdx3f;
- vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
#if VPX_ARCH_X86 || VPX_ARCH_X86_64
vp8_copy32xn_fn_t copymem;
@@ -76,7 +74,6 @@ typedef struct vp9_variance_vtable {
vpx_subpixvariance_fn_t svf;
vpx_subp_avg_variance_fn_t svaf;
vpx_sad_multi_d_fn_t sdx4df;
- vpx_sad_multi_fn_t sdx8f;
} vp9_variance_fn_ptr_t;
#endif // CONFIG_VP9
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index b930fbd0a..bf348c112 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -355,9 +355,6 @@ DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c
DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c
-DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm
-DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 63097b0b6..73f28ff92 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -866,44 +866,6 @@ add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stri
specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/;
#
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-#
-# Blocks of 3
-add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
-specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/;
-
-add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
-specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/;
-
-add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
-specialize qw/vpx_sad8x16x3 sse3 msa mmi/;
-
-add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
-specialize qw/vpx_sad8x8x3 sse3 msa mmi/;
-
-add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
-specialize qw/vpx_sad4x4x3 sse3 msa mmi/;
-
-# Blocks of 8
-add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
-specialize qw/vpx_sad32x32x8 avx2/;
-
-add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
-specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/;
-
-add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
-specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/;
-
-add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
-specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/;
-
-add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
-specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/;
-
-add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
-specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/;
-
-#
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index 81f1a916f..399b67b3f 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -73,63 +73,6 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums, sad_array);
}
-void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t sad_array[8]) {
- int i;
- __m256i sums[8];
-
- sums[0] = _mm256_setzero_si256();
- sums[1] = _mm256_setzero_si256();
- sums[2] = _mm256_setzero_si256();
- sums[3] = _mm256_setzero_si256();
- sums[4] = _mm256_setzero_si256();
- sums[5] = _mm256_setzero_si256();
- sums[6] = _mm256_setzero_si256();
- sums[7] = _mm256_setzero_si256();
-
- for (i = 0; i < 32; i++) {
- __m256i r[8];
-
- // load src and all ref[]
- const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
- r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]);
- r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]);
- r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]);
- r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]);
- r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]);
- r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]);
- r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]);
- r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]);
-
- // sum of the absolute differences between every ref[] to src
- r[0] = _mm256_sad_epu8(r[0], s);
- r[1] = _mm256_sad_epu8(r[1], s);
- r[2] = _mm256_sad_epu8(r[2], s);
- r[3] = _mm256_sad_epu8(r[3], s);
- r[4] = _mm256_sad_epu8(r[4], s);
- r[5] = _mm256_sad_epu8(r[5], s);
- r[6] = _mm256_sad_epu8(r[6], s);
- r[7] = _mm256_sad_epu8(r[7], s);
-
- // sum every ref[]
- sums[0] = _mm256_add_epi32(sums[0], r[0]);
- sums[1] = _mm256_add_epi32(sums[1], r[1]);
- sums[2] = _mm256_add_epi32(sums[2], r[2]);
- sums[3] = _mm256_add_epi32(sums[3], r[3]);
- sums[4] = _mm256_add_epi32(sums[4], r[4]);
- sums[5] = _mm256_add_epi32(sums[5], r[5]);
- sums[6] = _mm256_add_epi32(sums[6], r[6]);
- sums[7] = _mm256_add_epi32(sums[7], r[7]);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- calc_final_4(sums, sad_array);
- calc_final_4(sums + 4, sad_array + 4);
-}
-
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
diff --git a/vpx_dsp/x86/sad_sse3.asm b/vpx_dsp/x86/sad_sse3.asm
deleted file mode 100644
index acbd2e4fa..000000000
--- a/vpx_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,376 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define ref_ptr rdi
- %define ref_stride rdx
- %define end_ptr rcx
- %define ret_var rbx
- %define result_ptr arg(4)
- %define height dword ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- mov rsi, arg(0) ; src_ptr
- mov rdi, arg(2) ; ref_ptr
-
- movsxd rax, dword ptr arg(1) ; src_stride
- movsxd rdx, dword ptr arg(3) ; ref_stride
-%else
- %if LIBVPX_YASM_WIN64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define ref_ptr r8
- %define ref_stride r9
- %define end_ptr r10
- %define ret_var r11
- %define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define height dword ptr [rsp+xmm_stack_space+8+4*8]
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define ref_ptr rdx
- %define ref_stride rcx
- %define end_ptr r9
- %define ret_var r10
- %define result_ptr r8
- %define height r8
- %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
- %define src_ptr
- %define src_stride
- %define ref_ptr
- %define ref_stride
- %define end_ptr
- %define ret_var
- %define result_ptr
- %define height
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %if LIBVPX_YASM_WIN64
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm5, XMMWORD PTR [%3]
- lddqu xmm6, XMMWORD PTR [%3+1]
- lddqu xmm7, XMMWORD PTR [%3+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%3+1]
- lddqu xmm3, XMMWORD PTR [%3+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%4]
- lddqu xmm1, XMMWORD PTR [%3+%5]
- lddqu xmm2, XMMWORD PTR [%3+%5+1]
- lddqu xmm3, XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm5, QWORD PTR [%3]
- movq mm6, QWORD PTR [%3+1]
- movq mm7, QWORD PTR [%3+2]
-
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%3+1]
- movq mm3, QWORD PTR [%3+2]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endif
- movq mm0, QWORD PTR [%2+%4]
- movq mm1, QWORD PTR [%3+%5]
- movq mm2, QWORD PTR [%3+%5+1]
- movq mm3, QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x16x3_sse3)
-sym(vpx_sad16x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad16x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x8x3_sse3)
-sym(vpx_sad16x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad8x16x3_sse3)
-sym(vpx_sad8x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad8x8x3_sse3)
-sym(vpx_sad8x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad4x4x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad4x4x3_sse3)
-sym(vpx_sad4x4x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [ref_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [ref_ptr+1]
- movd mm5, DWORD PTR [ref_ptr+2]
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- psadbw mm4, mm0
- psadbw mm5, mm0
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [ref_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm6, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm6
-
- movd mm3, DWORD PTR [ref_ptr+1]
- movd mm7, DWORD PTR [ref_ptr+2]
-
- psadbw mm2, mm0
-
- paddw mm1, mm2
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm6
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- mov rcx, result_ptr
-
- punpckldq mm1, mm3
-
- movq [rcx], mm1
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
diff --git a/vpx_dsp/x86/sad_sse4.asm b/vpx_dsp/x86/sad_sse4.asm
deleted file mode 100644
index 0818ed5f0..000000000
--- a/vpx_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,361 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; PROCESS_16X2X8 first
-; Accumulates, for two consecutive 16-byte-wide rows, the SAD of the source row
-; against the reference row at eight horizontal offsets (0..7).
-; %1 : non-zero on the first expansion (xmm1 is initialised from row 0),
-; zero afterwards (row 0 is added to the running totals in xmm1).
-; In : rsi = source, rdi = reference, rax = source stride, rdx = reference stride.
-; Out: xmm1 = eight 16-bit running sums, word i belonging to offset i;
-; rsi and rdi advanced by two rows.
-; Uses xmm0, xmm2, xmm3, xmm4 and xmm5 as scratch.
-; The source is loaded with movdqa, so rsi and rsi+rax must be 16-byte aligned.
-; Each row reads 24 reference bytes (rdi .. rdi+23).
-; mpsadbw imm 0x0 compares source bytes 0-3 with the eight 4-byte windows that
-; start at destination bytes 0..7; imm 0x5 compares source bytes 4-7 with the
-; windows that start at destination bytes 4..11. Their sum is an 8-byte-wide SAD.
-%macro PROCESS_16X2X8 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm1, xmm3 ; xmm1 = ref bytes 0..15
- punpcklqdq xmm3, xmm2 ; xmm3 = ref bytes 8..23
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0 ; src bytes 0-3
- mpsadbw xmm2, xmm0, 0x5 ; src bytes 4-7
-
- psrldq xmm0, 8 ; bring src bytes 8-15 down to bytes 0-7
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0 ; src bytes 8-11
- mpsadbw xmm4, xmm0, 0x5 ; src bytes 12-15
-
- paddw xmm1, xmm2
- paddw xmm1, xmm3
- paddw xmm1, xmm4 ; xmm1 = row-0 SADs, starts the running totals
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm5, xmm3 ; xmm5 = ref bytes 0..15
- punpcklqdq xmm3, xmm2 ; xmm3 = ref bytes 8..23
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5 ; add this row to the running totals
-%endif
-; Second row of the pair: always accumulated through xmm5.
- movdqa xmm0, XMMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- movq xmm2, MMWORD PTR [rdi+ rdx+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- lea rsi, [rsi+rax*2] ; step both pointers past the two rows just loaded
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endmacro
-
-; PROCESS_8X2X8 first
-; Accumulates, for two consecutive 8-byte-wide rows, the SAD of the source row
-; against the reference row at eight horizontal offsets (0..7).
-; %1 : non-zero on the first expansion (xmm1 is initialised from row 0),
-; zero afterwards (row 0 is added to the running totals in xmm1).
-; In : rsi = source, rdi = reference, rax = source stride, rdx = reference stride.
-; Out: xmm1 = eight 16-bit running sums, word i belonging to offset i;
-; rsi and rdi advanced by two rows.
-; Uses xmm0, xmm2, xmm3 and xmm5 as scratch. Each row reads 8 source bytes and
-; 16 reference bytes.
-%macro PROCESS_8X2X8 1
-%if %1
- movq xmm0, MMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3 ; xmm1 = ref bytes 0..15
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0 ; src bytes 0-3 vs windows starting at ref bytes 0..7
- mpsadbw xmm2, xmm0, 0x5 ; src bytes 4-7 vs windows starting at ref bytes 4..11
- paddw xmm1, xmm2 ; row-0 SADs start the running totals
-%else
- movq xmm0, MMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5 ; add this row to the running totals
-%endif
-; Second row of the pair: always accumulated through xmm5.
- movq xmm0, MMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2] ; step both pointers past the two rows just loaded
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endmacro
-
-; PROCESS_4X2X8 first
-; Accumulates, for two consecutive 4-byte-wide rows, the SAD of the source row
-; against the reference row at eight horizontal offsets (0..7).
-; %1 : non-zero on the first expansion (xmm1 is initialised from row 0),
-; zero afterwards (row 0 is added to the running totals in xmm1).
-; In : rsi = source, rdi = reference, rax = source stride, rdx = reference stride.
-; Out: xmm1 = eight 16-bit running sums, word i belonging to offset i;
-; rsi and rdi advanced by two rows.
-; Uses xmm0, xmm3 and xmm5 as scratch. A single mpsadbw (imm 0x0) suffices
-; because the source row is only 4 bytes wide. Each row loads 16 reference bytes.
-%macro PROCESS_4X2X8 1
-%if %1
- movd xmm0, [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3 ; xmm1 = ref bytes 0..15
-
- mpsadbw xmm1, xmm0, 0x0 ; row-0 SADs start the running totals
-%else
- movd xmm0, [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5 ; add this row to the running totals
-%endif
-; Second row of the pair: always accumulated through xmm5.
- movd xmm0, [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2] ; step both pointers past the two rows just loaded
- lea rdi, [rdi+rdx*2]
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endmacro
-
-; WRITE_AS_INTS
-; Zero-extends the eight 16-bit sums in xmm1 to 32 bits and stores them as eight
-; consecutive dwords (32 bytes) at the address passed as argument 4.
-; Overwrites rdi, xmm0, xmm1 and xmm2. Both stores use movdqa, so the
-; destination must be 16-byte aligned.
-%macro WRITE_AS_INTS 0
- mov rdi, arg(4) ;Results
- pxor xmm0, xmm0 ; zero source for the widening unpacks
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm0 ; words 0-3 -> dwords 0-3
- punpckhwd xmm2, xmm0 ; words 4-7 -> dwords 4-7
-
- movdqa [rdi], xmm1 ; results[0..3]
- movdqa [rdi + 16], xmm2 ; results[4..7]
-%endmacro
-
-SECTION .text
-
-;void vpx_sad16x16x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned int *sad_array);
-;
-; SAD of a 16x16 source block against the reference at eight horizontal offsets
-; (ref_ptr+0 .. ref_ptr+7). sad_array receives eight 32-bit sums (written by
-; WRITE_AS_INTS with aligned 16-byte stores).
-; Eight PROCESS_16X2X8 expansions cover 8 x 2 = 16 rows.
-; NOTE(review): arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come from
-; vpx_ports/x86_abi_support.asm, which is not part of this excerpt.
-globalsym(vpx_sad16x16x8_sse4_1)
-sym(vpx_sad16x16x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1 ; rows 0-1, initialises the sums in xmm1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0 ; rows 14-15
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad16x8x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned int *sad_array
-;);
-;
-; SAD of a 16-wide, 8-row source block against the reference at eight horizontal
-; offsets (ref_ptr+0 .. ref_ptr+7). sad_array receives eight 32-bit sums
-; (written by WRITE_AS_INTS with aligned 16-byte stores).
-; Four PROCESS_16X2X8 expansions cover 4 x 2 = 8 rows.
-; NOTE(review): arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come from
-; vpx_ports/x86_abi_support.asm, which is not part of this excerpt.
-globalsym(vpx_sad16x8x8_sse4_1)
-sym(vpx_sad16x8x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1 ; rows 0-1, initialises the sums in xmm1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0 ; rows 6-7
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad8x8x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned int *sad_array
-;);
-;
-; SAD of an 8x8 source block against the reference at eight horizontal offsets
-; (ref_ptr+0 .. ref_ptr+7). sad_array receives eight 32-bit sums (written by
-; WRITE_AS_INTS with aligned 16-byte stores).
-; Four PROCESS_8X2X8 expansions cover 4 x 2 = 8 rows.
-; NOTE(review): arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come from
-; vpx_ports/x86_abi_support.asm, which is not part of this excerpt.
-globalsym(vpx_sad8x8x8_sse4_1)
-sym(vpx_sad8x8x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1 ; rows 0-1, initialises the sums in xmm1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0 ; rows 6-7
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad8x16x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned int *sad_array
-;);
-;
-; SAD of an 8-wide, 16-row source block against the reference at eight
-; horizontal offsets (ref_ptr+0 .. ref_ptr+7). sad_array receives eight 32-bit
-; sums (written by WRITE_AS_INTS with aligned 16-byte stores).
-; Eight PROCESS_8X2X8 expansions cover 8 x 2 = 16 rows.
-; NOTE(review): arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come from
-; vpx_ports/x86_abi_support.asm, which is not part of this excerpt.
-globalsym(vpx_sad8x16x8_sse4_1)
-sym(vpx_sad8x16x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1 ; rows 0-1, initialises the sums in xmm1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0 ; rows 14-15
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad4x4x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned int *sad_array
-;);
-;
-; SAD of a 4x4 source block against the reference at eight horizontal offsets
-; (ref_ptr+0 .. ref_ptr+7). sad_array receives eight 32-bit sums (written by
-; WRITE_AS_INTS with aligned 16-byte stores).
-; Two PROCESS_4X2X8 expansions cover 2 x 2 = 4 rows.
-; NOTE(review): arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come from
-; vpx_ports/x86_abi_support.asm, which is not part of this excerpt.
-globalsym(vpx_sad4x4x8_sse4_1)
-sym(vpx_sad4x4x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_4X2X8 1 ; rows 0-1, initialises the sums in xmm1
- PROCESS_4X2X8 0 ; rows 2-3
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
diff --git a/vpx_dsp/x86/sad_ssse3.asm b/vpx_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index a5bc6d730..000000000
--- a/vpx_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; PROCESS_16X2X3 first
-; Accumulates, for two consecutive 16-byte rows, the SAD of the source row
-; against the reference row at horizontal offsets 0, 1 and 2, using unaligned
-; reference loads (lddqu).
-; %1 : non-zero on the first expansion (xmm5/xmm6/xmm7 are initialised from
-; row 0), zero afterwards (row 0 is added to them).
-; In : rsi = source (movdqa, so 16-byte aligned), rdi = reference,
-; rax = source stride, rdx = reference stride.
-; Out: xmm5, xmm6, xmm7 = running sums for offsets 0, 1, 2. psadbw leaves one
-; partial sum per 64-bit half, so each register still holds two partial
-; sums that the caller must add. rsi and rdi are advanced by two rows.
-; Uses xmm0-xmm3 as scratch.
-%macro PROCESS_16X2X3 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm5, XMMWORD PTR [rdi]
- lddqu xmm6, XMMWORD PTR [rdi+1]
- lddqu xmm7, XMMWORD PTR [rdi+2]
-
- psadbw xmm5, xmm0 ; offset 0
- psadbw xmm6, xmm0 ; offset 1
- psadbw xmm7, xmm0 ; offset 2
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm1, XMMWORD PTR [rdi]
- lddqu xmm2, XMMWORD PTR [rdi+1]
- lddqu xmm3, XMMWORD PTR [rdi+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
-; Second row of the pair: always added to the running sums.
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- lddqu xmm1, XMMWORD PTR [rdi+rdx]
- lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
- lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
-
- lea rsi, [rsi+rax*2] ; step both pointers past the two rows just loaded
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-; PROCESS_16X2X3_OFFSET first, offset
-; Same accumulation as PROCESS_16X2X3, but the reference is read with two
-; aligned 16-byte loads ([rdi] and [rdi+16]) and the three 16-byte windows
-; starting at byte offset %2, %2+1 and %2+2 are extracted with palignr.
-; %1 : non-zero on the first expansion (xmm5/xmm6/xmm7 are initialised),
-; zero afterwards (row 0 is added to them).
-; %2 : byte offset of the wanted reference data from rdi.
-; In : rsi = source, rdi = reference base; both are read with movdqa and so
-; must be 16-byte aligned, as must rsi+rax and rdi+rdx.
-; rax = source stride, rdx = reference stride.
-; Out: xmm5, xmm6, xmm7 = running sums for offsets %2, %2+1, %2+2 (two partial
-; sums per register, one per 64-bit half); rsi and rdi advanced by two rows.
-; Uses xmm0-xmm4 as scratch.
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi] ; low 16 reference bytes
- movdqa xmm7, XMMWORD PTR [rdi+16] ; high 16 reference bytes
-
- movdqa xmm5, xmm7
- palignr xmm5, xmm4, %2 ; bytes %2 .. %2+15 of the 32 loaded
-
- movdqa xmm6, xmm7
- palignr xmm6, xmm4, (%2+1)
-
- palignr xmm7, xmm4, (%2+2)
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm3, XMMWORD PTR [rdi+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
-; Second row of the pair: always added to the running sums.
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- movdqa xmm4, XMMWORD PTR [rdi+rdx]
- movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- lea rsi, [rsi+rax*2] ; step both pointers past the two rows just loaded
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-; PROCESS_16X16X3_OFFSET offset, label_prefix
-; Emits the 16-row code path for a reference pointer whose low four bits equal
-; %1. Defines the label %2_aligned_by_%1, moves rdi back by %1 bytes so the
-; aligned loads in PROCESS_16X2X3_OFFSET start %1 bytes before the wanted data,
-; processes 8 x 2 = 16 rows, then jumps to %2_store_off.
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-; PROCESS_16X8X3_OFFSET offset, label_prefix
-; Emits the 8-row code path for a reference pointer whose low four bits equal
-; %1. Defines the label %2_aligned_by_%1, moves rdi back by %1 bytes so the
-; aligned loads in PROCESS_16X2X3_OFFSET start %1 bytes before the wanted data,
-; processes 4 x 2 = 8 rows, then jumps to %2_store_off.
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x16x3_ssse3)
-sym(vpx_sad16x16x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vpx_sad16x16x3_ssse3_skiptable
-.vpx_sad16x16x3_ssse3_jumptable:
- dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_skiptable:
-
- call .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
-
-.vpx_sad16x16x3_ssse3_aligned_by_15:
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vpx_sad16x16x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void int vpx_sad16x8x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x8x3_ssse3)
-sym(vpx_sad16x8x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vpx_sad16x8x3_ssse3_skiptable
-.vpx_sad16x8x3_ssse3_jumptable:
- dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_skiptable:
-
- call .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
-
-.vpx_sad16x8x3_ssse3_aligned_by_15:
-
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vpx_sad16x8x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret