summaryrefslogtreecommitdiff
path: root/vp9/encoder/vp9_mcomp.c
diff options
context:
space:
mode:
author Geza Lore <gezalore@gmail.com> 2015-10-28 14:35:04 +0000
committer Geza Lore <gezalore@gmail.com> 2015-11-11 14:03:47 +0000
commit 5eefd3ebfdf61f76676de4f86e128e3d101311a2 (patch)
tree a763404e3e9890907b57fc522408fa2d63fd9ce1 /vp9/encoder/vp9_mcomp.c
parent 420e8d6d039c2224e00c13aba7f8908b68868359 (diff)
download libvpx-5eefd3ebfdf61f76676de4f86e128e3d101311a2.tar
libvpx-5eefd3ebfdf61f76676de4f86e128e3d101311a2.tar.gz
libvpx-5eefd3ebfdf61f76676de4f86e128e3d101311a2.tar.bz2
libvpx-5eefd3ebfdf61f76676de4f86e128e3d101311a2.zip
Add AVX vectorized vp9_diamond_search_sad
This function now has an AVX intrinsics version which is about 80% faster
than the C implementation. This provides a 2-4% total speed-up for encode,
depending on encoding parameters.

The function utilizes 3 properties of the cost function lookup table,
constructed in 'cal_nmvjointsadcost' and 'cal_nmvsadcosts'.

For the joint cost:
- mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]

For the component costs:
- For all i: mvsadcost[0][i] == mvsadcost[1][i] (equal per-component cost)
- For all i: mvsadcost[0][i] == mvsadcost[0][-i] (cost function is even)

These must hold, otherwise the AVX version of the function cannot be used.

Change-Id: I6c2791d43022822a9e6ab43cd124a773946d0bdc
Diffstat (limited to 'vp9/encoder/vp9_mcomp.c')
-rw-r--r-- vp9/encoder/vp9_mcomp.c | 26
1 files changed, 10 insertions, 16 deletions
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 9744e43c7..b9a104a60 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -101,11 +101,8 @@ static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
}
void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
- int len, ss_count = 1;
-
- cfg->ss_mv[0].col = 0;
- cfg->ss_mv[0].row = 0;
- cfg->ss_os[0] = 0;
+ int len;
+ int ss_count = 0;
for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
// Generate offsets for 4 search sites per step.
@@ -117,16 +114,13 @@ void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
}
}
- cfg->ss_count = ss_count;
cfg->searches_per_step = 4;
+ cfg->total_steps = ss_count / cfg->searches_per_step;
}
void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
- int len, ss_count = 1;
-
- cfg->ss_mv[0].col = 0;
- cfg->ss_mv[0].row = 0;
- cfg->ss_os[0] = 0;
+ int len;
+ int ss_count = 0;
for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
// Generate offsets for 8 search sites per step.
@@ -141,8 +135,8 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
}
}
- cfg->ss_count = ss_count;
cfg->searches_per_step = 8;
+ cfg->total_steps = ss_count / cfg->searches_per_step;
}
/*
@@ -1612,8 +1606,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
const uint8_t *best_address;
unsigned int bestsad = INT_MAX;
- int best_site = 0;
- int last_site = 0;
+ int best_site = -1;
+ int last_site = -1;
int ref_row;
int ref_col;
@@ -1626,7 +1620,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
// const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
- const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+ const int tot_steps = cfg->total_steps - search_param;
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
@@ -1644,7 +1638,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
- i = 1;
+ i = 0;
for (step = 0; step < tot_steps; step++) {
int all_in = 1, t;