summaryrefslogtreecommitdiff
path: root/vp8/encoder
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/encoder')
-rw-r--r--vp8/encoder/generic/csystemdependent.c6
-rw-r--r--vp8/encoder/mcomp.c154
-rw-r--r--vp8/encoder/mcomp.h1
-rw-r--r--vp8/encoder/onyx_if.c5
-rw-r--r--vp8/encoder/sad_c.c90
-rw-r--r--vp8/encoder/variance.h43
-rw-r--r--vp8/encoder/x86/mcomp_x86.h9
-rw-r--r--vp8/encoder/x86/sad_sse4.asm353
-rw-r--r--vp8/encoder/x86/variance_x86.h27
-rw-r--r--vp8/encoder/x86/x86_csystemdependent.c23
10 files changed, 703 insertions, 8 deletions
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index ae22b274e..824af5e46 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -40,6 +40,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c;
cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c;
+ cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c;
+ cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c;
+ cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c;
+ cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c;
+ cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c;
+
cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c;
cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c;
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 8cc63f83d..bb85afa6f 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1323,7 +1323,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
check_here = r * mv_stride + in_what + col_min;
c = col_min;
- while ((c + 3) < col_max)
+ while ((c + 2) < col_max)
{
int i;
@@ -1388,6 +1388,158 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
#endif
+int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ int mv_stride = d->pre_stride;
+ unsigned char *bestaddress;
+ MV *best_mv = &d->bmi.mv.as_mv;
+ MV this_mv;
+ int bestsad = INT_MAX;
+ int r, c;
+
+ unsigned char *check_here;
+ unsigned int thissad;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ unsigned short sad_array8[8];
+ unsigned int sad_array[3];
+
+ // Work out the mid point for the search
+ in_what = *(d->base_pre) + d->pre;
+ bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Baseline value at the centre
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+ // Apply further limits to prevent us looking using vectors that stretch beyond the UMV border
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.row = r << 3;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
+ while ((c + 7) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8);
+
+ for (i = 0; i < 8; i++)
+ {
+ thissad = (unsigned int)sad_array8[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while ((c + 2) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++)
+ {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here ++;
+ c ++;
+ }
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+ if (bestsad < INT_MAX)
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ else
+ return INT_MAX;
+}
+
#ifdef ENTROPY_STATS
void print_mode_context(void)
{
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 181e95822..7d6036248 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -93,6 +93,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
typedef prototype_full_search_sad(*vp8_full_search_fn_t);
extern prototype_full_search_sad(vp8_full_search_sad);
extern prototype_full_search_sad(vp8_full_search_sadx3);
+extern prototype_full_search_sad(vp8_full_search_sadx8);
typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
extern prototype_diamond_search_sad(vp8_diamond_search_sad);
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 7a78b2901..5f02a5a02 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2341,6 +2341,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v);
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv);
cpi->fn_ptr[BLOCK_16X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
+ cpi->fn_ptr[BLOCK_16X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x8);
cpi->fn_ptr[BLOCK_16X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
cpi->fn_ptr[BLOCK_16X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
@@ -2350,6 +2351,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_16X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
+ cpi->fn_ptr[BLOCK_16X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x8);
cpi->fn_ptr[BLOCK_16X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
cpi->fn_ptr[BLOCK_8X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
@@ -2359,6 +2361,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
+ cpi->fn_ptr[BLOCK_8X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x8);
cpi->fn_ptr[BLOCK_8X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
cpi->fn_ptr[BLOCK_8X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
@@ -2368,6 +2371,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
+ cpi->fn_ptr[BLOCK_8X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x8);
cpi->fn_ptr[BLOCK_8X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
cpi->fn_ptr[BLOCK_4X4].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
@@ -2377,6 +2381,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_4X4].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
+ cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8);
cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
#if !(CONFIG_REALTIME_ONLY)
diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c
index 2ff122f68..5eaca5935 100644
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -126,6 +126,24 @@ void vp8_sad16x16x3_c(
sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad16x16x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad16x8x3_c(
const unsigned char *src_ptr,
int src_stride,
@@ -139,6 +157,24 @@ void vp8_sad16x8x3_c(
sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad16x8x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad8x8x3_c(
const unsigned char *src_ptr,
int src_stride,
@@ -152,6 +188,24 @@ void vp8_sad8x8x3_c(
sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad8x8x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad8x16x3_c(
const unsigned char *src_ptr,
int src_stride,
@@ -165,6 +219,24 @@ void vp8_sad8x16x3_c(
sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad8x16x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad4x4x3_c(
const unsigned char *src_ptr,
int src_stride,
@@ -178,6 +250,24 @@ void vp8_sad4x4x3_c(
sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad4x4x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad16x16x4d_c(
const unsigned char *src_ptr,
int src_stride,
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 5b70fe54b..5befd3b86 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -32,6 +32,16 @@
unsigned int *sad_array\
)
+#define prototype_sad_multi_same_address_1(sym)\
+ void (sym)\
+ (\
+ const unsigned char *src_ptr, \
+ int source_stride, \
+ const unsigned char *ref_ptr, \
+ int ref_stride, \
+ unsigned short *sad_array\
+ )
+
#define prototype_sad_multi_dif_address(sym)\
void (sym)\
(\
@@ -138,6 +148,31 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
#endif
extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
+#ifndef vp8_variance_sad16x16x8
+#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad16x16x8);
+
+#ifndef vp8_variance_sad16x8x8
+#define vp8_variance_sad16x8x8 vp8_sad16x8x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad16x8x8);
+
+#ifndef vp8_variance_sad8x8x8
+#define vp8_variance_sad8x8x8 vp8_sad8x8x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad8x8x8);
+
+#ifndef vp8_variance_sad8x16x8
+#define vp8_variance_sad8x16x8 vp8_sad8x16x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad8x16x8);
+
+#ifndef vp8_variance_sad4x4x8
+#define vp8_variance_sad4x4x8 vp8_sad4x4x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8);
+
//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
#ifndef vp8_variance_sad16x16x4d
@@ -274,6 +309,7 @@ extern prototype_sad(vp8_variance_get4x4sse_cs);
typedef prototype_sad(*vp8_sad_fn_t);
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
+typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
typedef prototype_variance(*vp8_variance_fn_t);
typedef prototype_variance2(*vp8_variance2_fn_t);
@@ -317,6 +353,12 @@ typedef struct
vp8_sad_multi_fn_t sad8x8x3;
vp8_sad_multi_fn_t sad4x4x3;
+ vp8_sad_multi1_fn_t sad16x16x8;
+ vp8_sad_multi1_fn_t sad16x8x8;
+ vp8_sad_multi1_fn_t sad8x16x8;
+ vp8_sad_multi1_fn_t sad8x8x8;
+ vp8_sad_multi1_fn_t sad4x4x8;
+
vp8_sad_multi_d_fn_t sad16x16x4d;
vp8_sad_multi_d_fn_t sad16x8x4d;
vp8_sad_multi_d_fn_t sad8x16x4d;
@@ -334,6 +376,7 @@ typedef struct
vp8_variance_fn_t svf_halfpix_v;
vp8_variance_fn_t svf_halfpix_hv;
vp8_sad_multi_fn_t sdx3f;
+ vp8_sad_multi1_fn_t sdx8f;
vp8_sad_multi_d_fn_t sdx4df;
} vp8_variance_fn_ptr_t;
diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h
index e8d658b39..3b7b29c21 100644
--- a/vp8/encoder/x86/mcomp_x86.h
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -24,5 +24,14 @@
#endif
#endif
+#if HAVE_SSE4_1
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx8
+
+#endif
+#endif
+
#endif
diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm
new file mode 100644
index 000000000..03ecec4b3
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse4.asm
@@ -0,0 +1,353 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X8 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm1, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ movq xmm2, MMWORD PTR [rdi+ rdx+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_8X2X8 1
+%if %1
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm1, xmm2
+%else
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endif
+ movq xmm0, MMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_4X2X8 1
+%if %1
+ movd xmm0, [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ mpsadbw xmm1, xmm0, 0x0
+%else
+ movd xmm0, [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endif
+ movd xmm0, [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endmacro
+
+
+;void vp8_sad16x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array);
+global sym(vp8_sad16x16x8_sse4)
+sym(vp8_sad16x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad16x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad16x8x8_sse4)
+sym(vp8_sad16x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x8x8_sse4)
+sym(vp8_sad8x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x16x8_sse4)
+sym(vp8_sad8x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad4x4x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad4x4x8_sse4)
+sym(vp8_sad4x4x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 2b62b5f7e..6bea15ebc 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -297,4 +297,31 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
#endif
#endif
+
+#if HAVE_SSE4_1
+extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad16x16x8
+#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4
+
+#undef vp8_variance_sad16x8x8
+#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4
+
+#undef vp8_variance_sad8x16x8
+#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4
+
+#undef vp8_variance_sad8x8x8
+#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4
+
+#undef vp8_variance_sad4x4x8
+#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4
+
+#endif
+#endif
+
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 2581c337f..ed0e71ed0 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -188,6 +188,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
int wmt_enabled = flags & HAS_SSE2;
int SSE3Enabled = flags & HAS_SSE3;
int SSSE3Enabled = flags & HAS_SSSE3;
+ int SSE4_1Enabled = flags & HAS_SSE4_1;
/* Note:
*
@@ -198,7 +199,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/* Override default functions with fastest ones for this CPU. */
#if HAVE_MMX
-
if (mmx_enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
@@ -254,10 +254,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
}
-
#endif
-#if HAVE_SSE2
+#if HAVE_SSE2
if (wmt_enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
@@ -307,10 +306,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
}
-
#endif
-#if HAVE_SSE3
+#if HAVE_SSE3
if (SSE3Enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
@@ -328,16 +326,27 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
}
-
#endif
-#if HAVE_SSSE3
+#if HAVE_SSSE3
if (SSSE3Enabled)
{
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
}
+#endif
+#if HAVE_SSE4_1
+ if (SSE4_1Enabled)
+ {
+ cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4;
+ cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4;
+ cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4;
+ cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4;
+ cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4;
+ cpi->rtcd.search.full_search = vp8_full_search_sadx8;
+ }
#endif
+
#endif
}