author:    Ronald S. Bultje <rbultje@google.com>  2013-06-25 11:26:49 -0700
committer: Ronald S. Bultje <rbultje@google.com>  2013-06-25 12:57:28 -0700
commit:    c24d922396f4dcd50216be3cdfd784ea31c47564 (patch)
tree:      da9b0b652199acb1d99b7f28a752d2d76d24c998 /vp9
parent:    9d95993115db450a3a9d4387d564ae191e19f90e (diff)
Add averaging-SAD functions for 8-point comp-inter motion search.
Makes the first 50 frames of bus @ 1500kbps encode in 3min18.2 instead of
3min22.7, i.e. 2.3% faster.

In addition, use the sub_pixel_avg functions to calculate the variance of
the averaging predictor. This is slightly suboptimal because the function
is subpixel-position-aware, but (at least in the SSE2 version) it will not
actually apply a bilinear filter at a full-pixel position, so performance
is approximately the same as if we implemented a dedicated average-aware
full-pixel variance function. That gains another 0.3 seconds (encode time
goes to 3min17.4), for a total gain of 2.7%.

Change-Id: I3f059d2b04243921868cfed2568d4fa65d7b5acd
Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_rtcd_defs.sh       |  40
-rw-r--r--  vp9/encoder/vp9_mcomp.c           |  22
-rw-r--r--  vp9/encoder/vp9_onyx_if.c         |  43
-rw-r--r--  vp9/encoder/vp9_sad_c.c           | 137
-rw-r--r--  vp9/encoder/vp9_variance.h        |  30
-rw-r--r--  vp9/encoder/x86/vp9_sad_sse2.asm  | 116
6 files changed, 215 insertions, 173 deletions
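
To make the new primitive concrete, here is a minimal C sketch of the
averaging-SAD idea this patch adds (an illustrative stand-in, not the
committed code): the candidate reference block is averaged with a
precomputed second predictor using a rounding byte average, and the SAD is
taken between the source block and that compound prediction.

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch: SAD between the source block and the rounded average of the
     * reference block and a packed second predictor, as the new
     * vp9_sadMxN_avg_c functions compute. Names here are illustrative. */
    static unsigned int avg_sad_mxn(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    const uint8_t *second_pred,
                                    int m, int n) {
      unsigned int sad = 0;
      int i, j;
      for (i = 0; i < n; i++) {
        for (j = 0; j < m; j++) {
          /* pavgb-style rounding average, matching comp_avg_pred. */
          const int avg = (ref[j] + second_pred[j] + 1) >> 1;
          sad += abs(src[j] - avg);
        }
        src += src_stride;
        ref += ref_stride;
        second_pred += m;  /* second predictor is packed, stride == width */
      }
      return sad;
    }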
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index b956e6af9..d71def519 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -369,7 +369,6 @@ specialize vp9_sad8x16 mmx sse2
prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x8 mmx sse2
-# TODO(jingning): need to covert these functions into mmx/sse2 form
prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x4 sse2
@@ -379,6 +378,45 @@ specialize vp9_sad4x8 sse
prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad4x4 mmx sse
+prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x64_avg sse2
+
+prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x64_avg sse2
+
+prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x32_avg sse2
+
+prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x16_avg sse2
+
+prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x32_avg sse2
+
+prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x32_avg sse2
+
+prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x16_avg sse2
+
+prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x8_avg sse2
+
+prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x16_avg sse2
+
+prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x8_avg sse2
+
+prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x4_avg sse2
+
+prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x8_avg sse
+
+prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x4_avg sse
+
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_h sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
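
Each prototype/specialize pair above feeds the rtcd generator. As a rough
sketch of the generated vp9_rtcd.h output for one of the new functions
(assuming a static build without runtime CPU detection; the exact output
depends on the generator):

    #include "vpx/vpx_integer.h"

    /* Sketch of generated rtcd declarations: one per implementation,
     * plus a macro (or function pointer, with runtime CPU detection)
     * selecting the best available version. */
    unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    unsigned int max_sad);
    unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr,
                                       int source_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       const uint8_t *second_pred,
                                       unsigned int max_sad);
    #define vp9_sad16x16_avg vp9_sad16x16_avg_sse2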
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 53b70adc4..212dce3b8 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2353,16 +2353,12 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- /* Compound pred buffer */
- DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
/* Get compound pred by averaging two pred blocks. */
- comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
-
- bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+ bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
+ second_pred, 0x7fffffff) +
mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
@@ -2380,9 +2376,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
best_address;
/* Get compound block and use it to calculate SAD. */
- comp_avg_pred(comp_pred, second_pred, w, h, check_here,
- in_what_stride);
- thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+ thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
+ second_pred, bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
@@ -2412,10 +2407,11 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
this_mv.as_mv.col = ref_mv->as_mv.col << 3;
if (bestsad < INT_MAX) {
- int besterr;
- comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
- besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
- (unsigned int *)(&thissad)) +
+ // FIXME(rbultje, yunqing): add full-pixel averaging variance functions
+ // so we don't have to use the subpixel with xoff=0,yoff=0 here.
+ int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0,
+ what, what_stride, (unsigned int *)(&thissad),
+ second_pred) +
mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
xd->allow_high_precision_mv);
return besterr;
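
The FIXME above is safe in practice because at a full-pel offset the
bilinear taps degenerate to an identity. A hedged sketch of that property
(the helper is illustrative; only the (128, 0) full-pel taps and the
shift by VP9_FILTER_SHIFT == 7 are assumed from the codec):

    #include <assert.h>

    /* Sketch: the 2-tap bilinear step the sub-pixel variance code applies
     * per axis, normalized to 128. */
    static int bilinear(int a, int b, int tap0, int tap1) {
      return (a * tap0 + b * tap1 + 64) >> 7;
    }

    static void fullpel_is_identity(void) {
      /* xoff = yoff = 0 selects taps (128, 0): the filter copies the pixel
       * through, so svaf at (0,0) measures the plain full-pel averaging
       * variance. */
      assert(bilinear(37, 200, 128, 0) == 37);
    }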
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 48a8b48c6..d6fa3fa6c 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1472,8 +1472,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
for (i = 0; i < MAX_MODES; i++)
cpi->rd_thresh_mult[i] = 128;
-#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
+ SDX3F, SDX8F, SDX4DF)\
cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
cpi->fn_ptr[BT].vf = VF; \
cpi->fn_ptr[BT].svf = SVF; \
cpi->fn_ptr[BT].svaf = SVAF; \
@@ -1484,67 +1486,80 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->fn_ptr[BT].sdx8f = SDX8F; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
- BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
+ BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
+ vp9_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x16x4d)
- BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
+ BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
+ vp9_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad16x32x4d)
- BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
+ BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
+ vp9_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad64x32x4d)
- BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
+ BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
+ vp9_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x64x4d)
- BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
+ BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
+ vp9_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
vp9_variance_halfpixvar32x32_v,
vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
vp9_sad32x32x4d)
- BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
+ BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
+ vp9_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
vp9_variance_halfpixvar64x64_v,
vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
vp9_sad64x64x4d)
- BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
+ BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
+ vp9_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
vp9_variance_halfpixvar16x16_v,
vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
vp9_sad16x16x4d)
- BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
+ BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
+ vp9_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
- BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
+ BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
+ vp9_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
- BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
+ BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
+ vp9_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
- BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
+ BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
+ vp9_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, NULL,
NULL, NULL, vp9_sad8x4x8,
vp9_sad8x4x4d)
- BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
+ BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
+ vp9_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, NULL,
NULL, NULL, vp9_sad4x8x8,
vp9_sad4x8x4d)
- BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
+ BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
+ vp9_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
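
A short usage sketch of the new sdaf slot, mirroring the
vp9_refining_search_8p_c call shown earlier (the wrapper function is
illustrative; 0x7fffffff disables the max_sad early-out):

    #include "vpx/vpx_integer.h"
    #include "vp9/encoder/vp9_variance.h"

    /* Sketch: compound-prediction SAD through the per-block-size vtable,
     * fn_ptr typically being &cpi->fn_ptr[block_size]. */
    static unsigned int compound_sad(const vp9_variance_fn_ptr_t *fn_ptr,
                                     const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     const uint8_t *second_pred) {
      return fn_ptr->sdaf(src, src_stride, ref, ref_stride,
                          second_pred, 0x7fffffff);
    }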
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index 6b1ba4964..42ddb21a5 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -11,25 +11,43 @@
#include <stdlib.h>
#include "vp9/common/vp9_sadmxn.h"
+#include "vp9/encoder/vp9_variance.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "./vp9_rtcd.h"
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
-}
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32);
-}
+#define sad_mxn_func(m, n) \
+unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
+ int src_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ unsigned int max_sad) { \
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+} \
+unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \
+ int src_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ const uint8_t *second_pred, \
+ unsigned int max_sad) { \
+ uint8_t comp_pred[m * n]; \
+ comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
+ return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \
+}
+
+sad_mxn_func(64, 64)
+sad_mxn_func(64, 32)
+sad_mxn_func(32, 64)
+sad_mxn_func(32, 32)
+sad_mxn_func(32, 16)
+sad_mxn_func(16, 32)
+sad_mxn_func(16, 16)
+sad_mxn_func(16, 8)
+sad_mxn_func(8, 16)
+sad_mxn_func(8, 8)
+sad_mxn_func(8, 4)
+sad_mxn_func(4, 8)
+sad_mxn_func(4, 4)
void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
int src_stride,
@@ -46,14 +64,6 @@ void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64);
-}
-
void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
@@ -69,22 +79,6 @@ void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
-}
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16);
-}
-
void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
@@ -100,14 +94,6 @@ void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32);
-}
-
void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
@@ -123,63 +109,6 @@ void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
-}
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
-}
-
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
-}
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
-}
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
-}
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
-}
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
-}
-
void vp9_sad64x64x3_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
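
For readability, here is the expansion of sad_mxn_func(16, 16) above,
written out (reformatted preprocessor output; sad_mx_n_c and comp_avg_pred
come from the headers included at the top of the file):

    unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                unsigned int max_sad) {
      return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
    }

    unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    unsigned int max_sad) {
      uint8_t comp_pred[16 * 16];
      /* Average ref with the second predictor, then SAD against the
       * source; comp_pred is packed, so its stride is the block width. */
      comp_avg_pred(comp_pred, second_pred, 16, 16, ref_ptr, ref_stride);
      return sad_mx_n_c(src_ptr, src_stride, comp_pred, 16, 16, 16);
    }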
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 38808d7be..6e686d6f9 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -20,6 +20,13 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int ref_stride,
unsigned int max_sad);
+typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred,
+ unsigned int max_sad);
+
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -74,20 +81,21 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
int ref_stride);
typedef struct vp9_variance_vtable {
- vp9_sad_fn_t sdf;
- vp9_variance_fn_t vf;
- vp9_subpixvariance_fn_t svf;
- vp9_subp_avg_variance_fn_t svaf;
- vp9_variance_fn_t svf_halfpix_h;
- vp9_variance_fn_t svf_halfpix_v;
- vp9_variance_fn_t svf_halfpix_hv;
- vp9_sad_multi_fn_t sdx3f;
- vp9_sad_multi1_fn_t sdx8f;
- vp9_sad_multi_d_fn_t sdx4df;
+ vp9_sad_fn_t sdf;
+ vp9_sad_avg_fn_t sdaf;
+ vp9_variance_fn_t vf;
+ vp9_subpixvariance_fn_t svf;
+ vp9_subp_avg_variance_fn_t svaf;
+ vp9_variance_fn_t svf_halfpix_h;
+ vp9_variance_fn_t svf_halfpix_v;
+ vp9_variance_fn_t svf_halfpix_hv;
+ vp9_sad_multi_fn_t sdx3f;
+ vp9_sad_multi1_fn_t sdx8f;
+ vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, uint8_t *ref, int ref_stride) {
+ int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
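
The hunk above ends mid-function; for reference, a sketch of the complete
comp_avg_pred consistent with the signature shown and with the SSE2 pavgb
behavior (a rounding average written to a packed buffer):

    static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred,
                              int width, int height, const uint8_t *ref,
                              int ref_stride) {
      int i, j;
      for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
          /* Rounding average of the two predictors, same as pavgb. */
          comp_pred[j] = (pred[j] + ref[j] + 1) >> 1;
        }
        comp_pred += width;  /* output is packed, stride == width */
        pred += width;
        ref += ref_stride;
      }
    }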
diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm
index 8fb7d4118..c4c5c54f0 100644
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -12,12 +12,42 @@
SECTION .text
-; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD64XN 1
-cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
.loop:
@@ -25,6 +55,13 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
psadbw m1, [srcq]
psadbw m2, [srcq+16]
psadbw m3, [srcq+32]
@@ -47,21 +84,27 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD32XN 1
-cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
mov n_rowsd, %1/2
pxor m0, m0
-
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+ref_strideq]
movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
psadbw m1, [srcq]
psadbw m2, [srcq+16]
psadbw m3, [srcq+src_strideq]
@@ -85,16 +128,14 @@ INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD16XN 1
-cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
@@ -103,6 +144,13 @@ cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
movu m2, [refq+ref_strideq]
movu m3, [refq+ref_strideq*2]
movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
psadbw m1, [srcq]
psadbw m2, [srcq+src_strideq]
psadbw m3, [srcq+src_strideq*2]
@@ -126,16 +174,14 @@ INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD8XN 1
-cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
@@ -144,6 +190,11 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
movhps m1, [refq+ref_strideq]
movh m2, [refq+ref_strideq*2]
movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
movh m3, [srcq]
movhps m3, [srcq+src_strideq]
movh m4, [srcq+src_strideq*2]
@@ -167,16 +218,14 @@ INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD4XN 1
-cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
@@ -187,6 +236,11 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
movd m4, [refq+ref_stride3q]
punpckldq m1, m2
punpckldq m3, m4
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m3, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
movd m2, [srcq]
movd m5, [srcq+src_strideq]
movd m4, [srcq+src_strideq*2]
@@ -209,3 +263,5 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
+SAD4XN 8, 1 ; sad4x8_avg_sse
+SAD4XN 4, 1 ; sad4x4_avg_sse
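
A note on the pavgb used throughout the _avg variants: it computes a
rounding unsigned byte average, which is also why the second predictor
takes no stride argument; the block is packed, so the kernels simply
advance second_predq by the bytes consumed (mmsize*2 or mmsize*4 per
iteration). In scalar C terms:

    #include <stdint.h>

    /* Scalar model of SSE2 pavgb as used by the _avg SAD kernels above:
     * unsigned byte average, rounding up on ties. */
    static uint8_t pavgb_scalar(uint8_t a, uint8_t b) {
      return (uint8_t)(((unsigned)a + b + 1) >> 1);
    }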