-rw-r--r--  test/vp9_intrapred_test.cc                          24
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl                         4
-rw-r--r--  vpx_dsp/x86/highbd_intrapred_sse2.asm               18
-rw-r--r--  vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm    18
-rw-r--r--  vpx_dsp/x86/highbd_variance_sse2.c                  64
-rw-r--r--  vpx_dsp/x86/sad_sse2.asm                             8
6 files changed, 69 insertions(+), 67 deletions(-)
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 66a23cbd6..2bebdcbd9 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -141,13 +141,13 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
&vpx_highbd_tm_predictor_16x16_c, 16, 8),
make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
&vpx_highbd_tm_predictor_32x32_c, 32, 8),
- make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 8),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16, 8),
- make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 8),
@@ -162,13 +162,13 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
#else
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
::testing::Values(
- make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 8),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16, 8),
- make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 8),
@@ -194,14 +194,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
&vpx_highbd_tm_predictor_32x32_c, 32,
10),
- make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 10),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16,
10),
- make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 10),
@@ -218,14 +218,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
#else
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
::testing::Values(
- make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 10),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16,
10),
- make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 10),
@@ -251,14 +251,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
&vpx_highbd_tm_predictor_32x32_c, 32,
12),
- make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 12),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16,
12),
- make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 12),
@@ -275,14 +275,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
#else
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
::testing::Values(
- make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 12),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16,
12),
- make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+ make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 12),
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 04c4b7317..a2a067457 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -288,13 +288,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_d153_predictor_4x4/;
add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc";
+ specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
+ specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_top_predictor_4x4/;
diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm
index 712ab49fa..233958a52 100644
--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -17,24 +17,20 @@ pw_16: times 4 dd 16
pw_32: times 4 dd 32
SECTION .text
-INIT_MMX sse
+INIT_XMM sse2
cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
GET_GOT goffsetq
movq m0, [aboveq]
movq m2, [leftq]
- DEFINE_ARGS dst, stride, one
- mov oned, 0x0001
- pxor m1, m1
- movd m3, oned
- pshufw m3, m3, 0x0
paddw m0, m2
- pmaddwd m0, m3
- packssdw m0, m1
- pmaddwd m0, m3
+ pshuflw m1, m0, 0xe
+ paddw m0, m1
+ pshuflw m1, m0, 0x1
+ paddw m0, m1
paddw m0, [GLOBAL(pw_4)]
psraw m0, 3
- pshufw m0, m0, 0x0
+ pshuflw m0, m0, 0x0
movq [dstq ], m0
movq [dstq+strideq*2], m0
lea dstq, [dstq+strideq*4]
@@ -183,7 +179,7 @@ cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
REP_RET
%endif
-INIT_MMX sse
+INIT_XMM sse2
cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
movq m0, [aboveq]
movq [dstq ], m0
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 22d52a2af..30ee81b68 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -79,20 +79,13 @@ SECTION .text
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
- lea srcq, [srcq + src_stridemp*2]
+ add srcq, src_stridemp
+ add srcq, src_stridemp
%else
lea srcq, [srcq + src_strideq*2]
%endif
%endmacro
-%macro INC_SRC_BY_SRC_2STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
- lea srcq, [srcq + src_stridemp*4]
-%else
- lea srcq, [srcq + src_strideq*4]
-%endif
-%endmacro
-
%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
@@ -984,8 +977,9 @@ SECTION .text
.x_other_y_other_loop:
movu m2, [srcq]
movu m4, [srcq+2]
- movu m3, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*2+2]
+ INC_SRC_BY_SRC_STRIDE
+ movu m3, [srcq]
+ movu m5, [srcq+2]
pmullw m2, filter_x_a
pmullw m4, filter_x_b
paddw m2, filter_rnd
@@ -1018,7 +1012,7 @@ SECTION .text
SUM_SSE m0, m2, m4, m3, m6, m7
mova m0, m5
- INC_SRC_BY_SRC_2STRIDE
+ INC_SRC_BY_SRC_STRIDE
lea dstq, [dstq + dst_strideq * 4]
%if %2 == 1 ; avg
add secq, sec_str
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index b45331caa..81ec5dbdb 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -243,13 +243,18 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
}
#if CONFIG_USE_X86INC
+// The 2 unused parameters are place holders for PIC enabled build.
+// These definitions are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
#define DECL(w, opt) \
int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
- int height, unsigned int *sse);
+ int height, \
+ unsigned int *sse, \
+ void *unused0, void *unused);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)
@@ -274,7 +279,7 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, h, \
- &sse); \
+ &sse, NULL, NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -282,19 +287,20 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
- h, &sse2); \
+ h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
- dst + 48, dst_stride, h, &sse2); \
+ dst + 48, dst_stride, h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
@@ -312,7 +318,7 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
- h, &sse); \
+ h, &sse, NULL, NULL); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -320,20 +326,21 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
- h, &sse2); \
+ h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
@@ -359,27 +366,27 @@ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
+ dst_stride, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
+ dst_stride, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
+ dst_stride, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
+ dst_stride, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
}\
@@ -410,6 +417,7 @@ FNS(sse2, sse);
#undef FNS
#undef FN
+// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt) \
int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
@@ -419,7 +427,8 @@ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
const uint16_t *sec, \
ptrdiff_t sec_stride, \
int height, \
- unsigned int *sse);
+ unsigned int *sse, \
+ void *unused0, void *unused);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)
@@ -439,23 +448,23 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
- y_offset, dst, dst_stride, sec, w, h, &sse); \
+ y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, x_offset, y_offset, \
- dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+ dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, x_offset, y_offset, \
- dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+ dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
- dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+ dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
@@ -475,14 +484,15 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
- sec, w, h, &sse); \
+ sec, w, h, &sse, NULL, NULL); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
- sec + 16, w, h, &sse2); \
+ sec + 16, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
@@ -490,14 +500,16 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
- sec + 32, w, h, &sse2); \
+ sec + 32, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
- sec + 48, w, h, &sse2); \
+ sec + 48, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
} \
@@ -525,7 +537,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, x_offset, \
y_offset, dst + (start_row * dst_stride), dst_stride, \
- sec + (start_row * w), w, height, &sse2); \
+ sec + (start_row * w), w, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
@@ -533,7 +545,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 16 + (start_row * dst_stride), dst_stride, \
- sec + 16 + (start_row * w), w, height, &sse2); \
+ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
@@ -541,14 +553,14 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 32 + (start_row * dst_stride), dst_stride, \
- sec + 32 + (start_row * w), w, height, &sse2); \
+ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 48 + (start_row * dst_stride), dst_stride, \
- sec + 48 + (start_row * w), w, height, &sse2); \
+ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
} \
diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm
index a141999ae..1ec906c23 100644
--- a/vpx_dsp/x86/sad_sse2.asm
+++ b/vpx_dsp/x86/sad_sse2.asm
@@ -17,7 +17,7 @@ SECTION .text
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
-cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
@@ -25,7 +25,7 @@ cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
second_pred, n_rows
%else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
ref, ref_stride, \
second_pred, \
src_stride3, ref_stride3
@@ -244,9 +244,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2
movd m2, [srcq]
movd m5, [srcq+src_strideq]
movd m4, [srcq+src_strideq*2]
- movd m6, [srcq+src_stride3q]
+ movd m3, [srcq+src_stride3q]
punpckldq m2, m5
- punpckldq m4, m6
+ punpckldq m4, m3
movlhps m2, m4
psadbw m1, m2
lea refq, [refq+ref_strideq*4]