diff options
-rw-r--r-- | test/vp9_intrapred_test.cc | 24 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_intrapred_sse2.asm | 18 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 18 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_variance_sse2.c | 64 | ||||
-rw-r--r-- | vpx_dsp/x86/sad_sse2.asm | 8 |
6 files changed, 69 insertions, 67 deletions
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 66a23cbd6..2bebdcbd9 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -141,13 +141,13 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, &vpx_highbd_tm_predictor_16x16_c, 16, 8), make_tuple(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 8), - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, + make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, &vpx_highbd_dc_predictor_8x8_c, 8, 8), make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, &vpx_highbd_dc_predictor_16x16_c, 16, 8), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, + make_tuple(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_v_predictor_8x8_sse2, &vpx_highbd_v_predictor_8x8_c, 8, 8), @@ -162,13 +162,13 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, #else INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, ::testing::Values( - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, + make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, &vpx_highbd_dc_predictor_8x8_c, 8, 8), make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, &vpx_highbd_dc_predictor_16x16_c, 16, 8), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, + make_tuple(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_v_predictor_8x8_sse2, &vpx_highbd_v_predictor_8x8_c, 8, 8), @@ -194,14 +194,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, make_tuple(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 10), - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, + make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, &vpx_highbd_dc_predictor_8x8_c, 8, 10), make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, &vpx_highbd_dc_predictor_16x16_c, 16, 10), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, + make_tuple(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_v_predictor_8x8_sse2, &vpx_highbd_v_predictor_8x8_c, 8, 10), @@ -218,14 +218,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, #else INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, ::testing::Values( - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, + make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, &vpx_highbd_dc_predictor_8x8_c, 8, 10), make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, &vpx_highbd_dc_predictor_16x16_c, 16, 10), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, + make_tuple(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_v_predictor_8x8_sse2, &vpx_highbd_v_predictor_8x8_c, 8, 10), @@ -251,14 +251,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, make_tuple(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 12), - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, + make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, &vpx_highbd_dc_predictor_8x8_c, 8, 12), make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, &vpx_highbd_dc_predictor_16x16_c, 16, 12), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, + make_tuple(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_v_predictor_8x8_sse2, &vpx_highbd_v_predictor_8x8_c, 8, 12), @@ -275,14 +275,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, #else INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, ::testing::Values( - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, + make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, &vpx_highbd_dc_predictor_8x8_c, 8, 12), make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, &vpx_highbd_dc_predictor_16x16_c, 16, 12), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, + make_tuple(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_v_predictor_8x8_sse2, &vpx_highbd_v_predictor_8x8_c, 8, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 04c4b7317..a2a067457 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -288,13 +288,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_d153_predictor_4x4/; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc"; + specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc"; + specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_4x4/; diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm index 712ab49fa..233958a52 100644 --- a/vpx_dsp/x86/highbd_intrapred_sse2.asm +++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm @@ -17,24 +17,20 @@ pw_16: times 4 dd 16 pw_32: times 4 dd 32 SECTION .text -INIT_MMX sse +INIT_XMM sse2 cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset GET_GOT goffsetq movq m0, [aboveq] movq m2, [leftq] - DEFINE_ARGS dst, stride, one - mov oned, 0x0001 - pxor m1, m1 - movd m3, oned - pshufw m3, m3, 0x0 paddw m0, m2 - pmaddwd m0, m3 - packssdw m0, m1 - pmaddwd m0, m3 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 paddw m0, [GLOBAL(pw_4)] psraw m0, 3 - pshufw m0, m0, 0x0 + pshuflw m0, m0, 0x0 movq [dstq ], m0 movq [dstq+strideq*2], m0 lea dstq, [dstq+strideq*4] @@ -183,7 +179,7 @@ cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset REP_RET %endif -INIT_MMX sse +INIT_XMM sse2 cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above movq m0, [aboveq] movq [dstq ], m0 diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 22d52a2af..30ee81b68 100644 --- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -79,20 +79,13 @@ SECTION .text %macro INC_SRC_BY_SRC_STRIDE 0 %if ARCH_X86=1 && CONFIG_PIC=1 - lea srcq, [srcq + src_stridemp*2] + add srcq, src_stridemp + add srcq, src_stridemp %else lea srcq, [srcq + src_strideq*2] %endif %endmacro -%macro INC_SRC_BY_SRC_2STRIDE 0 -%if ARCH_X86=1 && CONFIG_PIC=1 - lea srcq, [srcq + src_stridemp*4] -%else - lea srcq, [srcq + src_strideq*4] -%endif -%endmacro - %macro SUBPEL_VARIANCE 1-2 0 ; W %define bilin_filter_m bilin_filter_m_sse2 %define filter_idx_shift 5 @@ -984,8 +977,9 @@ SECTION .text .x_other_y_other_loop: movu m2, [srcq] movu m4, [srcq+2] - movu m3, [srcq+src_strideq*2] - movu m5, [srcq+src_strideq*2+2] + INC_SRC_BY_SRC_STRIDE + movu m3, [srcq] + movu m5, [srcq+2] pmullw m2, filter_x_a pmullw m4, filter_x_b paddw m2, filter_rnd @@ -1018,7 +1012,7 @@ SECTION .text SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 - INC_SRC_BY_SRC_2STRIDE + INC_SRC_BY_SRC_STRIDE lea dstq, [dstq + dst_strideq * 4] %if %2 == 1 ; avg add secq, sec_str diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index b45331caa..81ec5dbdb 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -243,13 +243,18 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, } #if CONFIG_USE_X86INC +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in +// highbd_subpel_variance_impl_sse2.asm #define DECL(w, opt) \ int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ ptrdiff_t src_stride, \ int x_offset, int y_offset, \ const uint16_t *dst, \ ptrdiff_t dst_stride, \ - int height, unsigned int *sse); + int height, \ + unsigned int *sse, \ + void *unused0, void *unused); #define DECLS(opt1, opt2) \ DECL(8, opt1); \ DECL(16, opt1) @@ -274,7 +279,7 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, h, \ - &sse); \ + &sse, NULL, NULL); \ if (w > wf) { \ unsigned int sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ @@ -282,19 +287,20 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \ x_offset, y_offset, \ dst + 16, \ dst_stride, \ - h, &sse2); \ + h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ x_offset, y_offset, \ dst + 32, dst_stride, \ - h, &sse2); \ + h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, h, &sse2); \ + dst + 48, dst_stride, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -312,7 +318,7 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, \ - h, &sse); \ + h, &sse, NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ @@ -320,20 +326,21 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ x_offset, y_offset, \ dst + 16, \ dst_stride, \ - h, &sse2); \ + h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ x_offset, y_offset, \ dst + 32, dst_stride, \ - h, &sse2); \ + h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ x_offset, y_offset, \ dst + 48, dst_stride, \ - h, &sse2); \ + h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -359,27 +366,27 @@ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, \ x_offset, y_offset, dst + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, \ x_offset, y_offset, dst + 16 + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, \ x_offset, y_offset, dst + 32 + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, \ x_offset, y_offset, dst + 48 + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ }\ @@ -410,6 +417,7 @@ FNS(sse2, sse); #undef FNS #undef FN +// The 2 unused parameters are place holders for PIC enabled build. #define DECL(w, opt) \ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \ ptrdiff_t src_stride, \ @@ -419,7 +427,8 @@ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \ const uint16_t *sec, \ ptrdiff_t sec_stride, \ int height, \ - unsigned int *sse); + unsigned int *sse, \ + void *unused0, void *unused); #define DECLS(opt1) \ DECL(16, opt1) \ DECL(8, opt1) @@ -439,23 +448,23 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src, src_stride, x_offset, \ - y_offset, dst, dst_stride, sec, w, h, &sse); \ + y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 16, src_stride, x_offset, y_offset, \ - dst + 16, dst_stride, sec + 16, w, h, &sse2); \ + dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 32, src_stride, x_offset, y_offset, \ - dst + 32, dst_stride, sec + 32, w, h, &sse2); \ + dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, sec + 48, w, h, &sse2); \ + dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -475,14 +484,15 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src, src_stride, x_offset, \ y_offset, dst, dst_stride, \ - sec, w, h, &sse); \ + sec, w, h, &sse, NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 16, src_stride, \ x_offset, y_offset, \ dst + 16, dst_stride, \ - sec + 16, w, h, &sse2); \ + sec + 16, w, h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ @@ -490,14 +500,16 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ src + 32, src_stride, \ x_offset, y_offset, \ dst + 32, dst_stride, \ - sec + 32, w, h, &sse2); \ + sec + 32, w, h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48, src_stride, \ x_offset, y_offset, \ dst + 48, dst_stride, \ - sec + 48, w, h, &sse2); \ + sec + 48, w, h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -525,7 +537,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, \ y_offset, dst + (start_row * dst_stride), dst_stride, \ - sec + (start_row * w), w, height, &sse2); \ + sec + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ @@ -533,7 +545,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ src + 16 + (start_row * src_stride), src_stride, \ x_offset, y_offset, \ dst + 16 + (start_row * dst_stride), dst_stride, \ - sec + 16 + (start_row * w), w, height, &sse2); \ + sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ @@ -541,14 +553,14 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ src + 32 + (start_row * src_stride), src_stride, \ x_offset, y_offset, \ dst + 32 + (start_row * dst_stride), dst_stride, \ - sec + 32 + (start_row * w), w, height, &sse2); \ + sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, \ x_offset, y_offset, \ dst + 48 + (start_row * dst_stride), dst_stride, \ - sec + 48 + (start_row * w), w, height, &sse2); \ + sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ } \ diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm index a141999ae..1ec906c23 100644 --- a/vpx_dsp/x86/sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -17,7 +17,7 @@ SECTION .text %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 -cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \ +cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %else ; avg @@ -25,7 +25,7 @@ cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \ cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 -cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \ +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 @@ -244,9 +244,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movd m2, [srcq] movd m5, [srcq+src_strideq] movd m4, [srcq+src_strideq*2] - movd m6, [srcq+src_stride3q] + movd m3, [srcq+src_stride3q] punpckldq m2, m5 - punpckldq m4, m6 + punpckldq m4, m3 movlhps m2, m4 psadbw m1, m2 lea refq, [refq+ref_strideq*4] |