From 694d4e777705ec7ad9d903f4074ba23d1806fe01 Mon Sep 17 00:00:00 2001 From: Fritz Koenig Date: Mon, 22 Aug 2011 12:36:28 -0700 Subject: Reclassify optimized ssim calculations as SSE2. Calculations were incorrectly classified as either SSE3 or SSSE3. Only using SSE2 instructions. Cleanup function names and make non-RTCD code work as well. Change-Id: I48ad0218af0cc51c5078070a08511dee43ecfe09 --- vp8/encoder/generic/csystemdependent.c | 9 ++++---- vp8/encoder/ssim.c | 19 +++++------------ vp8/encoder/variance.h | 16 +++++++------- vp8/encoder/x86/ssim_opt.asm | 12 +++++------ vp8/encoder/x86/variance_x86.h | 10 +++++++++ vp8/encoder/x86/x86_csystemdependent.c | 38 +++++++--------------------------- 6 files changed, 41 insertions(+), 63 deletions(-) (limited to 'vp8/encoder') diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index 990610554..a14843a80 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -94,16 +94,15 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) #if !(CONFIG_REALTIME_ONLY) cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_c; #endif +#if CONFIG_INTERNAL_STATS + cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_c; + cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_c; +#endif #endif // Pure C: vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame; -#if CONFIG_INTERNAL_STATS - cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c; - cpi->rtcd.variance.ssimpf = ssim_parms_c; -#endif - #if ARCH_X86 || ARCH_X86_64 vp8_arch_x86_encoder_init(cpi); #endif diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index fea756f7b..d0f8e490a 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -9,18 +9,9 @@ */ -#include "vpx_scale/yv12config.h" -#include "math.h" #include "onyx_int.h" -#if CONFIG_RUNTIME_CPU_DETECT -#define IF_RTCD(x) (x) -#else -#define IF_RTCD(x) NULL -#endif - - -void ssim_parms_c +void vp8_ssim_parms_16x16_c ( unsigned char *s, int sp, @@ -46,7 +37,7 @@ void ssim_parms_c } } } -void ssim_parms_8x8_c +void vp8_ssim_parms_8x8_c ( unsigned char *s, int sp, @@ -107,14 +98,14 @@ static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp, const vp8_variance_rtcd_vtable_t *rtcd) { unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; - rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256); } static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp, const vp8_variance_rtcd_vtable_t *rtcd) { unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; - rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + SSIMPF_INVOKE(rtcd,8x8)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); } @@ -134,7 +125,7 @@ long dssim(unsigned char *s,int sp, unsigned char *r,int rp, c1 = cc1*16; c2 = cc2*16; - rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); ssim_n1 = (2*sum_s*sum_r+ c1); ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2); diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h index 5fd6d3ae0..d9bf66975 100644 --- a/vp8/encoder/variance.h +++ b/vp8/encoder/variance.h @@ -320,16 +320,16 @@ extern 
prototype_variance(vp8_variance_mse16x16); #endif extern prototype_get16x16prederror(vp8_variance_get4x4sse_cs); -#ifndef vp8_ssimpf -#define vp8_ssimpf ssim_parms_c -#endif -extern prototype_ssimpf(vp8_ssimpf) - #ifndef vp8_ssimpf_8x8 -#define vp8_ssimpf_8x8 ssim_parms_8x8_c +#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_c #endif extern prototype_ssimpf(vp8_ssimpf_8x8) +#ifndef vp8_ssimpf_16x16 +#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_c +#endif +extern prototype_ssimpf(vp8_ssimpf_16x16) + typedef prototype_sad(*vp8_sad_fn_t); typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t); typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t); @@ -394,7 +394,7 @@ typedef struct #if CONFIG_INTERNAL_STATS vp8_ssimpf_fn_t ssimpf_8x8; - vp8_ssimpf_fn_t ssimpf; + vp8_ssimpf_fn_t ssimpf_16x16; #endif } vp8_variance_rtcd_vtable_t; @@ -417,8 +417,10 @@ typedef struct #if CONFIG_RUNTIME_CPU_DETECT #define VARIANCE_INVOKE(ctx,fn) (ctx)->fn +#define SSIMPF_INVOKE(ctx,fn) (ctx)->ssimpf_##fn #else #define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn +#define SSIMPF_INVOKE(ctx,fn) vp8_ssimpf_##fn #endif #endif diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm index d5d267a69..8af4b4533 100644 --- a/vp8/encoder/x86/ssim_opt.asm +++ b/vp8/encoder/x86/ssim_opt.asm @@ -44,7 +44,7 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro -;void ssim_parms_sse3( +;void ssim_parms_sse2( ; unsigned char *s, ; int sp, ; unsigned char *r, @@ -61,8 +61,8 @@ ; or pavgb At this point this is just meant to be first pass for calculating ; all the parms needed for 16x16 ssim so we can play with dssim as distortion ; in mode selection code. -global sym(vp8_ssim_parms_16x16_sse3) -sym(vp8_ssim_parms_16x16_sse3): +global sym(vp8_ssim_parms_16x16_sse2) +sym(vp8_ssim_parms_16x16_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 @@ -134,7 +134,7 @@ NextRow: pop rbp ret -;void ssim_parms_sse3( +;void ssim_parms_sse2( ; unsigned char *s, ; int sp, ; unsigned char *r, @@ -151,8 +151,8 @@ NextRow: ; or pavgb At this point this is just meant to be first pass for calculating ; all the parms needed for 16x16 ssim so we can play with dssim as distortion ; in mode selection code. 
-global sym(vp8_ssim_parms_8x8_sse3) -sym(vp8_ssim_parms_8x8_sse3): +global sym(vp8_ssim_parms_8x8_sse2) +sym(vp8_ssim_parms_8x8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h index af6c4d27e..4b41b5436 100644 --- a/vp8/encoder/x86/variance_x86.h +++ b/vp8/encoder/x86/variance_x86.h @@ -140,6 +140,8 @@ extern prototype_getmbss(vp8_get_mb_ss_sse2); extern prototype_variance(vp8_mse16x16_wmt); extern prototype_variance2(vp8_get8x8var_sse2); extern prototype_variance2(vp8_get16x16var_sse2); +extern prototype_ssimpf(vp8_ssim_parms_8x8_sse2) +extern prototype_ssimpf(vp8_ssim_parms_16x16_sse2) #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_variance_sad4x4 @@ -208,6 +210,14 @@ extern prototype_variance2(vp8_get16x16var_sse2); #undef vp8_variance_mse16x16 #define vp8_variance_mse16x16 vp8_mse16x16_wmt +#if ARCH_X86_64 +#undef vp8_ssimpf_8x8 +#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_sse2 + +#undef vp8_ssimpf_16x16 +#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_sse2 +#endif + #endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index badb9f044..36b7b7194 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -111,29 +111,6 @@ void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) #endif -#if HAVE_SSSE3 -#if CONFIG_INTERNAL_STATS -#if ARCH_X86_64 -typedef void ssimpf -( - unsigned char *s, - int sp, - unsigned char *r, - int rp, - unsigned long *sum_s, - unsigned long *sum_r, - unsigned long *sum_sq_s, - unsigned long *sum_sq_r, - unsigned long *sum_sxr -); - -extern ssimpf vp8_ssim_parms_16x16_sse3; -extern ssimpf vp8_ssim_parms_8x8_sse3; -#endif -#endif -#endif - - void vp8_arch_x86_encoder_init(VP8_COMP *cpi) { #if CONFIG_RUNTIME_CPU_DETECT @@ -245,6 +222,13 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) #if !(CONFIG_REALTIME_ONLY) cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2; +#endif + +#if CONFIG_INTERNAL_STATS +#if ARCH_X86_64 + cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse2; + cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_sse2; +#endif #endif } #endif @@ -280,14 +264,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3; cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; - -#if CONFIG_INTERNAL_STATS -#if ARCH_X86_64 - cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3; - cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3; -#endif -#endif - } #endif -- cgit v1.2.3 From c5f890af2cff951048cc41630f2523b61fb74a0b Mon Sep 17 00:00:00 2001 From: Fritz Koenig Date: Mon, 22 Aug 2011 15:29:41 -0700 Subject: Use local labels for jumps/loops in x86 assembly. Prepend . to local labels in assembly code. This allows non unique labels within a file. Also makes profiling information more informative by keeping the function name with the loop name. 
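
As a minimal illustration of the NASM local-label convention this change adopts
(the routine and label names below are hypothetical, not taken from this patch):
a label that begins with "." is local to the most recent non-local label, so the
same loop name can be reused in every routine and profiler samples stay
attributed to the enclosing function.

    ; Without the "." prefix, loop_top would be file-global: a second routine
    ; in the same file could not reuse the name, and profiles would report the
    ; bare label rather than the function containing it.
    hypothetical_fn_a:
            mov     rcx, 16
    .loop_top:                      ; local to hypothetical_fn_a
            dec     rcx
            jnz     .loop_top
            ret

    hypothetical_fn_b:
            mov     rcx, 8
    .loop_top:                      ; same name, now local to hypothetical_fn_b
            dec     rcx
            jnz     .loop_top
            ret
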
Change-Id: I7a983cb3a5ba2413d5dafd0a37936b268fb9e37f --- vp8/encoder/x86/encodeopt.asm | 16 +-- vp8/encoder/x86/quantize_sse2.asm | 6 +- vp8/encoder/x86/quantize_sse4.asm | 6 +- vp8/encoder/x86/sad_mmx.asm | 16 +-- vp8/encoder/x86/sad_sse2.asm | 40 +++--- vp8/encoder/x86/sad_sse3.asm | 12 +- vp8/encoder/x86/sad_ssse3.asm | 164 ++++++++++++------------- vp8/encoder/x86/ssim_opt.asm | 8 +- vp8/encoder/x86/subtract_mmx.asm | 4 +- vp8/encoder/x86/subtract_sse2.asm | 4 +- vp8/encoder/x86/temporal_filter_apply_sse2.asm | 18 +-- vp8/encoder/x86/variance_impl_mmx.asm | 12 +- vp8/encoder/x86/variance_impl_sse2.asm | 8 +- vp8/encoder/x86/variance_impl_ssse3.asm | 38 +++--- 14 files changed, 176 insertions(+), 176 deletions(-) (limited to 'vp8/encoder') diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm index 994629499..7ec7d603c 100644 --- a/vp8/encoder/x86/encodeopt.asm +++ b/vp8/encoder/x86/encodeopt.asm @@ -148,7 +148,7 @@ sym(vp8_mbblock_error_mmx_impl): pcmpeqw mm1, mm7 mov rcx, 16 -mberror_loop_mmx: +.mberror_loop_mmx: movq mm3, [rsi] movq mm4, [rdi] @@ -186,7 +186,7 @@ mberror_loop_mmx: add rdi, 32 sub rcx, 1 - jnz mberror_loop_mmx + jnz .mberror_loop_mmx movq mm0, mm2 psrlq mm2, 32 @@ -226,7 +226,7 @@ sym(vp8_mbblock_error_xmm_impl): pcmpeqw xmm5, xmm6 mov rcx, 16 -mberror_loop: +.mberror_loop: movdqa xmm0, [rsi] movdqa xmm1, [rdi] @@ -249,7 +249,7 @@ mberror_loop: paddd xmm4, xmm2 paddd xmm4, xmm0 - jnz mberror_loop + jnz .mberror_loop movdqa xmm0, xmm4 punpckldq xmm0, xmm6 @@ -289,7 +289,7 @@ sym(vp8_mbuverror_mmx_impl): mov rcx, 16 pxor mm7, mm7 -mbuverror_loop_mmx: +.mbuverror_loop_mmx: movq mm1, [rsi] movq mm2, [rdi] @@ -313,7 +313,7 @@ mbuverror_loop_mmx: add rdi, 16 dec rcx - jnz mbuverror_loop_mmx + jnz .mbuverror_loop_mmx movq mm0, mm7 psrlq mm7, 32 @@ -346,7 +346,7 @@ sym(vp8_mbuverror_xmm_impl): mov rcx, 16 pxor xmm3, xmm3 -mbuverror_loop: +.mbuverror_loop: movdqa xmm1, [rsi] movdqa xmm2, [rdi] @@ -360,7 +360,7 @@ mbuverror_loop: add rdi, 16 dec rcx - jnz mbuverror_loop + jnz .mbuverror_loop pxor xmm0, xmm0 movdqa xmm1, xmm3 diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index 056b64c39..c483933df 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -137,17 +137,17 @@ sym(vp8_regular_quantize_b_sse2): ; if (x >= zbin) sub cx, WORD PTR[rdx] ; x - zbin lea rdx, [rdx + 2] ; zbin_boost_ptr++ - jl rq_zigzag_loop_%1 ; x < zbin + jl .rq_zigzag_loop_%1 ; x < zbin movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] ; downshift by quant_shift[rc] movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc] sar edi, cl ; also sets Z bit - je rq_zigzag_loop_%1 ; !y + je .rq_zigzag_loop_%1 ; !y mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost -rq_zigzag_loop_%1: +.rq_zigzag_loop_%1: %endmacro ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c ZIGZAG_LOOP 0 diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm index 258899eed..95e1c2074 100644 --- a/vp8/encoder/x86/quantize_sse4.asm +++ b/vp8/encoder/x86/quantize_sse4.asm @@ -140,21 +140,21 @@ sym(vp8_regular_quantize_b_sse4): ; if (x >= zbin) sub cx, WORD PTR[rdx] ; x - zbin lea rdx, [rdx + 2] ; zbin_boost_ptr++ - jl rq_zigzag_loop_%1 ; x < zbin + jl .rq_zigzag_loop_%1 ; x < zbin pextrw edi, %3, %2 ; y ; downshift by quant_shift[rc] pextrb ecx, xmm5, %1 ; quant_shift[rc] sar edi, cl ; also sets Z bit - je rq_zigzag_loop_%1 ; !y + je 
.rq_zigzag_loop_%1 ; !y %if ABI_IS_32BIT mov WORD PTR[rsp + qcoeff + %1 *2], di %else pinsrw %5, edi, %2 ; qcoeff[rc] %endif mov rdx, rax ; reset to b->zrun_zbin_boost -rq_zigzag_loop_%1: +.rq_zigzag_loop_%1: %endmacro ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4 diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm index 85cb023a4..407b39979 100644 --- a/vp8/encoder/x86/sad_mmx.asm +++ b/vp8/encoder/x86/sad_mmx.asm @@ -43,7 +43,7 @@ sym(vp8_sad16x16_mmx): pxor mm6, mm6 -x16x16sad_mmx_loop: +.x16x16sad_mmx_loop: movq mm0, QWORD PTR [rsi] movq mm2, QWORD PTR [rsi+8] @@ -83,7 +83,7 @@ x16x16sad_mmx_loop: paddw mm7, mm1 cmp rsi, rcx - jne x16x16sad_mmx_loop + jne .x16x16sad_mmx_loop movq mm0, mm7 @@ -135,7 +135,7 @@ sym(vp8_sad8x16_mmx): pxor mm6, mm6 -x8x16sad_mmx_loop: +.x8x16sad_mmx_loop: movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] @@ -158,7 +158,7 @@ x8x16sad_mmx_loop: paddw mm7, mm2 cmp rsi, rcx - jne x8x16sad_mmx_loop + jne .x8x16sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 @@ -205,7 +205,7 @@ sym(vp8_sad8x8_mmx): pxor mm6, mm6 -x8x8sad_mmx_loop: +.x8x8sad_mmx_loop: movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] @@ -228,7 +228,7 @@ x8x8sad_mmx_loop: paddw mm7, mm0 cmp rsi, rcx - jne x8x8sad_mmx_loop + jne .x8x8sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 @@ -364,7 +364,7 @@ sym(vp8_sad16x8_mmx): pxor mm6, mm6 -x16x8sad_mmx_loop: +.x16x8sad_mmx_loop: movq mm0, [rsi] movq mm1, [rdi] @@ -404,7 +404,7 @@ x16x8sad_mmx_loop: paddw mm7, mm0 cmp rsi, rcx - jne x16x8sad_mmx_loop + jne .x16x8sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm index 1011c9553..fa8e3e3f8 100644 --- a/vp8/encoder/x86/sad_sse2.asm +++ b/vp8/encoder/x86/sad_sse2.asm @@ -37,7 +37,7 @@ sym(vp8_sad16x16_wmt): lea rcx, [rcx+rax*8] pxor xmm6, xmm6 -x16x16sad_wmt_loop: +.x16x16sad_wmt_loop: movq xmm0, QWORD PTR [rsi] movq xmm2, QWORD PTR [rsi+8] @@ -68,7 +68,7 @@ x16x16sad_wmt_loop: paddw xmm6, xmm4 cmp rsi, rcx - jne x16x16sad_wmt_loop + jne .x16x16sad_wmt_loop movq xmm0, xmm6 psrldq xmm6, 8 @@ -111,11 +111,11 @@ sym(vp8_sad8x16_wmt): lea rcx, [rcx+rbx*8] pxor mm7, mm7 -x8x16sad_wmt_loop: +.x8x16sad_wmt_loop: movq rax, mm7 cmp eax, arg(4) - jg x8x16sad_wmt_early_exit + jg .x8x16sad_wmt_early_exit movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] @@ -133,11 +133,11 @@ x8x16sad_wmt_loop: paddw mm7, mm2 cmp rsi, rcx - jne x8x16sad_wmt_loop + jne .x8x16sad_wmt_loop movq rax, mm7 -x8x16sad_wmt_early_exit: +.x8x16sad_wmt_early_exit: ; begin epilog pop rdi @@ -172,11 +172,11 @@ sym(vp8_sad8x8_wmt): lea rcx, [rsi+rbx*8] pxor mm7, mm7 -x8x8sad_wmt_loop: +.x8x8sad_wmt_loop: movq rax, mm7 cmp eax, arg(4) - jg x8x8sad_wmt_early_exit + jg .x8x8sad_wmt_early_exit movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] @@ -188,10 +188,10 @@ x8x8sad_wmt_loop: paddw mm7, mm0 cmp rsi, rcx - jne x8x8sad_wmt_loop + jne .x8x8sad_wmt_loop movq rax, mm7 -x8x8sad_wmt_early_exit: +.x8x8sad_wmt_early_exit: ; begin epilog pop rdi @@ -281,11 +281,11 @@ sym(vp8_sad16x8_wmt): lea rcx, [rsi+rbx*8] pxor mm7, mm7 -x16x8sad_wmt_loop: +.x16x8sad_wmt_loop: movq rax, mm7 cmp eax, arg(4) - jg x16x8sad_wmt_early_exit + jg .x16x8sad_wmt_early_exit movq mm0, QWORD PTR [rsi] movq mm2, QWORD PTR [rsi+8] @@ -315,11 +315,11 @@ x16x8sad_wmt_loop: paddw mm7, mm4 cmp rsi, rcx - jne x16x8sad_wmt_loop + jne .x16x8sad_wmt_loop movq rax, mm7 -x16x8sad_wmt_early_exit: +.x16x8sad_wmt_early_exit: ; begin epilog pop rdi @@ -352,7 
+352,7 @@ sym(vp8_copy32xn_sse2): movsxd rdx, dword ptr arg(3) ;dst_stride movsxd rcx, dword ptr arg(4) ;height -block_copy_sse2_loopx4: +.block_copy_sse2_loopx4: movdqu xmm0, XMMWORD PTR [rsi] movdqu xmm1, XMMWORD PTR [rsi + 16] movdqu xmm2, XMMWORD PTR [rsi + rax] @@ -383,12 +383,12 @@ block_copy_sse2_loopx4: sub rcx, 4 cmp rcx, 4 - jge block_copy_sse2_loopx4 + jge .block_copy_sse2_loopx4 cmp rcx, 0 - je copy_is_done + je .copy_is_done -block_copy_sse2_loop: +.block_copy_sse2_loop: movdqu xmm0, XMMWORD PTR [rsi] movdqu xmm1, XMMWORD PTR [rsi + 16] lea rsi, [rsi+rax] @@ -398,9 +398,9 @@ block_copy_sse2_loop: lea rdi, [rdi+rdx] sub rcx, 1 - jne block_copy_sse2_loop + jne .block_copy_sse2_loop -copy_is_done: +.copy_is_done: ; begin epilog pop rdi pop rsi diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm index 9e0552166..a2550974c 100644 --- a/vp8/encoder/x86/sad_sse3.asm +++ b/vp8/encoder/x86/sad_sse3.asm @@ -647,7 +647,7 @@ sym(vp8_copy32xn_sse3): STACK_FRAME_CREATE_X3 -block_copy_sse3_loopx4: +.block_copy_sse3_loopx4: lea end_ptr, [src_ptr+src_stride*2] movdqu xmm0, XMMWORD PTR [src_ptr] @@ -676,13 +676,13 @@ block_copy_sse3_loopx4: sub height, 4 cmp height, 4 - jge block_copy_sse3_loopx4 + jge .block_copy_sse3_loopx4 ;Check to see if there is more rows need to be copied. cmp height, 0 - je copy_is_done + je .copy_is_done -block_copy_sse3_loop: +.block_copy_sse3_loop: movdqu xmm0, XMMWORD PTR [src_ptr] movdqu xmm1, XMMWORD PTR [src_ptr + 16] lea src_ptr, [src_ptr+src_stride] @@ -692,9 +692,9 @@ block_copy_sse3_loop: lea ref_ptr, [ref_ptr+ref_stride] sub height, 1 - jne block_copy_sse3_loop + jne .block_copy_sse3_loop -copy_is_done: +.copy_is_done: STACK_FRAME_DESTROY_X3 ;void vp8_sad16x16x4d_sse3( diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm index 6ecf08184..95b6c89e6 100644 --- a/vp8/encoder/x86/sad_ssse3.asm +++ b/vp8/encoder/x86/sad_ssse3.asm @@ -169,30 +169,30 @@ sym(vp8_sad16x16x3_ssse3): mov rdx, 0xf and rdx, rdi - jmp vp8_sad16x16x3_ssse3_skiptable -vp8_sad16x16x3_ssse3_jumptable: - dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump -vp8_sad16x16x3_ssse3_skiptable: - - call vp8_sad16x16x3_ssse3_do_jump -vp8_sad16x16x3_ssse3_do_jump: + jmp .vp8_sad16x16x3_ssse3_skiptable +.vp8_sad16x16x3_ssse3_jumptable: + dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump + dd 
.vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump +.vp8_sad16x16x3_ssse3_skiptable: + + call .vp8_sad16x16x3_ssse3_do_jump +.vp8_sad16x16x3_ssse3_do_jump: pop rcx ; get the address of do_jump - mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump + mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable @@ -203,23 +203,23 @@ vp8_sad16x16x3_ssse3_do_jump: jmp rcx - PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3 - -vp8_sad16x16x3_ssse3_aligned_by_15: + PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3 + +.vp8_sad16x16x3_ssse3_aligned_by_15: PROCESS_16X2X3 1 PROCESS_16X2X3 0 PROCESS_16X2X3 0 @@ -229,7 +229,7 @@ vp8_sad16x16x3_ssse3_aligned_by_15: PROCESS_16X2X3 0 PROCESS_16X2X3 0 -vp8_sad16x16x3_ssse3_store_off: +.vp8_sad16x16x3_ssse3_store_off: mov rdi, arg(4) ;Results movq xmm0, xmm5 @@ -282,30 +282,30 @@ sym(vp8_sad16x8x3_ssse3): mov rdx, 0xf and rdx, rdi - jmp vp8_sad16x8x3_ssse3_skiptable -vp8_sad16x8x3_ssse3_jumptable: - dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump - dd 
vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump -vp8_sad16x8x3_ssse3_skiptable: - - call vp8_sad16x8x3_ssse3_do_jump -vp8_sad16x8x3_ssse3_do_jump: + jmp .vp8_sad16x8x3_ssse3_skiptable +.vp8_sad16x8x3_ssse3_jumptable: + dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump +.vp8_sad16x8x3_ssse3_skiptable: + + call .vp8_sad16x8x3_ssse3_do_jump +.vp8_sad16x8x3_ssse3_do_jump: pop rcx ; get the address of do_jump - mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump + mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable @@ -316,30 +316,30 @@ vp8_sad16x8x3_ssse3_do_jump: jmp rcx - PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3 - -vp8_sad16x8x3_ssse3_aligned_by_15: 
+ PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3 + +.vp8_sad16x8x3_ssse3_aligned_by_15: PROCESS_16X2X3 1 PROCESS_16X2X3 0 PROCESS_16X2X3 0 PROCESS_16X2X3 0 -vp8_sad16x8x3_ssse3_store_off: +.vp8_sad16x8x3_ssse3_store_off: mov rdi, arg(4) ;Results movq xmm0, xmm5 diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm index 8af4b4533..c6db3d1c6 100644 --- a/vp8/encoder/x86/ssim_opt.asm +++ b/vp8/encoder/x86/ssim_opt.asm @@ -84,7 +84,7 @@ sym(vp8_ssim_parms_16x16_sse2): pxor xmm11,xmm11 ;sum_sxr mov rdx, 16 ;row counter -NextRow: +.NextRow: ;grab source and reference pixels movdqu xmm5, [rsi] @@ -107,7 +107,7 @@ NextRow: add rdi, rax ; next r row dec rdx ; counter - jnz NextRow + jnz .NextRow SUM_ACROSS_W xmm15 SUM_ACROSS_W xmm14 @@ -174,7 +174,7 @@ sym(vp8_ssim_parms_8x8_sse2): pxor xmm11,xmm11 ;sum_sxr mov rdx, 8 ;row counter -NextRow2: +.NextRow: ;grab source and reference pixels movq xmm3, [rsi] @@ -188,7 +188,7 @@ NextRow2: add rdi, rax ; next r row dec rdx ; counter - jnz NextRow2 + jnz .NextRow SUM_ACROSS_W xmm15 SUM_ACROSS_W xmm14 diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm index a47e1f0d6..4ce16ce90 100644 --- a/vp8/encoder/x86/subtract_mmx.asm +++ b/vp8/encoder/x86/subtract_mmx.asm @@ -93,7 +93,7 @@ sym(vp8_subtract_mby_mmx): mov rcx, 16 pxor mm0, mm0 -submby_loop: +.submby_loop: movq mm1, [rsi] movq mm3, [rax] @@ -139,7 +139,7 @@ submby_loop: lea rsi, [rsi+rdx] sub rcx, 1 - jnz submby_loop + jnz .submby_loop pop rdi pop rsi diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm index 95888f6be..3bd1ff678 100644 --- a/vp8/encoder/x86/subtract_sse2.asm +++ b/vp8/encoder/x86/subtract_sse2.asm @@ -91,7 +91,7 @@ sym(vp8_subtract_mby_sse2): mov rcx, 8 ; do two lines at one time -submby_loop: +.submby_loop: movdqa xmm0, XMMWORD PTR [rsi] ; src movdqa xmm1, XMMWORD PTR [rax] ; pred @@ -133,7 +133,7 @@ submby_loop: lea rsi, [rsi+rdx*2] sub rcx, 1 - jnz submby_loop + jnz .submby_loop pop rdi pop rsi diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm index b777ef566..b97c69439 100644 --- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -71,26 +71,26 @@ sym(vp8_temporal_filter_apply_sse2): lea rcx, [rdx + 16*16*1] cmp dword ptr [rsp + block_size], 8 - jne temporal_filter_apply_load_16 + jne .temporal_filter_apply_load_16 lea rcx, [rdx + 8*8*1] -temporal_filter_apply_load_8: +.temporal_filter_apply_load_8: movq xmm0, [rsi] ; first row lea rsi, [rsi + rbp] ; += stride punpcklbw xmm0, xmm7 ; src[ 0- 7] movq xmm1, [rsi] ; second row lea rsi, [rsi + rbp] ; += stride punpcklbw xmm1, xmm7 ; src[ 8-15] - jmp temporal_filter_apply_load_finished + jmp .temporal_filter_apply_load_finished -temporal_filter_apply_load_16: 
+.temporal_filter_apply_load_16: movdqa xmm0, [rsi] ; src (frame1) lea rsi, [rsi + rbp] ; += stride movdqa xmm1, xmm0 punpcklbw xmm0, xmm7 ; src[ 0- 7] punpckhbw xmm1, xmm7 ; src[ 8-15] -temporal_filter_apply_load_finished: +.temporal_filter_apply_load_finished: movdqa xmm2, [rdx] ; predictor (frame2) movdqa xmm3, xmm2 punpcklbw xmm2, xmm7 ; pred[ 0- 7] @@ -176,13 +176,13 @@ temporal_filter_apply_load_finished: lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) cmp rdx, rcx - je temporal_filter_apply_epilog + je .temporal_filter_apply_epilog pxor xmm7, xmm7 ; zero for extraction cmp dword ptr [rsp + block_size], 16 - je temporal_filter_apply_load_16 - jmp temporal_filter_apply_load_8 + je .temporal_filter_apply_load_16 + jmp .temporal_filter_apply_load_8 -temporal_filter_apply_epilog: +.temporal_filter_apply_epilog: ; begin epilog mov rbp, [rsp + rbp_backup] add rsp, stack_size diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm index 13b76ea91..2be8bbeb3 100644 --- a/vp8/encoder/x86/variance_impl_mmx.asm +++ b/vp8/encoder/x86/variance_impl_mmx.asm @@ -27,7 +27,7 @@ sym(vp8_get_mb_ss_mmx): mov rcx, 16 pxor mm4, mm4 -NEXTROW: +.NEXTROW: movq mm0, [rax] movq mm1, [rax+8] movq mm2, [rax+16] @@ -44,7 +44,7 @@ NEXTROW: add rax, 32 dec rcx - ja NEXTROW + ja .NEXTROW movq QWORD PTR [rsp], mm4 ;return sum[0]+sum[1]; @@ -568,7 +568,7 @@ sym(vp8_filter_block2d_bil4x4_var_mmx): add rsi, r8 %endif -filter_block2d_bil4x4_var_mmx_loop: +.filter_block2d_bil4x4_var_mmx_loop: movd mm1, [rsi] ; movd mm3, [rsi+1] ; @@ -614,7 +614,7 @@ filter_block2d_bil4x4_var_mmx_loop: add rdi, r9 %endif sub rcx, 1 ; - jnz filter_block2d_bil4x4_var_mmx_loop ; + jnz .filter_block2d_bil4x4_var_mmx_loop ; pxor mm3, mm3 ; @@ -726,7 +726,7 @@ sym(vp8_filter_block2d_bil_var_mmx): add rsi, r8 %endif -filter_block2d_bil_var_mmx_loop: +.filter_block2d_bil_var_mmx_loop: movq mm1, [rsi] ; movq mm3, [rsi+1] ; @@ -807,7 +807,7 @@ filter_block2d_bil_var_mmx_loop: add rdi, r9 %endif sub rcx, 1 ; - jnz filter_block2d_bil_var_mmx_loop ; + jnz .filter_block2d_bil_var_mmx_loop ; pxor mm3, mm3 ; diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index b7a6b3286..762922091 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -33,7 +33,7 @@ sym(vp8_get_mb_ss_sse2): mov rcx, 8 pxor xmm4, xmm4 -NEXTROW: +.NEXTROW: movdqa xmm0, [rax] movdqa xmm1, [rax+16] movdqa xmm2, [rax+32] @@ -50,7 +50,7 @@ NEXTROW: add rax, 0x40 dec rcx - ja NEXTROW + ja .NEXTROW movdqa xmm3,xmm4 psrldq xmm4,8 @@ -126,7 +126,7 @@ sym(vp8_get16x16var_sse2): pxor xmm6, xmm6 ; clear xmm6 for accumulating sse mov rcx, 16 -var16loop: +.var16loop: movdqu xmm1, XMMWORD PTR [rsi] movdqu xmm2, XMMWORD PTR [rdi] @@ -160,7 +160,7 @@ var16loop: add rdi, rdx sub rcx, 1 - jnz var16loop + jnz .var16loop movdqa xmm1, xmm6 diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm index a582f8dc5..97e8b0e2e 100644 --- a/vp8/encoder/x86/variance_impl_ssse3.asm +++ b/vp8/encoder/x86/variance_impl_ssse3.asm @@ -47,7 +47,7 @@ sym(vp8_filter_block2d_bil_var_ssse3): movsxd rax, dword ptr arg(5) ; xoffset cmp rax, 0 ; skip first_pass filter if xoffset=0 - je filter_block2d_bil_var_ssse3_sp_only + je .filter_block2d_bil_var_ssse3_sp_only shl rax, 4 ; point to filter coeff with xoffset lea rax, [rax + rcx] ; HFilter @@ -55,7 +55,7 @@ sym(vp8_filter_block2d_bil_var_ssse3): movsxd rdx, dword ptr arg(6) ; yoffset cmp rdx, 0 ; skip 
second_pass filter if yoffset=0 - je filter_block2d_bil_var_ssse3_fp_only + je .filter_block2d_bil_var_ssse3_fp_only shl rdx, 4 lea rdx, [rdx + rcx] ; VFilter @@ -88,7 +88,7 @@ sym(vp8_filter_block2d_bil_var_ssse3): lea rsi, [rsi + r8] %endif -filter_block2d_bil_var_ssse3_loop: +.filter_block2d_bil_var_ssse3_loop: movdqu xmm1, XMMWORD PTR [rsi] movdqu xmm2, XMMWORD PTR [rsi+1] movdqa xmm3, xmm1 @@ -142,15 +142,15 @@ filter_block2d_bil_var_ssse3_loop: %endif sub rcx, 1 - jnz filter_block2d_bil_var_ssse3_loop + jnz .filter_block2d_bil_var_ssse3_loop - jmp filter_block2d_bil_variance + jmp .filter_block2d_bil_variance -filter_block2d_bil_var_ssse3_sp_only: +.filter_block2d_bil_var_ssse3_sp_only: movsxd rdx, dword ptr arg(6) ; yoffset cmp rdx, 0 ; Both xoffset =0 and yoffset=0 - je filter_block2d_bil_var_ssse3_full_pixel + je .filter_block2d_bil_var_ssse3_full_pixel shl rdx, 4 lea rdx, [rdx + rcx] ; VFilter @@ -169,7 +169,7 @@ filter_block2d_bil_var_ssse3_sp_only: lea rsi, [rsi + rax] -filter_block2d_bil_sp_only_loop: +.filter_block2d_bil_sp_only_loop: movdqu xmm3, XMMWORD PTR [rsi] movdqa xmm2, xmm1 movdqa xmm0, xmm3 @@ -209,11 +209,11 @@ filter_block2d_bil_sp_only_loop: %endif sub rcx, 1 - jnz filter_block2d_bil_sp_only_loop + jnz .filter_block2d_bil_sp_only_loop - jmp filter_block2d_bil_variance + jmp .filter_block2d_bil_variance -filter_block2d_bil_var_ssse3_full_pixel: +.filter_block2d_bil_var_ssse3_full_pixel: mov rsi, arg(0) ;ref_ptr mov rdi, arg(2) ;src_ptr movsxd rcx, dword ptr arg(4) ;Height @@ -221,7 +221,7 @@ filter_block2d_bil_var_ssse3_full_pixel: movsxd rdx, dword ptr arg(3) ;src_pixels_per_line pxor xmm0, xmm0 -filter_block2d_bil_full_pixel_loop: +.filter_block2d_bil_full_pixel_loop: movq xmm1, QWORD PTR [rsi] punpcklbw xmm1, xmm0 movq xmm2, QWORD PTR [rsi+8] @@ -244,11 +244,11 @@ filter_block2d_bil_full_pixel_loop: lea rsi, [rsi + rax] ;ref_pixels_per_line lea rdi, [rdi + rdx] ;src_pixels_per_line sub rcx, 1 - jnz filter_block2d_bil_full_pixel_loop + jnz .filter_block2d_bil_full_pixel_loop - jmp filter_block2d_bil_variance + jmp .filter_block2d_bil_variance -filter_block2d_bil_var_ssse3_fp_only: +.filter_block2d_bil_var_ssse3_fp_only: mov rsi, arg(0) ;ref_ptr mov rdi, arg(2) ;src_ptr movsxd rcx, dword ptr arg(4) ;Height @@ -260,7 +260,7 @@ filter_block2d_bil_var_ssse3_fp_only: movsxd r9, dword ptr arg(3) ;src_pixels_per_line %endif -filter_block2d_bil_fp_only_loop: +.filter_block2d_bil_fp_only_loop: movdqu xmm1, XMMWORD PTR [rsi] movdqu xmm2, XMMWORD PTR [rsi+1] movdqa xmm3, xmm1 @@ -298,11 +298,11 @@ filter_block2d_bil_fp_only_loop: %endif sub rcx, 1 - jnz filter_block2d_bil_fp_only_loop + jnz .filter_block2d_bil_fp_only_loop - jmp filter_block2d_bil_variance + jmp .filter_block2d_bil_variance -filter_block2d_bil_variance: +.filter_block2d_bil_variance: pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm5, xmm5 -- cgit v1.2.3