summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Fritz Koenig <frkoenig@google.com> 2011-08-22 15:29:41 -0700
committer: Fritz Koenig <frkoenig@google.com> 2011-08-23 09:05:29 -0700
commit: c5f890af2cff951048cc41630f2523b61fb74a0b (patch)
tree: 0b0ce3929049cef7470a01da1b6436306768a2c6
parent: 694d4e777705ec7ad9d903f4074ba23d1806fe01 (diff)
download: libvpx-c5f890af2cff951048cc41630f2523b61fb74a0b.tar
libvpx-c5f890af2cff951048cc41630f2523b61fb74a0b.tar.gz
libvpx-c5f890af2cff951048cc41630f2523b61fb74a0b.tar.bz2
libvpx-c5f890af2cff951048cc41630f2523b61fb74a0b.zip
Use local labels for jumps/loops in x86 assembly.
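Prepend "." to the jump and loop labels in the x86 assembly code so that they become local labels, scoped to the preceding non-local (function) label. This allows non-unique label names within a file. It also makes profiling information more informative by keeping the function name together with the loop name.

Change-Id: I7a983cb3a5ba2413d5dafd0a37936b268fb9e37f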
-rw-r--r-- vp8/common/x86/loopfilter_mmx.asm | 24
-rw-r--r-- vp8/common/x86/postproc_mmx.asm | 32
-rw-r--r-- vp8/common/x86/postproc_sse2.asm | 44
-rw-r--r-- vp8/common/x86/recon_sse2.asm | 8
-rw-r--r-- vp8/common/x86/subpixel_mmx.asm | 20
-rw-r--r-- vp8/common/x86/subpixel_sse2.asm | 62
-rw-r--r-- vp8/common/x86/subpixel_ssse3.asm | 100
-rw-r--r-- vp8/encoder/x86/encodeopt.asm | 16
-rw-r--r-- vp8/encoder/x86/quantize_sse2.asm | 6
-rw-r--r-- vp8/encoder/x86/quantize_sse4.asm | 6
-rw-r--r-- vp8/encoder/x86/sad_mmx.asm | 16
-rw-r--r-- vp8/encoder/x86/sad_sse2.asm | 40
-rw-r--r-- vp8/encoder/x86/sad_sse3.asm | 12
-rw-r--r-- vp8/encoder/x86/sad_ssse3.asm | 164
-rw-r--r-- vp8/encoder/x86/ssim_opt.asm | 8
-rw-r--r-- vp8/encoder/x86/subtract_mmx.asm | 4
-rw-r--r-- vp8/encoder/x86/subtract_sse2.asm | 4
-rw-r--r-- vp8/encoder/x86/temporal_filter_apply_sse2.asm | 18
-rw-r--r-- vp8/encoder/x86/variance_impl_mmx.asm | 12
-rw-r--r-- vp8/encoder/x86/variance_impl_sse2.asm | 8
-rw-r--r-- vp8/encoder/x86/variance_impl_ssse3.asm | 38
21 files changed, 321 insertions(+), 321 deletions(-)
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index ad47284cf..697a5dee6 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -40,7 +40,7 @@ sym(vp8_loop_filter_horizontal_edge_mmx):
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
movsxd rcx, dword ptr arg(5) ;count
-next8_h:
+.next8_h:
mov rdx, arg(3) ;limit
movq mm7, [rdx]
mov rdi, rsi ; rdi points to row +1 for indirect addressing
@@ -211,7 +211,7 @@ next8_h:
add rsi,8
neg rax
dec rcx
- jnz next8_h
+ jnz .next8_h
add rsp, 32
pop rsp
@@ -255,7 +255,7 @@ sym(vp8_loop_filter_vertical_edge_mmx):
lea rsi, [rsi + rax*4 - 4]
movsxd rcx, dword ptr arg(5) ;count
-next8_v:
+.next8_v:
mov rdi, rsi ; rdi points to row +1 for indirect addressing
add rdi, rax
@@ -581,7 +581,7 @@ next8_v:
lea rsi, [rsi+rax*8]
dec rcx
- jnz next8_v
+ jnz .next8_v
add rsp, 64
pop rsp
@@ -622,7 +622,7 @@ sym(vp8_mbloop_filter_horizontal_edge_mmx):
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
movsxd rcx, dword ptr arg(5) ;count
-next8_mbh:
+.next8_mbh:
mov rdx, arg(3) ;limit
movq mm7, [rdx]
mov rdi, rsi ; rdi points to row +1 for indirect addressing
@@ -898,7 +898,7 @@ next8_mbh:
neg rax
add rsi,8
dec rcx
- jnz next8_mbh
+ jnz .next8_mbh
add rsp, 32
pop rsp
@@ -942,7 +942,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx):
lea rsi, [rsi + rax*4 - 4]
movsxd rcx, dword ptr arg(5) ;count
-next8_mbv:
+.next8_mbv:
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
;transpose
@@ -1365,7 +1365,7 @@ next8_mbv:
lea rsi, [rsi+rax*8]
dec rcx
- jnz next8_mbv
+ jnz .next8_mbv
add rsp, 96
pop rsp
@@ -1398,7 +1398,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_mmx):
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
mov rcx, 2 ; count
-nexts8_h:
+.nexts8_h:
mov rdx, arg(2) ;blimit ; get blimit
movq mm3, [rdx] ;
@@ -1483,7 +1483,7 @@ nexts8_h:
add rsi,8
neg rax
dec rcx
- jnz nexts8_h
+ jnz .nexts8_h
; begin epilog
pop rdi
@@ -1520,7 +1520,7 @@ sym(vp8_loop_filter_simple_vertical_edge_mmx):
lea rsi, [rsi + rax*4- 2]; ;
mov rcx, 2 ; count
-nexts8_v:
+.nexts8_v:
lea rdi, [rsi + rax];
movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
@@ -1695,7 +1695,7 @@ nexts8_v:
lea rsi, [rsi+rax*8] ; next 8
dec rcx
- jnz nexts8_v
+ jnz .nexts8_v
add rsp, 32
pop rsp
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index 787e83268..81122181f 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -58,10 +58,10 @@ sym(vp8_post_proc_down_and_across_mmx):
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
-nextrow:
+.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
-nextcol:
+.nextcol:
pxor mm7, mm7 ; mm7 = 00000000
movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
@@ -146,7 +146,7 @@ nextcol:
add rdx, 4
cmp edx, dword ptr arg(5) ;cols
- jl nextcol
+ jl .nextcol
; done with the all cols, start the across filtering in place
sub rsi, rdx
sub rdi, rdx
@@ -156,7 +156,7 @@ nextcol:
xor rdx, rdx
mov rax, [rdi-4];
-acrossnextcol:
+.acrossnextcol:
pxor mm7, mm7 ; mm7 = 00000000
movq mm6, [rbx + 32 ] ;
movq mm4, [rdi+rdx] ; mm4 = p0..p7
@@ -237,7 +237,7 @@ acrossnextcol:
add rdx, 4
cmp edx, dword ptr arg(5) ;cols
- jl acrossnextcol;
+ jl .acrossnextcol;
mov DWORD PTR [rdi+rdx-4], eax
pop rax
@@ -249,7 +249,7 @@ acrossnextcol:
movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
dec rcx ; decrement count
- jnz nextrow ; next row
+ jnz .nextrow ; next row
pop rbx
; begin epilog
@@ -293,7 +293,7 @@ sym(vp8_mbpost_proc_down_mmx):
add dword ptr arg(2), 8
;for(c=0; c<cols; c+=4)
-loop_col:
+.loop_col:
mov rsi, arg(0) ;s
pxor mm0, mm0 ;
@@ -312,7 +312,7 @@ loop_col:
mov rcx, 15 ;
-loop_initvar:
+.loop_initvar:
movd mm1, DWORD PTR [rdi];
punpcklbw mm1, mm0 ;
@@ -329,10 +329,10 @@ loop_initvar:
lea rdi, [rdi+rax] ;
dec rcx
- jne loop_initvar
+ jne .loop_initvar
;save the var and sum
xor rdx, rdx
-loop_row:
+.loop_row:
movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
@@ -438,13 +438,13 @@ loop_row:
add rdx, 1
cmp edx, dword arg(2) ;rows
- jl loop_row
+ jl .loop_row
add dword arg(0), 4 ; s += 4
sub dword arg(3), 4 ; cols -= 4
cmp dword arg(3), 0
- jg loop_col
+ jg .loop_col
add rsp, 136
pop rsp
@@ -475,7 +475,7 @@ sym(vp8_plane_add_noise_mmx):
push rdi
; end prolog
-addnoise_loop:
+.addnoise_loop:
call sym(rand) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
@@ -492,7 +492,7 @@ addnoise_loop:
mov rsi, arg(0) ;Pos
xor rax,rax
-addnoise_nextset:
+.addnoise_nextset:
movq mm1,[rsi+rax] ; get the source
psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
@@ -506,12 +506,12 @@ addnoise_nextset:
add rax,8 ; move to the next line
cmp rax, rcx
- jl addnoise_nextset
+ jl .addnoise_nextset
movsxd rax, dword arg(7) ; Pitch
add arg(0), rax ; Start += Pitch
sub dword arg(6), 1 ; Height -= 1
- jg addnoise_loop
+ jg .addnoise_loop
; begin epilog
pop rdi
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index 06d51ec6f..1f219ca87 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -57,10 +57,10 @@ sym(vp8_post_proc_down_and_across_xmm):
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor xmm0, xmm0 ; mm0 = 00000000
-nextrow:
+.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
-nextcol:
+.nextcol:
movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
movdqa xmm1, xmm3 ; mm1 = p0..p3
@@ -133,7 +133,7 @@ nextcol:
add rdx, 8
cmp edx, dword arg(5) ;cols
- jl nextcol
+ jl .nextcol
; done with the all cols, start the across filtering in place
sub rsi, rdx
@@ -142,7 +142,7 @@ nextcol:
xor rdx, rdx
movq mm0, QWORD PTR [rdi-8];
-acrossnextcol:
+.acrossnextcol:
movq xmm7, QWORD PTR [rdi +rdx -2]
movd xmm4, DWORD PTR [rdi +rdx +6]
@@ -219,7 +219,7 @@ acrossnextcol:
add rdx, 8
cmp edx, dword arg(5) ;cols
- jl acrossnextcol;
+ jl .acrossnextcol;
; last 8 pixels
movq QWORD PTR [rdi+rdx-8], mm0
@@ -231,7 +231,7 @@ acrossnextcol:
mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
dec rcx ; decrement count
- jnz nextrow ; next row
+ jnz .nextrow ; next row
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
add rsp,16
@@ -282,7 +282,7 @@ sym(vp8_mbpost_proc_down_xmm):
add dword arg(2), 8
;for(c=0; c<cols; c+=8)
-loop_col:
+.loop_col:
mov rsi, arg(0) ; s
pxor xmm0, xmm0 ;
@@ -301,7 +301,7 @@ loop_col:
mov rcx, 15 ;
-loop_initvar:
+.loop_initvar:
movq xmm1, QWORD PTR [rdi];
punpcklbw xmm1, xmm0 ;
@@ -318,10 +318,10 @@ loop_initvar:
lea rdi, [rdi+rax] ;
dec rcx
- jne loop_initvar
+ jne .loop_initvar
;save the var and sum
xor rdx, rdx
-loop_row:
+.loop_row:
movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
@@ -428,12 +428,12 @@ loop_row:
add rdx, 1
cmp edx, dword arg(2) ;rows
- jl loop_row
+ jl .loop_row
add dword arg(0), 8 ; s += 8
sub dword arg(3), 8 ; cols -= 8
cmp dword arg(3), 0
- jg loop_col
+ jg .loop_col
add rsp, 128+16
pop rsp
@@ -475,13 +475,13 @@ sym(vp8_mbpost_proc_across_ip_xmm):
;for(r=0;r<rows;r++)
-ip_row_loop:
+.ip_row_loop:
xor rdx, rdx ;sumsq=0;
xor rcx, rcx ;sum=0;
mov rsi, arg(0); s
mov rdi, -8
-ip_var_loop:
+.ip_var_loop:
;for(i=-8;i<=6;i++)
;{
; sumsq += s[i]*s[i];
@@ -493,7 +493,7 @@ ip_var_loop:
add edx, eax
add rdi, 1
cmp rdi, 6
- jle ip_var_loop
+ jle .ip_var_loop
;mov rax, sumsq
@@ -513,7 +513,7 @@ ip_var_loop:
pxor mm1, mm1
pxor xmm0, xmm0
-nextcol4:
+.nextcol4:
movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
@@ -600,7 +600,7 @@ nextcol4:
add rcx, 4
cmp rcx, rdx
- jl nextcol4
+ jl .nextcol4
;s+=pitch;
movsxd rax, dword arg(1)
@@ -608,7 +608,7 @@ nextcol4:
sub dword arg(2), 1 ;rows-=1
cmp dword arg(2), 0
- jg ip_row_loop
+ jg .ip_row_loop
add rsp, 16
pop rsp
@@ -640,7 +640,7 @@ sym(vp8_plane_add_noise_wmt):
push rdi
; end prolog
-addnoise_loop:
+.addnoise_loop:
call sym(rand) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
@@ -657,7 +657,7 @@ addnoise_loop:
mov rsi, arg(0) ;Pos
xor rax,rax
-addnoise_nextset:
+.addnoise_nextset:
movdqu xmm1,[rsi+rax] ; get the source
psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
@@ -671,12 +671,12 @@ addnoise_nextset:
add rax,16 ; move to the next line
cmp rax, rcx
- jl addnoise_nextset
+ jl .addnoise_nextset
movsxd rax, dword arg(7) ; Pitch
add arg(0), rax ; Start += Pitch
sub dword arg(6), 1 ; Height -= 1
- jg addnoise_loop
+ jg .addnoise_loop
; begin epilog
pop rdi
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 0e23116ce..f54cc4e7e 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -503,7 +503,7 @@ sym(vp8_intra_pred_uv_tm_%1):
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
-vp8_intra_pred_uv_tm_%1_loop:
+.vp8_intra_pred_uv_tm_%1_loop:
movd xmm3, [rsi]
movd xmm5, [rsi+rax]
%ifidn %1, sse2
@@ -525,7 +525,7 @@ vp8_intra_pred_uv_tm_%1_loop:
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
- jnz vp8_intra_pred_uv_tm_%1_loop
+ jnz .vp8_intra_pred_uv_tm_%1_loop
; begin epilog
pop rdi
@@ -615,7 +615,7 @@ sym(vp8_intra_pred_uv_ho_%1):
%endif
dec rsi
%ifidn %1, mmx2
-vp8_intra_pred_uv_ho_%1_loop:
+.vp8_intra_pred_uv_ho_%1_loop:
movd mm0, [rsi]
movd mm1, [rsi+rax]
punpcklbw mm0, mm0
@@ -627,7 +627,7 @@ vp8_intra_pred_uv_ho_%1_loop:
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
- jnz vp8_intra_pred_uv_ho_%1_loop
+ jnz .vp8_intra_pred_uv_ho_%1_loop
%else
movd xmm0, [rsi]
movd xmm3, [rsi+rax]
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index 9004b525d..e68d950ad 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -50,7 +50,7 @@ sym(vp8_filter_block1d_h6_mmx):
movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
-nextrow:
+.nextrow:
movq mm3, [rsi-2] ; mm3 = p-2..p5
movq mm4, mm3 ; mm4 = p-2..p5
psrlq mm3, 8 ; mm3 = p-1..p5
@@ -102,7 +102,7 @@ nextrow:
%endif
dec rcx ; decrement count
- jnz nextrow ; next row
+ jnz .nextrow ; next row
; begin epilog
pop rdi
@@ -152,7 +152,7 @@ sym(vp8_filter_block1dc_v6_mmx):
pxor mm0, mm0 ; mm0 = 00000000
-nextrow_cv:
+.nextrow_cv:
movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
@@ -190,7 +190,7 @@ nextrow_cv:
; avoidable!!!.
lea rdi, [rdi+rax] ;
dec rcx ; decrement count
- jnz nextrow_cv ; next row
+ jnz .nextrow_cv ; next row
pop rbx
@@ -282,7 +282,7 @@ sym(vp8_bilinear_predict8x8_mmx):
packuswb mm7, mm4 ;
add rsi, rdx ; next line
-next_row_8x8:
+.next_row_8x8:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
@@ -349,7 +349,7 @@ next_row_8x8:
add rdi, r8 ;dst_pitch
%endif
cmp rdi, rcx ;
- jne next_row_8x8
+ jne .next_row_8x8
; begin epilog
pop rdi
@@ -437,7 +437,7 @@ sym(vp8_bilinear_predict8x4_mmx):
packuswb mm7, mm4 ;
add rsi, rdx ; next line
-next_row_8x4:
+.next_row_8x4:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
@@ -504,7 +504,7 @@ next_row_8x4:
add rdi, r8
%endif
cmp rdi, rcx ;
- jne next_row_8x4
+ jne .next_row_8x4
; begin epilog
pop rdi
@@ -579,7 +579,7 @@ sym(vp8_bilinear_predict4x4_mmx):
packuswb mm7, mm0 ;
add rsi, rdx ; next line
-next_row_4x4:
+.next_row_4x4:
movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
@@ -622,7 +622,7 @@ next_row_4x4:
%endif
cmp rdi, rcx ;
- jne next_row_4x4
+ jne .next_row_4x4
; begin epilog
pop rdi
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index 83e3b1479..b62b5c68d 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -55,7 +55,7 @@ sym(vp8_filter_block1d8_h6_sse2):
%endif
pxor xmm0, xmm0 ; clear xmm0 for unpack
-filter_block1d8_h6_rowloop:
+.filter_block1d8_h6_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
@@ -124,7 +124,7 @@ filter_block1d8_h6_rowloop:
%endif
dec rcx
- jnz filter_block1d8_h6_rowloop ; next row
+ jnz .filter_block1d8_h6_rowloop ; next row
; begin epilog
pop rdi
@@ -176,7 +176,7 @@ sym(vp8_filter_block1d16_h6_sse2):
pxor xmm0, xmm0 ; clear xmm0 for unpack
-filter_block1d16_h6_sse2_rowloop:
+.filter_block1d16_h6_sse2_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
@@ -301,7 +301,7 @@ filter_block1d16_h6_sse2_rowloop:
%endif
dec rcx
- jnz filter_block1d16_h6_sse2_rowloop ; next row
+ jnz .filter_block1d16_h6_sse2_rowloop ; next row
; begin epilog
pop rdi
@@ -356,7 +356,7 @@ sym(vp8_filter_block1d8_v6_sse2):
movsxd r8, dword ptr arg(2) ; dst_ptich
%endif
-vp8_filter_block1d8_v6_sse2_loop:
+.vp8_filter_block1d8_v6_sse2_loop:
movdqa xmm1, XMMWORD PTR [rsi]
pmullw xmm1, [rax]
@@ -396,7 +396,7 @@ vp8_filter_block1d8_v6_sse2_loop:
add rdi, r8
%endif
dec rcx ; decrement count
- jnz vp8_filter_block1d8_v6_sse2_loop ; next row
+ jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
; begin epilog
pop rdi
@@ -448,7 +448,7 @@ sym(vp8_filter_block1d16_v6_sse2):
movsxd r8, dword ptr arg(2) ; dst_ptich
%endif
-vp8_filter_block1d16_v6_sse2_loop:
+.vp8_filter_block1d16_v6_sse2_loop:
; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
@@ -511,7 +511,7 @@ vp8_filter_block1d16_v6_sse2_loop:
add rdi, r8
%endif
dec rcx ; decrement count
- jnz vp8_filter_block1d16_v6_sse2_loop ; next row
+ jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
; begin epilog
pop rdi
@@ -556,7 +556,7 @@ sym(vp8_filter_block1d8_h6_only_sse2):
%endif
pxor xmm0, xmm0 ; clear xmm0 for unpack
-filter_block1d8_h6_only_rowloop:
+.filter_block1d8_h6_only_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
@@ -624,7 +624,7 @@ filter_block1d8_h6_only_rowloop:
%endif
dec rcx
- jnz filter_block1d8_h6_only_rowloop ; next row
+ jnz .filter_block1d8_h6_only_rowloop ; next row
; begin epilog
pop rdi
@@ -670,7 +670,7 @@ sym(vp8_filter_block1d16_h6_only_sse2):
pxor xmm0, xmm0 ; clear xmm0 for unpack
-filter_block1d16_h6_only_sse2_rowloop:
+.filter_block1d16_h6_only_sse2_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
@@ -789,7 +789,7 @@ filter_block1d16_h6_only_sse2_rowloop:
%endif
dec rcx
- jnz filter_block1d16_h6_only_sse2_rowloop ; next row
+ jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
; begin epilog
pop rdi
@@ -837,7 +837,7 @@ sym(vp8_filter_block1d8_v6_only_sse2):
movsxd r8, dword ptr arg(3) ; dst_ptich
%endif
-vp8_filter_block1d8_v6_only_sse2_loop:
+.vp8_filter_block1d8_v6_only_sse2_loop:
movq xmm1, MMWORD PTR [rsi]
movq xmm2, MMWORD PTR [rsi + rdx]
movq xmm3, MMWORD PTR [rsi + rdx * 2]
@@ -883,7 +883,7 @@ vp8_filter_block1d8_v6_only_sse2_loop:
add rdi, r8
%endif
dec rcx ; decrement count
- jnz vp8_filter_block1d8_v6_only_sse2_loop ; next row
+ jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
; begin epilog
pop rdi
@@ -924,7 +924,7 @@ sym(vp8_unpack_block1d16_h6_sse2):
movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
%endif
-unpack_block1d16_h6_sse2_rowloop:
+.unpack_block1d16_h6_sse2_rowloop:
movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1
@@ -941,7 +941,7 @@ unpack_block1d16_h6_sse2_rowloop:
add rdi, r8
%endif
dec rcx
- jnz unpack_block1d16_h6_sse2_rowloop ; next row
+ jnz .unpack_block1d16_h6_sse2_rowloop ; next row
; begin epilog
pop rdi
@@ -980,7 +980,7 @@ sym(vp8_bilinear_predict16x16_sse2):
movsxd rax, dword ptr arg(2) ;xoffset
cmp rax, 0 ;skip first_pass filter if xoffset=0
- je b16x16_sp_only
+ je .b16x16_sp_only
shl rax, 5
add rax, rcx ;HFilter
@@ -995,7 +995,7 @@ sym(vp8_bilinear_predict16x16_sse2):
movsxd rax, dword ptr arg(3) ;yoffset
cmp rax, 0 ;skip second_pass filter if yoffset=0
- je b16x16_fp_only
+ je .b16x16_fp_only
shl rax, 5
add rax, rcx ;VFilter
@@ -1041,7 +1041,7 @@ sym(vp8_bilinear_predict16x16_sse2):
packuswb xmm7, xmm4
add rsi, rdx ; next line
-next_row:
+.next_row:
movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm4, xmm3 ; make a copy of current line
@@ -1104,11 +1104,11 @@ next_row:
%endif
cmp rdi, rcx
- jne next_row
+ jne .next_row
- jmp done
+ jmp .done
-b16x16_sp_only:
+.b16x16_sp_only:
movsxd rax, dword ptr arg(3) ;yoffset
shl rax, 5
add rax, rcx ;VFilter
@@ -1130,7 +1130,7 @@ b16x16_sp_only:
movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
add rsi, rax ; next line
-next_row_spo:
+.next_row_spo:
movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm5, xmm7
@@ -1164,17 +1164,17 @@ next_row_spo:
add rsi, rax ; next line
add rdi, rdx ;dst_pitch
cmp rdi, rcx
- jne next_row_spo
+ jne .next_row_spo
- jmp done
+ jmp .done
-b16x16_fp_only:
+.b16x16_fp_only:
lea rcx, [rdi+rdx*8]
lea rcx, [rcx+rdx*8]
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
pxor xmm0, xmm0
-next_row_fpo:
+.next_row_fpo:
movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm4, xmm3 ; make a copy of current line
@@ -1208,9 +1208,9 @@ next_row_fpo:
add rsi, rax ; next line
add rdi, rdx ; dst_pitch
cmp rdi, rcx
- jne next_row_fpo
+ jne .next_row_fpo
-done:
+.done:
; begin epilog
pop rdi
pop rsi
@@ -1318,7 +1318,7 @@ sym(vp8_bilinear_predict8x8_sse2):
movdqa xmm7, xmm3
add rsp, 16 ; next line
-next_row8x8:
+.next_row8x8:
movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
movdqa xmm4, xmm3 ; make a copy of current line
psrldq xmm4, 1
@@ -1352,7 +1352,7 @@ next_row8x8:
add rdi, rdx
cmp rdi, rcx
- jne next_row8x8
+ jne .next_row8x8
;add rsp, 144
pop rsp
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 1ddbc54bd..6bca82bfb 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -70,7 +70,7 @@ sym(vp8_filter_block1d8_h6_ssse3):
sub rdi, rdx
;xmm3 free
-filter_block1d8_h6_rowloop_ssse3:
+.filter_block1d8_h6_rowloop_ssse3:
movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
@@ -102,7 +102,7 @@ filter_block1d8_h6_rowloop_ssse3:
packuswb xmm0, xmm0
movq MMWORD Ptr [rdi], xmm0
- jnz filter_block1d8_h6_rowloop_ssse3
+ jnz .filter_block1d8_h6_rowloop_ssse3
; begin epilog
pop rdi
@@ -129,7 +129,7 @@ vp8_filter_block1d8_h4_ssse3:
sub rdi, rdx
-filter_block1d8_h4_rowloop_ssse3:
+.filter_block1d8_h4_rowloop_ssse3:
movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
@@ -158,7 +158,7 @@ filter_block1d8_h4_rowloop_ssse3:
movq MMWORD Ptr [rdi], xmm0
- jnz filter_block1d8_h4_rowloop_ssse3
+ jnz .filter_block1d8_h4_rowloop_ssse3
; begin epilog
pop rdi
@@ -207,7 +207,7 @@ sym(vp8_filter_block1d16_h6_ssse3):
movsxd rcx, dword ptr arg(4) ;output_height
movsxd rdx, dword ptr arg(3) ;output_pitch
-filter_block1d16_h6_rowloop_ssse3:
+.filter_block1d16_h6_rowloop_ssse3:
movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
@@ -264,7 +264,7 @@ filter_block1d16_h6_rowloop_ssse3:
lea rdi, [rdi + rdx]
dec rcx
- jnz filter_block1d16_h6_rowloop_ssse3
+ jnz .filter_block1d16_h6_rowloop_ssse3
; begin epilog
pop rdi
@@ -304,7 +304,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
movdqa xmm7, [GLOBAL(rd)]
cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d4_h4_ssse3
+ je .vp8_filter_block1d4_h4_ssse3
movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
@@ -318,7 +318,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
movsxd rdx, dword ptr arg(3) ;output_pitch
;xmm3 free
-filter_block1d4_h6_rowloop_ssse3:
+.filter_block1d4_h6_rowloop_ssse3:
movdqu xmm0, XMMWORD PTR [rsi - 2]
movdqa xmm1, xmm0
@@ -346,7 +346,7 @@ filter_block1d4_h6_rowloop_ssse3:
add rdi, rdx
dec rcx
- jnz filter_block1d4_h6_rowloop_ssse3
+ jnz .filter_block1d4_h6_rowloop_ssse3
; begin epilog
pop rdi
@@ -356,7 +356,7 @@ filter_block1d4_h6_rowloop_ssse3:
pop rbp
ret
-vp8_filter_block1d4_h4_ssse3:
+.vp8_filter_block1d4_h4_ssse3:
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
@@ -369,7 +369,7 @@ vp8_filter_block1d4_h4_ssse3:
movsxd rdx, dword ptr arg(3) ;output_pitch
-filter_block1d4_h4_rowloop_ssse3:
+.filter_block1d4_h4_rowloop_ssse3:
movdqu xmm1, XMMWORD PTR [rsi - 2]
movdqa xmm2, xmm1
@@ -391,7 +391,7 @@ filter_block1d4_h4_rowloop_ssse3:
add rdi, rdx
dec rcx
- jnz filter_block1d4_h4_rowloop_ssse3
+ jnz .filter_block1d4_h4_rowloop_ssse3
; begin epilog
pop rdi
@@ -432,7 +432,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
add rax, rdx
cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d16_v4_ssse3
+ je .vp8_filter_block1d16_v4_ssse3
movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
@@ -450,7 +450,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
add rax, rdx
-vp8_filter_block1d16_v6_ssse3_loop:
+.vp8_filter_block1d16_v6_ssse3_loop:
movq xmm1, MMWORD PTR [rsi] ;A
movq xmm2, MMWORD PTR [rsi + rdx] ;B
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
@@ -508,7 +508,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d16_v6_ssse3_loop
+ jnz .vp8_filter_block1d16_v6_ssse3_loop
; begin epilog
pop rdi
@@ -519,7 +519,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
pop rbp
ret
-vp8_filter_block1d16_v4_ssse3:
+.vp8_filter_block1d16_v4_ssse3:
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
@@ -534,7 +534,7 @@ vp8_filter_block1d16_v4_ssse3:
movsxd rcx, DWORD PTR arg(4) ;output_height
add rax, rdx
-vp8_filter_block1d16_v4_ssse3_loop:
+.vp8_filter_block1d16_v4_ssse3_loop:
movq xmm2, MMWORD PTR [rsi + rdx] ;B
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
@@ -581,7 +581,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d16_v4_ssse3_loop
+ jnz .vp8_filter_block1d16_v4_ssse3_loop
; begin epilog
pop rdi
@@ -627,7 +627,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
movsxd rcx, DWORD PTR arg(4) ;[output_height]
cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d8_v4_ssse3
+ je .vp8_filter_block1d8_v4_ssse3
movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
@@ -638,7 +638,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
mov rax, rsi
add rax, rdx
-vp8_filter_block1d8_v6_ssse3_loop:
+.vp8_filter_block1d8_v6_ssse3_loop:
movq xmm1, MMWORD PTR [rsi] ;A
movq xmm2, MMWORD PTR [rsi + rdx] ;B
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
@@ -673,7 +673,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d8_v6_ssse3_loop
+ jnz .vp8_filter_block1d8_v6_ssse3_loop
; begin epilog
pop rdi
@@ -684,7 +684,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
pop rbp
ret
-vp8_filter_block1d8_v4_ssse3:
+.vp8_filter_block1d8_v4_ssse3:
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
movdqa xmm5, [GLOBAL(rd)]
@@ -694,7 +694,7 @@ vp8_filter_block1d8_v4_ssse3:
mov rax, rsi
add rax, rdx
-vp8_filter_block1d8_v4_ssse3_loop:
+.vp8_filter_block1d8_v4_ssse3_loop:
movq xmm2, MMWORD PTR [rsi + rdx] ;B
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
@@ -722,7 +722,7 @@ vp8_filter_block1d8_v4_ssse3_loop:
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d8_v4_ssse3_loop
+ jnz .vp8_filter_block1d8_v4_ssse3_loop
; begin epilog
pop rdi
@@ -766,7 +766,7 @@ sym(vp8_filter_block1d4_v6_ssse3):
movsxd rcx, DWORD PTR arg(4) ;[output_height]
cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d4_v4_ssse3
+ je .vp8_filter_block1d4_v4_ssse3
movq mm5, MMWORD PTR [rax] ;k0_k5
movq mm6, MMWORD PTR [rax+256] ;k2_k4
@@ -777,7 +777,7 @@ sym(vp8_filter_block1d4_v6_ssse3):
mov rax, rsi
add rax, rdx
-vp8_filter_block1d4_v6_ssse3_loop:
+.vp8_filter_block1d4_v6_ssse3_loop:
movd mm1, DWORD PTR [rsi] ;A
movd mm2, DWORD PTR [rsi + rdx] ;B
movd mm3, DWORD PTR [rsi + rdx * 2] ;C
@@ -813,7 +813,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d4_v6_ssse3_loop
+ jnz .vp8_filter_block1d4_v6_ssse3_loop
; begin epilog
pop rdi
@@ -823,7 +823,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
pop rbp
ret
-vp8_filter_block1d4_v4_ssse3:
+.vp8_filter_block1d4_v4_ssse3:
movq mm6, MMWORD PTR [rax+256] ;k2_k4
movq mm7, MMWORD PTR [rax+128] ;k1_k3
movq mm5, MMWORD PTR [GLOBAL(rd)]
@@ -833,7 +833,7 @@ vp8_filter_block1d4_v4_ssse3:
mov rax, rsi
add rax, rdx
-vp8_filter_block1d4_v4_ssse3_loop:
+.vp8_filter_block1d4_v4_ssse3_loop:
movd mm2, DWORD PTR [rsi + rdx] ;B
movd mm3, DWORD PTR [rsi + rdx * 2] ;C
movd mm4, DWORD PTR [rax + rdx * 2] ;D
@@ -861,7 +861,7 @@ vp8_filter_block1d4_v4_ssse3_loop:
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d4_v4_ssse3_loop
+ jnz .vp8_filter_block1d4_v4_ssse3_loop
; begin epilog
pop rdi
@@ -895,7 +895,7 @@ sym(vp8_bilinear_predict16x16_ssse3):
movsxd rax, dword ptr arg(2) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
- je b16x16_sp_only
+ je .b16x16_sp_only
shl rax, 4
lea rax, [rax + rcx] ; HFilter
@@ -909,7 +909,7 @@ sym(vp8_bilinear_predict16x16_ssse3):
movsxd rax, dword ptr arg(3) ; yoffset
cmp rax, 0 ; skip second_pass filter if yoffset=0
- je b16x16_fp_only
+ je .b16x16_fp_only
shl rax, 4
lea rax, [rax + rcx] ; VFilter
@@ -996,9 +996,9 @@ sym(vp8_bilinear_predict16x16_ssse3):
cmp rdi, rcx
jne .next_row
- jmp done
+ jmp .done
-b16x16_sp_only:
+.b16x16_sp_only:
movsxd rax, dword ptr arg(3) ; yoffset
shl rax, 4
lea rax, [rax + rcx] ; VFilter
@@ -1018,7 +1018,7 @@ b16x16_sp_only:
movq xmm2, [rsi + 8] ; load row 0
lea rsi, [rsi + rax] ; next line
-.next_row:
+.next_row_sp:
movq xmm3, [rsi] ; load row + 1
movq xmm5, [rsi + 8] ; load row + 1
@@ -1062,16 +1062,16 @@ b16x16_sp_only:
lea rdi, [rdi + 2*rdx]
cmp rdi, rcx
- jne .next_row
+ jne .next_row_sp
- jmp done
+ jmp .done
-b16x16_fp_only:
+.b16x16_fp_only:
lea rcx, [rdi+rdx*8]
lea rcx, [rcx+rdx*8]
movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-.next_row:
+.next_row_fp:
movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
@@ -1122,9 +1122,9 @@ b16x16_fp_only:
cmp rdi, rcx
- jne .next_row
+ jne .next_row_fp
-done:
+.done:
; begin epilog
pop rdi
pop rsi
@@ -1191,7 +1191,7 @@ sym(vp8_bilinear_predict8x8_ssse3):
movsxd rax, dword ptr arg(2) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
- je b8x8_sp_only
+ je .b8x8_sp_only
shl rax, 4
add rax, rcx ; HFilter
@@ -1203,7 +1203,7 @@ sym(vp8_bilinear_predict8x8_ssse3):
movsxd rax, dword ptr arg(3) ; yoffset
cmp rax, 0 ; skip second_pass filter if yoffset=0
- je b8x8_fp_only
+ je .b8x8_fp_only
shl rax, 4
lea rax, [rax + rcx] ; VFilter
@@ -1260,9 +1260,9 @@ sym(vp8_bilinear_predict8x8_ssse3):
cmp rdi, rcx
jne .next_row
- jmp done8x8
+ jmp .done8x8
-b8x8_sp_only:
+.b8x8_sp_only:
movsxd rax, dword ptr arg(3) ; yoffset
shl rax, 4
lea rax, [rax + rcx] ; VFilter
@@ -1364,12 +1364,12 @@ b8x8_sp_only:
movq [rdi+rdx], xmm1
lea rsp, [rsp + 144]
- jmp done8x8
+ jmp .done8x8
-b8x8_fp_only:
+.b8x8_fp_only:
lea rcx, [rdi+rdx*8]
-.next_row:
+.next_row_fp:
movdqa xmm1, XMMWORD PTR [rsp]
movdqa xmm3, XMMWORD PTR [rsp+16]
@@ -1430,11 +1430,11 @@ b8x8_fp_only:
lea rdi, [rdi + 2*rdx]
cmp rdi, rcx
- jne .next_row
+ jne .next_row_fp
lea rsp, [rsp + 16]
-done8x8:
+.done8x8:
;add rsp, 144
pop rsp
; begin epilog
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index 994629499..7ec7d603c 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -148,7 +148,7 @@ sym(vp8_mbblock_error_mmx_impl):
pcmpeqw mm1, mm7
mov rcx, 16
-mberror_loop_mmx:
+.mberror_loop_mmx:
movq mm3, [rsi]
movq mm4, [rdi]
@@ -186,7 +186,7 @@ mberror_loop_mmx:
add rdi, 32
sub rcx, 1
- jnz mberror_loop_mmx
+ jnz .mberror_loop_mmx
movq mm0, mm2
psrlq mm2, 32
@@ -226,7 +226,7 @@ sym(vp8_mbblock_error_xmm_impl):
pcmpeqw xmm5, xmm6
mov rcx, 16
-mberror_loop:
+.mberror_loop:
movdqa xmm0, [rsi]
movdqa xmm1, [rdi]
@@ -249,7 +249,7 @@ mberror_loop:
paddd xmm4, xmm2
paddd xmm4, xmm0
- jnz mberror_loop
+ jnz .mberror_loop
movdqa xmm0, xmm4
punpckldq xmm0, xmm6
@@ -289,7 +289,7 @@ sym(vp8_mbuverror_mmx_impl):
mov rcx, 16
pxor mm7, mm7
-mbuverror_loop_mmx:
+.mbuverror_loop_mmx:
movq mm1, [rsi]
movq mm2, [rdi]
@@ -313,7 +313,7 @@ mbuverror_loop_mmx:
add rdi, 16
dec rcx
- jnz mbuverror_loop_mmx
+ jnz .mbuverror_loop_mmx
movq mm0, mm7
psrlq mm7, 32
@@ -346,7 +346,7 @@ sym(vp8_mbuverror_xmm_impl):
mov rcx, 16
pxor xmm3, xmm3
-mbuverror_loop:
+.mbuverror_loop:
movdqa xmm1, [rsi]
movdqa xmm2, [rdi]
@@ -360,7 +360,7 @@ mbuverror_loop:
add rdi, 16
dec rcx
- jnz mbuverror_loop
+ jnz .mbuverror_loop
pxor xmm0, xmm0
movdqa xmm1, xmm3
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 056b64c39..c483933df 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -137,17 +137,17 @@ sym(vp8_regular_quantize_b_sse2):
; if (x >= zbin)
sub cx, WORD PTR[rdx] ; x - zbin
lea rdx, [rdx + 2] ; zbin_boost_ptr++
- jl rq_zigzag_loop_%1 ; x < zbin
+ jl .rq_zigzag_loop_%1 ; x < zbin
movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
; downshift by quant_shift[rc]
movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
sar edi, cl ; also sets Z bit
- je rq_zigzag_loop_%1 ; !y
+ je .rq_zigzag_loop_%1 ; !y
mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP 0
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
index 258899eed..95e1c2074 100644
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -140,21 +140,21 @@ sym(vp8_regular_quantize_b_sse4):
; if (x >= zbin)
sub cx, WORD PTR[rdx] ; x - zbin
lea rdx, [rdx + 2] ; zbin_boost_ptr++
- jl rq_zigzag_loop_%1 ; x < zbin
+ jl .rq_zigzag_loop_%1 ; x < zbin
pextrw edi, %3, %2 ; y
; downshift by quant_shift[rc]
pextrb ecx, xmm5, %1 ; quant_shift[rc]
sar edi, cl ; also sets Z bit
- je rq_zigzag_loop_%1 ; !y
+ je .rq_zigzag_loop_%1 ; !y
%if ABI_IS_32BIT
mov WORD PTR[rsp + qcoeff + %1 *2], di
%else
pinsrw %5, edi, %2 ; qcoeff[rc]
%endif
mov rdx, rax ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
index 85cb023a4..407b39979 100644
--- a/vp8/encoder/x86/sad_mmx.asm
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -43,7 +43,7 @@ sym(vp8_sad16x16_mmx):
pxor mm6, mm6
-x16x16sad_mmx_loop:
+.x16x16sad_mmx_loop:
movq mm0, QWORD PTR [rsi]
movq mm2, QWORD PTR [rsi+8]
@@ -83,7 +83,7 @@ x16x16sad_mmx_loop:
paddw mm7, mm1
cmp rsi, rcx
- jne x16x16sad_mmx_loop
+ jne .x16x16sad_mmx_loop
movq mm0, mm7
@@ -135,7 +135,7 @@ sym(vp8_sad8x16_mmx):
pxor mm6, mm6
-x8x16sad_mmx_loop:
+.x8x16sad_mmx_loop:
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -158,7 +158,7 @@ x8x16sad_mmx_loop:
paddw mm7, mm2
cmp rsi, rcx
- jne x8x16sad_mmx_loop
+ jne .x8x16sad_mmx_loop
movq mm0, mm7
punpcklwd mm0, mm6
@@ -205,7 +205,7 @@ sym(vp8_sad8x8_mmx):
pxor mm6, mm6
-x8x8sad_mmx_loop:
+.x8x8sad_mmx_loop:
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -228,7 +228,7 @@ x8x8sad_mmx_loop:
paddw mm7, mm0
cmp rsi, rcx
- jne x8x8sad_mmx_loop
+ jne .x8x8sad_mmx_loop
movq mm0, mm7
punpcklwd mm0, mm6
@@ -364,7 +364,7 @@ sym(vp8_sad16x8_mmx):
pxor mm6, mm6
-x16x8sad_mmx_loop:
+.x16x8sad_mmx_loop:
movq mm0, [rsi]
movq mm1, [rdi]
@@ -404,7 +404,7 @@ x16x8sad_mmx_loop:
paddw mm7, mm0
cmp rsi, rcx
- jne x16x8sad_mmx_loop
+ jne .x16x8sad_mmx_loop
movq mm0, mm7
punpcklwd mm0, mm6
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 1011c9553..fa8e3e3f8 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -37,7 +37,7 @@ sym(vp8_sad16x16_wmt):
lea rcx, [rcx+rax*8]
pxor xmm6, xmm6
-x16x16sad_wmt_loop:
+.x16x16sad_wmt_loop:
movq xmm0, QWORD PTR [rsi]
movq xmm2, QWORD PTR [rsi+8]
@@ -68,7 +68,7 @@ x16x16sad_wmt_loop:
paddw xmm6, xmm4
cmp rsi, rcx
- jne x16x16sad_wmt_loop
+ jne .x16x16sad_wmt_loop
movq xmm0, xmm6
psrldq xmm6, 8
@@ -111,11 +111,11 @@ sym(vp8_sad8x16_wmt):
lea rcx, [rcx+rbx*8]
pxor mm7, mm7
-x8x16sad_wmt_loop:
+.x8x16sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
- jg x8x16sad_wmt_early_exit
+ jg .x8x16sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -133,11 +133,11 @@ x8x16sad_wmt_loop:
paddw mm7, mm2
cmp rsi, rcx
- jne x8x16sad_wmt_loop
+ jne .x8x16sad_wmt_loop
movq rax, mm7
-x8x16sad_wmt_early_exit:
+.x8x16sad_wmt_early_exit:
; begin epilog
pop rdi
@@ -172,11 +172,11 @@ sym(vp8_sad8x8_wmt):
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
-x8x8sad_wmt_loop:
+.x8x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
- jg x8x8sad_wmt_early_exit
+ jg .x8x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -188,10 +188,10 @@ x8x8sad_wmt_loop:
paddw mm7, mm0
cmp rsi, rcx
- jne x8x8sad_wmt_loop
+ jne .x8x8sad_wmt_loop
movq rax, mm7
-x8x8sad_wmt_early_exit:
+.x8x8sad_wmt_early_exit:
; begin epilog
pop rdi
@@ -281,11 +281,11 @@ sym(vp8_sad16x8_wmt):
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
-x16x8sad_wmt_loop:
+.x16x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
- jg x16x8sad_wmt_early_exit
+ jg .x16x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm2, QWORD PTR [rsi+8]
@@ -315,11 +315,11 @@ x16x8sad_wmt_loop:
paddw mm7, mm4
cmp rsi, rcx
- jne x16x8sad_wmt_loop
+ jne .x16x8sad_wmt_loop
movq rax, mm7
-x16x8sad_wmt_early_exit:
+.x16x8sad_wmt_early_exit:
; begin epilog
pop rdi
@@ -352,7 +352,7 @@ sym(vp8_copy32xn_sse2):
movsxd rdx, dword ptr arg(3) ;dst_stride
movsxd rcx, dword ptr arg(4) ;height
-block_copy_sse2_loopx4:
+.block_copy_sse2_loopx4:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
movdqu xmm2, XMMWORD PTR [rsi + rax]
@@ -383,12 +383,12 @@ block_copy_sse2_loopx4:
sub rcx, 4
cmp rcx, 4
- jge block_copy_sse2_loopx4
+ jge .block_copy_sse2_loopx4
cmp rcx, 0
- je copy_is_done
+ je .copy_is_done
-block_copy_sse2_loop:
+.block_copy_sse2_loop:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
lea rsi, [rsi+rax]
@@ -398,9 +398,9 @@ block_copy_sse2_loop:
lea rdi, [rdi+rdx]
sub rcx, 1
- jne block_copy_sse2_loop
+ jne .block_copy_sse2_loop
-copy_is_done:
+.copy_is_done:
; begin epilog
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 9e0552166..a2550974c 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -647,7 +647,7 @@ sym(vp8_copy32xn_sse3):
STACK_FRAME_CREATE_X3
-block_copy_sse3_loopx4:
+.block_copy_sse3_loopx4:
lea end_ptr, [src_ptr+src_stride*2]
movdqu xmm0, XMMWORD PTR [src_ptr]
@@ -676,13 +676,13 @@ block_copy_sse3_loopx4:
sub height, 4
cmp height, 4
- jge block_copy_sse3_loopx4
+ jge .block_copy_sse3_loopx4
;Check to see if there is more rows need to be copied.
cmp height, 0
- je copy_is_done
+ je .copy_is_done
-block_copy_sse3_loop:
+.block_copy_sse3_loop:
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
lea src_ptr, [src_ptr+src_stride]
@@ -692,9 +692,9 @@ block_copy_sse3_loop:
lea ref_ptr, [ref_ptr+ref_stride]
sub height, 1
- jne block_copy_sse3_loop
+ jne .block_copy_sse3_loop
-copy_is_done:
+.copy_is_done:
STACK_FRAME_DESTROY_X3
;void vp8_sad16x16x4d_sse3(
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 6ecf08184..95b6c89e6 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -169,30 +169,30 @@ sym(vp8_sad16x16x3_ssse3):
mov rdx, 0xf
and rdx, rdi
- jmp vp8_sad16x16x3_ssse3_skiptable
-vp8_sad16x16x3_ssse3_jumptable:
- dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_skiptable:
-
- call vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_do_jump:
+ jmp .vp8_sad16x16x3_ssse3_skiptable
+.vp8_sad16x16x3_ssse3_jumptable:
+ dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_skiptable:
+
+ call .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
+ mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
@@ -203,23 +203,23 @@ vp8_sad16x16x3_ssse3_do_jump:
jmp rcx
- PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
-
-vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
+
+.vp8_sad16x16x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
@@ -229,7 +229,7 @@ vp8_sad16x16x3_ssse3_aligned_by_15:
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-vp8_sad16x16x3_ssse3_store_off:
+.vp8_sad16x16x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
@@ -282,30 +282,30 @@ sym(vp8_sad16x8x3_ssse3):
mov rdx, 0xf
and rdx, rdi
- jmp vp8_sad16x8x3_ssse3_skiptable
-vp8_sad16x8x3_ssse3_jumptable:
- dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_skiptable:
-
- call vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_do_jump:
+ jmp .vp8_sad16x8x3_ssse3_skiptable
+.vp8_sad16x8x3_ssse3_jumptable:
+ dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_skiptable:
+
+ call .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
+ mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
@@ -316,30 +316,30 @@ vp8_sad16x8x3_ssse3_do_jump:
jmp rcx
- PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
-
-vp8_sad16x8x3_ssse3_aligned_by_15:
+ PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
+
+.vp8_sad16x8x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-vp8_sad16x8x3_ssse3_store_off:
+.vp8_sad16x8x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
index 8af4b4533..c6db3d1c6 100644
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -84,7 +84,7 @@ sym(vp8_ssim_parms_16x16_sse2):
pxor xmm11,xmm11 ;sum_sxr
mov rdx, 16 ;row counter
-NextRow:
+.NextRow:
;grab source and reference pixels
movdqu xmm5, [rsi]
@@ -107,7 +107,7 @@ NextRow:
add rdi, rax ; next r row
dec rdx ; counter
- jnz NextRow
+ jnz .NextRow
SUM_ACROSS_W xmm15
SUM_ACROSS_W xmm14
@@ -174,7 +174,7 @@ sym(vp8_ssim_parms_8x8_sse2):
pxor xmm11,xmm11 ;sum_sxr
mov rdx, 8 ;row counter
-NextRow2:
+.NextRow:
;grab source and reference pixels
movq xmm3, [rsi]
@@ -188,7 +188,7 @@ NextRow2:
add rdi, rax ; next r row
dec rdx ; counter
- jnz NextRow2
+ jnz .NextRow
SUM_ACROSS_W xmm15
SUM_ACROSS_W xmm14
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index a47e1f0d6..4ce16ce90 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -93,7 +93,7 @@ sym(vp8_subtract_mby_mmx):
mov rcx, 16
pxor mm0, mm0
-submby_loop:
+.submby_loop:
movq mm1, [rsi]
movq mm3, [rax]
@@ -139,7 +139,7 @@ submby_loop:
lea rsi, [rsi+rdx]
sub rcx, 1
- jnz submby_loop
+ jnz .submby_loop
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
index 95888f6be..3bd1ff678 100644
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -91,7 +91,7 @@ sym(vp8_subtract_mby_sse2):
mov rcx, 8 ; do two lines at one time
-submby_loop:
+.submby_loop:
movdqa xmm0, XMMWORD PTR [rsi] ; src
movdqa xmm1, XMMWORD PTR [rax] ; pred
@@ -133,7 +133,7 @@ submby_loop:
lea rsi, [rsi+rdx*2]
sub rcx, 1
- jnz submby_loop
+ jnz .submby_loop
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index b777ef566..b97c69439 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -71,26 +71,26 @@ sym(vp8_temporal_filter_apply_sse2):
lea rcx, [rdx + 16*16*1]
cmp dword ptr [rsp + block_size], 8
- jne temporal_filter_apply_load_16
+ jne .temporal_filter_apply_load_16
lea rcx, [rdx + 8*8*1]
-temporal_filter_apply_load_8:
+.temporal_filter_apply_load_8:
movq xmm0, [rsi] ; first row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm0, xmm7 ; src[ 0- 7]
movq xmm1, [rsi] ; second row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm1, xmm7 ; src[ 8-15]
- jmp temporal_filter_apply_load_finished
+ jmp .temporal_filter_apply_load_finished
-temporal_filter_apply_load_16:
+.temporal_filter_apply_load_16:
movdqa xmm0, [rsi] ; src (frame1)
lea rsi, [rsi + rbp] ; += stride
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7 ; src[ 0- 7]
punpckhbw xmm1, xmm7 ; src[ 8-15]
-temporal_filter_apply_load_finished:
+.temporal_filter_apply_load_finished:
movdqa xmm2, [rdx] ; predictor (frame2)
movdqa xmm3, xmm2
punpcklbw xmm2, xmm7 ; pred[ 0- 7]
@@ -176,13 +176,13 @@ temporal_filter_apply_load_finished:
lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
cmp rdx, rcx
- je temporal_filter_apply_epilog
+ je .temporal_filter_apply_epilog
pxor xmm7, xmm7 ; zero for extraction
cmp dword ptr [rsp + block_size], 16
- je temporal_filter_apply_load_16
- jmp temporal_filter_apply_load_8
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8
-temporal_filter_apply_epilog:
+.temporal_filter_apply_epilog:
; begin epilog
mov rbp, [rsp + rbp_backup]
add rsp, stack_size
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
index 13b76ea91..2be8bbeb3 100644
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -27,7 +27,7 @@ sym(vp8_get_mb_ss_mmx):
mov rcx, 16
pxor mm4, mm4
-NEXTROW:
+.NEXTROW:
movq mm0, [rax]
movq mm1, [rax+8]
movq mm2, [rax+16]
@@ -44,7 +44,7 @@ NEXTROW:
add rax, 32
dec rcx
- ja NEXTROW
+ ja .NEXTROW
movq QWORD PTR [rsp], mm4
;return sum[0]+sum[1];
@@ -568,7 +568,7 @@ sym(vp8_filter_block2d_bil4x4_var_mmx):
add rsi, r8
%endif
-filter_block2d_bil4x4_var_mmx_loop:
+.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
@@ -614,7 +614,7 @@ filter_block2d_bil4x4_var_mmx_loop:
add rdi, r9
%endif
sub rcx, 1 ;
- jnz filter_block2d_bil4x4_var_mmx_loop ;
+ jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
@@ -726,7 +726,7 @@ sym(vp8_filter_block2d_bil_var_mmx):
add rsi, r8
%endif
-filter_block2d_bil_var_mmx_loop:
+.filter_block2d_bil_var_mmx_loop:
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
@@ -807,7 +807,7 @@ filter_block2d_bil_var_mmx_loop:
add rdi, r9
%endif
sub rcx, 1 ;
- jnz filter_block2d_bil_var_mmx_loop ;
+ jnz .filter_block2d_bil_var_mmx_loop ;
pxor mm3, mm3 ;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index b7a6b3286..762922091 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -33,7 +33,7 @@ sym(vp8_get_mb_ss_sse2):
mov rcx, 8
pxor xmm4, xmm4
-NEXTROW:
+.NEXTROW:
movdqa xmm0, [rax]
movdqa xmm1, [rax+16]
movdqa xmm2, [rax+32]
@@ -50,7 +50,7 @@ NEXTROW:
add rax, 0x40
dec rcx
- ja NEXTROW
+ ja .NEXTROW
movdqa xmm3,xmm4
psrldq xmm4,8
@@ -126,7 +126,7 @@ sym(vp8_get16x16var_sse2):
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16
-var16loop:
+.var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
@@ -160,7 +160,7 @@ var16loop:
add rdi, rdx
sub rcx, 1
- jnz var16loop
+ jnz .var16loop
movdqa xmm1, xmm6
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
index a582f8dc5..97e8b0e2e 100644
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -47,7 +47,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
- je filter_block2d_bil_var_ssse3_sp_only
+ je .filter_block2d_bil_var_ssse3_sp_only
shl rax, 4 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
@@ -55,7 +55,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je filter_block2d_bil_var_ssse3_fp_only
+ je .filter_block2d_bil_var_ssse3_fp_only
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
@@ -88,7 +88,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
lea rsi, [rsi + r8]
%endif
-filter_block2d_bil_var_ssse3_loop:
+.filter_block2d_bil_var_ssse3_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
@@ -142,15 +142,15 @@ filter_block2d_bil_var_ssse3_loop:
%endif
sub rcx, 1
- jnz filter_block2d_bil_var_ssse3_loop
+ jnz .filter_block2d_bil_var_ssse3_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_var_ssse3_sp_only:
+.filter_block2d_bil_var_ssse3_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
- je filter_block2d_bil_var_ssse3_full_pixel
+ je .filter_block2d_bil_var_ssse3_full_pixel
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
@@ -169,7 +169,7 @@ filter_block2d_bil_var_ssse3_sp_only:
lea rsi, [rsi + rax]
-filter_block2d_bil_sp_only_loop:
+.filter_block2d_bil_sp_only_loop:
movdqu xmm3, XMMWORD PTR [rsi]
movdqa xmm2, xmm1
movdqa xmm0, xmm3
@@ -209,11 +209,11 @@ filter_block2d_bil_sp_only_loop:
%endif
sub rcx, 1
- jnz filter_block2d_bil_sp_only_loop
+ jnz .filter_block2d_bil_sp_only_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_var_ssse3_full_pixel:
+.filter_block2d_bil_var_ssse3_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
@@ -221,7 +221,7 @@ filter_block2d_bil_var_ssse3_full_pixel:
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0
-filter_block2d_bil_full_pixel_loop:
+.filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi]
punpcklbw xmm1, xmm0
movq xmm2, QWORD PTR [rsi+8]
@@ -244,11 +244,11 @@ filter_block2d_bil_full_pixel_loop:
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rdx] ;src_pixels_per_line
sub rcx, 1
- jnz filter_block2d_bil_full_pixel_loop
+ jnz .filter_block2d_bil_full_pixel_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_var_ssse3_fp_only:
+.filter_block2d_bil_var_ssse3_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
@@ -260,7 +260,7 @@ filter_block2d_bil_var_ssse3_fp_only:
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
-filter_block2d_bil_fp_only_loop:
+.filter_block2d_bil_fp_only_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
@@ -298,11 +298,11 @@ filter_block2d_bil_fp_only_loop:
%endif
sub rcx, 1
- jnz filter_block2d_bil_fp_only_loop
+ jnz .filter_block2d_bil_fp_only_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_variance:
+.filter_block2d_bil_variance:
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm5, xmm5