author     John Koleszar <jkoleszar@google.com>  2011-08-24 00:05:11 -0400
committer  John Koleszar <jkoleszar@google.com>  2011-08-24 00:05:11 -0400
commit     d2a2d5a6d5ea1886eef6078d180be364d80501bc (patch)
tree       f127bda8bd187da850a9c43e8c98163f4ee758d1 /vp8/encoder
parent     7cb25d9c562e094c47f37ad34a94d5cafcba2ece (diff)
parent     c5f890af2cff951048cc41630f2523b61fb74a0b (diff)
Merge remote branch 'origin/master' into experimental
Change-Id: If53ec5c1219b31e5ef9ae552d9cc79432ebda267
Diffstat (limited to 'vp8/encoder')
-rw-r--r--  vp8/encoder/generic/csystemdependent.c         |   9
-rw-r--r--  vp8/encoder/ssim.c                              |  19
-rw-r--r--  vp8/encoder/variance.h                          |  16
-rw-r--r--  vp8/encoder/x86/encodeopt.asm                   |  16
-rw-r--r--  vp8/encoder/x86/quantize_sse2.asm               |   6
-rw-r--r--  vp8/encoder/x86/quantize_sse4.asm               |   6
-rw-r--r--  vp8/encoder/x86/sad_mmx.asm                     |  16
-rw-r--r--  vp8/encoder/x86/sad_sse2.asm                    |  40
-rw-r--r--  vp8/encoder/x86/sad_sse3.asm                    |  12
-rw-r--r--  vp8/encoder/x86/sad_ssse3.asm                   | 164
-rw-r--r--  vp8/encoder/x86/ssim_opt.asm                    |  20
-rw-r--r--  vp8/encoder/x86/subtract_mmx.asm                |   4
-rw-r--r--  vp8/encoder/x86/subtract_sse2.asm               |   4
-rw-r--r--  vp8/encoder/x86/temporal_filter_apply_sse2.asm  |  18
-rw-r--r--  vp8/encoder/x86/variance_impl_mmx.asm           |  12
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm          |   8
-rw-r--r--  vp8/encoder/x86/variance_impl_ssse3.asm         |  38
-rw-r--r--  vp8/encoder/x86/variance_x86.h                  |  10
-rw-r--r--  vp8/encoder/x86/x86_csystemdependent.c          |  38
19 files changed, 217 insertions, 239 deletions
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index bd1959dff..1ca0f962f 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -94,16 +94,15 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
#if !(CONFIG_REALTIME_ONLY)
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_c;
#endif
+#if CONFIG_INTERNAL_STATS
+ cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_c;
+ cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_c;
+#endif
#endif
// Pure C:
vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
-#if CONFIG_INTERNAL_STATS
- cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c;
- cpi->rtcd.variance.ssimpf = ssim_parms_c;
-#endif
-
#if ARCH_X86 || ARCH_X86_64
vp8_arch_x86_encoder_init(cpi);
#endif
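
The hunk above follows libvpx's run-time CPU detection (RTCD) convention: the generic init first fills the cpi->rtcd.variance table with the portable C implementations, and vp8_arch_x86_encoder_init() may later replace individual entries with optimized versions. A minimal C sketch of that register-then-override pattern, with illustrative names that are not libvpx's actual ones:

    /* Sketch of the RTCD registration pattern; names are illustrative only. */
    #include <stdio.h>

    typedef void (*ssim_parms_fn)(void);        /* signature simplified */

    struct variance_vtable {
        ssim_parms_fn ssimpf_8x8;
        ssim_parms_fn ssimpf_16x16;
    };

    static void parms_8x8_c(void)    { puts("8x8: C fallback");    }
    static void parms_16x16_c(void)  { puts("16x16: C fallback");  }
    static void parms_8x8_sse2(void) { puts("8x8: SSE2 override"); }

    static void generic_init(struct variance_vtable *v)
    {
        /* portable defaults, as in vp8_cmachine_specific_config() */
        v->ssimpf_8x8   = parms_8x8_c;
        v->ssimpf_16x16 = parms_16x16_c;
    }

    static void arch_init(struct variance_vtable *v, int have_sse2)
    {
        /* arch-specific init overrides entries the CPU can accelerate */
        if (have_sse2)
            v->ssimpf_8x8 = parms_8x8_sse2;
    }

    int main(void)
    {
        struct variance_vtable v;
        generic_init(&v);
        arch_init(&v, 1);
        v.ssimpf_8x8();     /* dispatches to the SSE2 override */
        v.ssimpf_16x16();   /* still the C fallback            */
        return 0;
    }
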
diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
index fea756f7b..d0f8e490a 100644
--- a/vp8/encoder/ssim.c
+++ b/vp8/encoder/ssim.c
@@ -9,18 +9,9 @@
*/
-#include "vpx_scale/yv12config.h"
-#include "math.h"
#include "onyx_int.h"
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-
-void ssim_parms_c
+void vp8_ssim_parms_16x16_c
(
unsigned char *s,
int sp,
@@ -46,7 +37,7 @@ void ssim_parms_c
}
}
}
-void ssim_parms_8x8_c
+void vp8_ssim_parms_8x8_c
(
unsigned char *s,
int sp,
@@ -107,14 +98,14 @@ static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp,
const vp8_variance_rtcd_vtable_t *rtcd)
{
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
- rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
}
static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,
const vp8_variance_rtcd_vtable_t *rtcd)
{
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
- rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ SSIMPF_INVOKE(rtcd,8x8)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
}
@@ -134,7 +125,7 @@ long dssim(unsigned char *s,int sp, unsigned char *r,int rp,
c1 = cc1*16;
c2 = cc2*16;
- rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
ssim_n1 = (2*sum_s*sum_r+ c1);
ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2);
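
The renamed vp8_ssim_parms_8x8_c / vp8_ssim_parms_16x16_c routines accumulate five block statistics (source and reference pixel sums, both sums of squares, and the cross sum), which similarity() and dssim() then combine into the usual SSIM mean/variance/covariance terms; the last argument to similarity() (64 vs. 256) is the corresponding pixel count. A conceptual C sketch of the 8x8 accumulation follows; the loop body is inferred from the sum names in the hunks above rather than copied from libvpx:

    /* Conceptual sketch of an 8x8 SSIM-parameter accumulation (inferred,
     * not libvpx's exact code). */
    static void ssim_parms_8x8_sketch(const unsigned char *s, int sp,
                                      const unsigned char *r, int rp,
                                      unsigned long *sum_s, unsigned long *sum_r,
                                      unsigned long *sum_sq_s,
                                      unsigned long *sum_sq_r,
                                      unsigned long *sum_sxr)
    {
        int i, j;
        for (i = 0; i < 8; i++, s += sp, r += rp) {
            for (j = 0; j < 8; j++) {
                *sum_s    += s[j];            /* source pixel sum           */
                *sum_r    += r[j];            /* reference pixel sum        */
                *sum_sq_s += s[j] * s[j];     /* source energy              */
                *sum_sq_r += r[j] * r[j];     /* reference energy           */
                *sum_sxr  += s[j] * r[j];     /* source-reference cross sum */
            }
        }
    }
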
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 5fd6d3ae0..d9bf66975 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -320,16 +320,16 @@ extern prototype_variance(vp8_variance_mse16x16);
#endif
extern prototype_get16x16prederror(vp8_variance_get4x4sse_cs);
-#ifndef vp8_ssimpf
-#define vp8_ssimpf ssim_parms_c
-#endif
-extern prototype_ssimpf(vp8_ssimpf)
-
#ifndef vp8_ssimpf_8x8
-#define vp8_ssimpf_8x8 ssim_parms_8x8_c
+#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_c
#endif
extern prototype_ssimpf(vp8_ssimpf_8x8)
+#ifndef vp8_ssimpf_16x16
+#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_c
+#endif
+extern prototype_ssimpf(vp8_ssimpf_16x16)
+
typedef prototype_sad(*vp8_sad_fn_t);
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
@@ -394,7 +394,7 @@ typedef struct
#if CONFIG_INTERNAL_STATS
vp8_ssimpf_fn_t ssimpf_8x8;
- vp8_ssimpf_fn_t ssimpf;
+ vp8_ssimpf_fn_t ssimpf_16x16;
#endif
} vp8_variance_rtcd_vtable_t;
@@ -417,8 +417,10 @@ typedef struct
#if CONFIG_RUNTIME_CPU_DETECT
#define VARIANCE_INVOKE(ctx,fn) (ctx)->fn
+#define SSIMPF_INVOKE(ctx,fn) (ctx)->ssimpf_##fn
#else
#define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn
+#define SSIMPF_INVOKE(ctx,fn) vp8_ssimpf_##fn
#endif
#endif
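
The new SSIMPF_INVOKE macro mirrors the existing VARIANCE_INVOKE convention: with CONFIG_RUNTIME_CPU_DETECT the call goes through the rtcd vtable member, otherwise it collapses at compile time to whatever vp8_ssimpf_* default the #ifndef blocks above selected. A small self-contained C sketch of that two-mode dispatch, using hypothetical names in place of the libvpx ones:

    /* Sketch of the INVOKE-macro dispatch pattern; names are hypothetical. */
    #include <stdio.h>

    static void parms_c(void)    { puts("C version");    }
    static void parms_sse2(void) { puts("SSE2 version"); }

    /* compile-time default, analogous to the #ifndef vp8_ssimpf_8x8 block */
    #ifndef my_ssimpf_8x8
    #define my_ssimpf_8x8 parms_c
    #endif

    struct rtcd_vtable { void (*ssimpf_8x8)(void); };

    #ifdef MY_RUNTIME_CPU_DETECT
    /* runtime dispatch: read the function pointer out of the table */
    #define SSIMPF_INVOKE(ctx, fn) (ctx)->ssimpf_##fn
    #else
    /* static dispatch: expand directly to the compile-time default */
    #define SSIMPF_INVOKE(ctx, fn) my_ssimpf_##fn
    #endif

    int main(void)
    {
        struct rtcd_vtable table = { parms_sse2 };
        (void)table;                    /* unused when dispatch is static */
        SSIMPF_INVOKE(&table, 8x8)();   /* vtable entry or static default */
        return 0;
    }
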
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index 994629499..7ec7d603c 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -148,7 +148,7 @@ sym(vp8_mbblock_error_mmx_impl):
pcmpeqw mm1, mm7
mov rcx, 16
-mberror_loop_mmx:
+.mberror_loop_mmx:
movq mm3, [rsi]
movq mm4, [rdi]
@@ -186,7 +186,7 @@ mberror_loop_mmx:
add rdi, 32
sub rcx, 1
- jnz mberror_loop_mmx
+ jnz .mberror_loop_mmx
movq mm0, mm2
psrlq mm2, 32
@@ -226,7 +226,7 @@ sym(vp8_mbblock_error_xmm_impl):
pcmpeqw xmm5, xmm6
mov rcx, 16
-mberror_loop:
+.mberror_loop:
movdqa xmm0, [rsi]
movdqa xmm1, [rdi]
@@ -249,7 +249,7 @@ mberror_loop:
paddd xmm4, xmm2
paddd xmm4, xmm0
- jnz mberror_loop
+ jnz .mberror_loop
movdqa xmm0, xmm4
punpckldq xmm0, xmm6
@@ -289,7 +289,7 @@ sym(vp8_mbuverror_mmx_impl):
mov rcx, 16
pxor mm7, mm7
-mbuverror_loop_mmx:
+.mbuverror_loop_mmx:
movq mm1, [rsi]
movq mm2, [rdi]
@@ -313,7 +313,7 @@ mbuverror_loop_mmx:
add rdi, 16
dec rcx
- jnz mbuverror_loop_mmx
+ jnz .mbuverror_loop_mmx
movq mm0, mm7
psrlq mm7, 32
@@ -346,7 +346,7 @@ sym(vp8_mbuverror_xmm_impl):
mov rcx, 16
pxor xmm3, xmm3
-mbuverror_loop:
+.mbuverror_loop:
movdqa xmm1, [rsi]
movdqa xmm2, [rdi]
@@ -360,7 +360,7 @@ mbuverror_loop:
add rdi, 16
dec rcx
- jnz mbuverror_loop
+ jnz .mbuverror_loop
pxor xmm0, xmm0
movdqa xmm1, xmm3
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 056b64c39..c483933df 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -137,17 +137,17 @@ sym(vp8_regular_quantize_b_sse2):
; if (x >= zbin)
sub cx, WORD PTR[rdx] ; x - zbin
lea rdx, [rdx + 2] ; zbin_boost_ptr++
- jl rq_zigzag_loop_%1 ; x < zbin
+ jl .rq_zigzag_loop_%1 ; x < zbin
movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
; downshift by quant_shift[rc]
movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
sar edi, cl ; also sets Z bit
- je rq_zigzag_loop_%1 ; !y
+ je .rq_zigzag_loop_%1 ; !y
mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP 0
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
index 258899eed..95e1c2074 100644
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -140,21 +140,21 @@ sym(vp8_regular_quantize_b_sse4):
; if (x >= zbin)
sub cx, WORD PTR[rdx] ; x - zbin
lea rdx, [rdx + 2] ; zbin_boost_ptr++
- jl rq_zigzag_loop_%1 ; x < zbin
+ jl .rq_zigzag_loop_%1 ; x < zbin
pextrw edi, %3, %2 ; y
; downshift by quant_shift[rc]
pextrb ecx, xmm5, %1 ; quant_shift[rc]
sar edi, cl ; also sets Z bit
- je rq_zigzag_loop_%1 ; !y
+ je .rq_zigzag_loop_%1 ; !y
%if ABI_IS_32BIT
mov WORD PTR[rsp + qcoeff + %1 *2], di
%else
pinsrw %5, edi, %2 ; qcoeff[rc]
%endif
mov rdx, rax ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
index 85cb023a4..407b39979 100644
--- a/vp8/encoder/x86/sad_mmx.asm
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -43,7 +43,7 @@ sym(vp8_sad16x16_mmx):
pxor mm6, mm6
-x16x16sad_mmx_loop:
+.x16x16sad_mmx_loop:
movq mm0, QWORD PTR [rsi]
movq mm2, QWORD PTR [rsi+8]
@@ -83,7 +83,7 @@ x16x16sad_mmx_loop:
paddw mm7, mm1
cmp rsi, rcx
- jne x16x16sad_mmx_loop
+ jne .x16x16sad_mmx_loop
movq mm0, mm7
@@ -135,7 +135,7 @@ sym(vp8_sad8x16_mmx):
pxor mm6, mm6
-x8x16sad_mmx_loop:
+.x8x16sad_mmx_loop:
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -158,7 +158,7 @@ x8x16sad_mmx_loop:
paddw mm7, mm2
cmp rsi, rcx
- jne x8x16sad_mmx_loop
+ jne .x8x16sad_mmx_loop
movq mm0, mm7
punpcklwd mm0, mm6
@@ -205,7 +205,7 @@ sym(vp8_sad8x8_mmx):
pxor mm6, mm6
-x8x8sad_mmx_loop:
+.x8x8sad_mmx_loop:
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -228,7 +228,7 @@ x8x8sad_mmx_loop:
paddw mm7, mm0
cmp rsi, rcx
- jne x8x8sad_mmx_loop
+ jne .x8x8sad_mmx_loop
movq mm0, mm7
punpcklwd mm0, mm6
@@ -364,7 +364,7 @@ sym(vp8_sad16x8_mmx):
pxor mm6, mm6
-x16x8sad_mmx_loop:
+.x16x8sad_mmx_loop:
movq mm0, [rsi]
movq mm1, [rdi]
@@ -404,7 +404,7 @@ x16x8sad_mmx_loop:
paddw mm7, mm0
cmp rsi, rcx
- jne x16x8sad_mmx_loop
+ jne .x16x8sad_mmx_loop
movq mm0, mm7
punpcklwd mm0, mm6
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 1011c9553..fa8e3e3f8 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -37,7 +37,7 @@ sym(vp8_sad16x16_wmt):
lea rcx, [rcx+rax*8]
pxor xmm6, xmm6
-x16x16sad_wmt_loop:
+.x16x16sad_wmt_loop:
movq xmm0, QWORD PTR [rsi]
movq xmm2, QWORD PTR [rsi+8]
@@ -68,7 +68,7 @@ x16x16sad_wmt_loop:
paddw xmm6, xmm4
cmp rsi, rcx
- jne x16x16sad_wmt_loop
+ jne .x16x16sad_wmt_loop
movq xmm0, xmm6
psrldq xmm6, 8
@@ -111,11 +111,11 @@ sym(vp8_sad8x16_wmt):
lea rcx, [rcx+rbx*8]
pxor mm7, mm7
-x8x16sad_wmt_loop:
+.x8x16sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
- jg x8x16sad_wmt_early_exit
+ jg .x8x16sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -133,11 +133,11 @@ x8x16sad_wmt_loop:
paddw mm7, mm2
cmp rsi, rcx
- jne x8x16sad_wmt_loop
+ jne .x8x16sad_wmt_loop
movq rax, mm7
-x8x16sad_wmt_early_exit:
+.x8x16sad_wmt_early_exit:
; begin epilog
pop rdi
@@ -172,11 +172,11 @@ sym(vp8_sad8x8_wmt):
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
-x8x8sad_wmt_loop:
+.x8x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
- jg x8x8sad_wmt_early_exit
+ jg .x8x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -188,10 +188,10 @@ x8x8sad_wmt_loop:
paddw mm7, mm0
cmp rsi, rcx
- jne x8x8sad_wmt_loop
+ jne .x8x8sad_wmt_loop
movq rax, mm7
-x8x8sad_wmt_early_exit:
+.x8x8sad_wmt_early_exit:
; begin epilog
pop rdi
@@ -281,11 +281,11 @@ sym(vp8_sad16x8_wmt):
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
-x16x8sad_wmt_loop:
+.x16x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
- jg x16x8sad_wmt_early_exit
+ jg .x16x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm2, QWORD PTR [rsi+8]
@@ -315,11 +315,11 @@ x16x8sad_wmt_loop:
paddw mm7, mm4
cmp rsi, rcx
- jne x16x8sad_wmt_loop
+ jne .x16x8sad_wmt_loop
movq rax, mm7
-x16x8sad_wmt_early_exit:
+.x16x8sad_wmt_early_exit:
; begin epilog
pop rdi
@@ -352,7 +352,7 @@ sym(vp8_copy32xn_sse2):
movsxd rdx, dword ptr arg(3) ;dst_stride
movsxd rcx, dword ptr arg(4) ;height
-block_copy_sse2_loopx4:
+.block_copy_sse2_loopx4:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
movdqu xmm2, XMMWORD PTR [rsi + rax]
@@ -383,12 +383,12 @@ block_copy_sse2_loopx4:
sub rcx, 4
cmp rcx, 4
- jge block_copy_sse2_loopx4
+ jge .block_copy_sse2_loopx4
cmp rcx, 0
- je copy_is_done
+ je .copy_is_done
-block_copy_sse2_loop:
+.block_copy_sse2_loop:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
lea rsi, [rsi+rax]
@@ -398,9 +398,9 @@ block_copy_sse2_loop:
lea rdi, [rdi+rdx]
sub rcx, 1
- jne block_copy_sse2_loop
+ jne .block_copy_sse2_loop
-copy_is_done:
+.copy_is_done:
; begin epilog
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 9e0552166..a2550974c 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -647,7 +647,7 @@ sym(vp8_copy32xn_sse3):
STACK_FRAME_CREATE_X3
-block_copy_sse3_loopx4:
+.block_copy_sse3_loopx4:
lea end_ptr, [src_ptr+src_stride*2]
movdqu xmm0, XMMWORD PTR [src_ptr]
@@ -676,13 +676,13 @@ block_copy_sse3_loopx4:
sub height, 4
cmp height, 4
- jge block_copy_sse3_loopx4
+ jge .block_copy_sse3_loopx4
;Check to see if there is more rows need to be copied.
cmp height, 0
- je copy_is_done
+ je .copy_is_done
-block_copy_sse3_loop:
+.block_copy_sse3_loop:
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
lea src_ptr, [src_ptr+src_stride]
@@ -692,9 +692,9 @@ block_copy_sse3_loop:
lea ref_ptr, [ref_ptr+ref_stride]
sub height, 1
- jne block_copy_sse3_loop
+ jne .block_copy_sse3_loop
-copy_is_done:
+.copy_is_done:
STACK_FRAME_DESTROY_X3
;void vp8_sad16x16x4d_sse3(
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 6ecf08184..95b6c89e6 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -169,30 +169,30 @@ sym(vp8_sad16x16x3_ssse3):
mov rdx, 0xf
and rdx, rdi
- jmp vp8_sad16x16x3_ssse3_skiptable
-vp8_sad16x16x3_ssse3_jumptable:
- dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_skiptable:
-
- call vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_do_jump:
+ jmp .vp8_sad16x16x3_ssse3_skiptable
+.vp8_sad16x16x3_ssse3_jumptable:
+ dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_skiptable:
+
+ call .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
+ mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
@@ -203,23 +203,23 @@ vp8_sad16x16x3_ssse3_do_jump:
jmp rcx
- PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
-
-vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
+
+.vp8_sad16x16x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
@@ -229,7 +229,7 @@ vp8_sad16x16x3_ssse3_aligned_by_15:
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-vp8_sad16x16x3_ssse3_store_off:
+.vp8_sad16x16x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
@@ -282,30 +282,30 @@ sym(vp8_sad16x8x3_ssse3):
mov rdx, 0xf
and rdx, rdi
- jmp vp8_sad16x8x3_ssse3_skiptable
-vp8_sad16x8x3_ssse3_jumptable:
- dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_skiptable:
-
- call vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_do_jump:
+ jmp .vp8_sad16x8x3_ssse3_skiptable
+.vp8_sad16x8x3_ssse3_jumptable:
+ dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_skiptable:
+
+ call .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
+ mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
@@ -316,30 +316,30 @@ vp8_sad16x8x3_ssse3_do_jump:
jmp rcx
- PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
-
-vp8_sad16x8x3_ssse3_aligned_by_15:
+ PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
+
+.vp8_sad16x8x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-vp8_sad16x8x3_ssse3_store_off:
+.vp8_sad16x8x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
index d5d267a69..c6db3d1c6 100644
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -44,7 +44,7 @@
paddd %1, xmm1
SUM_ACROSS_Q %1
%endmacro
-;void ssim_parms_sse3(
+;void ssim_parms_sse2(
; unsigned char *s,
; int sp,
; unsigned char *r,
@@ -61,8 +61,8 @@
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
-global sym(vp8_ssim_parms_16x16_sse3)
-sym(vp8_ssim_parms_16x16_sse3):
+global sym(vp8_ssim_parms_16x16_sse2)
+sym(vp8_ssim_parms_16x16_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
@@ -84,7 +84,7 @@ sym(vp8_ssim_parms_16x16_sse3):
pxor xmm11,xmm11 ;sum_sxr
mov rdx, 16 ;row counter
-NextRow:
+.NextRow:
;grab source and reference pixels
movdqu xmm5, [rsi]
@@ -107,7 +107,7 @@ NextRow:
add rdi, rax ; next r row
dec rdx ; counter
- jnz NextRow
+ jnz .NextRow
SUM_ACROSS_W xmm15
SUM_ACROSS_W xmm14
@@ -134,7 +134,7 @@ NextRow:
pop rbp
ret
-;void ssim_parms_sse3(
+;void ssim_parms_sse2(
; unsigned char *s,
; int sp,
; unsigned char *r,
@@ -151,8 +151,8 @@ NextRow:
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
-global sym(vp8_ssim_parms_8x8_sse3)
-sym(vp8_ssim_parms_8x8_sse3):
+global sym(vp8_ssim_parms_8x8_sse2)
+sym(vp8_ssim_parms_8x8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
@@ -174,7 +174,7 @@ sym(vp8_ssim_parms_8x8_sse3):
pxor xmm11,xmm11 ;sum_sxr
mov rdx, 8 ;row counter
-NextRow2:
+.NextRow:
;grab source and reference pixels
movq xmm3, [rsi]
@@ -188,7 +188,7 @@ NextRow2:
add rdi, rax ; next r row
dec rdx ; counter
- jnz NextRow2
+ jnz .NextRow
SUM_ACROSS_W xmm15
SUM_ACROSS_W xmm14
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index a47e1f0d6..4ce16ce90 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -93,7 +93,7 @@ sym(vp8_subtract_mby_mmx):
mov rcx, 16
pxor mm0, mm0
-submby_loop:
+.submby_loop:
movq mm1, [rsi]
movq mm3, [rax]
@@ -139,7 +139,7 @@ submby_loop:
lea rsi, [rsi+rdx]
sub rcx, 1
- jnz submby_loop
+ jnz .submby_loop
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
index 95888f6be..3bd1ff678 100644
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -91,7 +91,7 @@ sym(vp8_subtract_mby_sse2):
mov rcx, 8 ; do two lines at one time
-submby_loop:
+.submby_loop:
movdqa xmm0, XMMWORD PTR [rsi] ; src
movdqa xmm1, XMMWORD PTR [rax] ; pred
@@ -133,7 +133,7 @@ submby_loop:
lea rsi, [rsi+rdx*2]
sub rcx, 1
- jnz submby_loop
+ jnz .submby_loop
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index b777ef566..b97c69439 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -71,26 +71,26 @@ sym(vp8_temporal_filter_apply_sse2):
lea rcx, [rdx + 16*16*1]
cmp dword ptr [rsp + block_size], 8
- jne temporal_filter_apply_load_16
+ jne .temporal_filter_apply_load_16
lea rcx, [rdx + 8*8*1]
-temporal_filter_apply_load_8:
+.temporal_filter_apply_load_8:
movq xmm0, [rsi] ; first row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm0, xmm7 ; src[ 0- 7]
movq xmm1, [rsi] ; second row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm1, xmm7 ; src[ 8-15]
- jmp temporal_filter_apply_load_finished
+ jmp .temporal_filter_apply_load_finished
-temporal_filter_apply_load_16:
+.temporal_filter_apply_load_16:
movdqa xmm0, [rsi] ; src (frame1)
lea rsi, [rsi + rbp] ; += stride
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7 ; src[ 0- 7]
punpckhbw xmm1, xmm7 ; src[ 8-15]
-temporal_filter_apply_load_finished:
+.temporal_filter_apply_load_finished:
movdqa xmm2, [rdx] ; predictor (frame2)
movdqa xmm3, xmm2
punpcklbw xmm2, xmm7 ; pred[ 0- 7]
@@ -176,13 +176,13 @@ temporal_filter_apply_load_finished:
lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
cmp rdx, rcx
- je temporal_filter_apply_epilog
+ je .temporal_filter_apply_epilog
pxor xmm7, xmm7 ; zero for extraction
cmp dword ptr [rsp + block_size], 16
- je temporal_filter_apply_load_16
- jmp temporal_filter_apply_load_8
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8
-temporal_filter_apply_epilog:
+.temporal_filter_apply_epilog:
; begin epilog
mov rbp, [rsp + rbp_backup]
add rsp, stack_size
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
index 13b76ea91..2be8bbeb3 100644
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -27,7 +27,7 @@ sym(vp8_get_mb_ss_mmx):
mov rcx, 16
pxor mm4, mm4
-NEXTROW:
+.NEXTROW:
movq mm0, [rax]
movq mm1, [rax+8]
movq mm2, [rax+16]
@@ -44,7 +44,7 @@ NEXTROW:
add rax, 32
dec rcx
- ja NEXTROW
+ ja .NEXTROW
movq QWORD PTR [rsp], mm4
;return sum[0]+sum[1];
@@ -568,7 +568,7 @@ sym(vp8_filter_block2d_bil4x4_var_mmx):
add rsi, r8
%endif
-filter_block2d_bil4x4_var_mmx_loop:
+.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
@@ -614,7 +614,7 @@ filter_block2d_bil4x4_var_mmx_loop:
add rdi, r9
%endif
sub rcx, 1 ;
- jnz filter_block2d_bil4x4_var_mmx_loop ;
+ jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
@@ -726,7 +726,7 @@ sym(vp8_filter_block2d_bil_var_mmx):
add rsi, r8
%endif
-filter_block2d_bil_var_mmx_loop:
+.filter_block2d_bil_var_mmx_loop:
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
@@ -807,7 +807,7 @@ filter_block2d_bil_var_mmx_loop:
add rdi, r9
%endif
sub rcx, 1 ;
- jnz filter_block2d_bil_var_mmx_loop ;
+ jnz .filter_block2d_bil_var_mmx_loop ;
pxor mm3, mm3 ;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index b7a6b3286..762922091 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -33,7 +33,7 @@ sym(vp8_get_mb_ss_sse2):
mov rcx, 8
pxor xmm4, xmm4
-NEXTROW:
+.NEXTROW:
movdqa xmm0, [rax]
movdqa xmm1, [rax+16]
movdqa xmm2, [rax+32]
@@ -50,7 +50,7 @@ NEXTROW:
add rax, 0x40
dec rcx
- ja NEXTROW
+ ja .NEXTROW
movdqa xmm3,xmm4
psrldq xmm4,8
@@ -126,7 +126,7 @@ sym(vp8_get16x16var_sse2):
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16
-var16loop:
+.var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
@@ -160,7 +160,7 @@ var16loop:
add rdi, rdx
sub rcx, 1
- jnz var16loop
+ jnz .var16loop
movdqa xmm1, xmm6
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
index a582f8dc5..97e8b0e2e 100644
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -47,7 +47,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
- je filter_block2d_bil_var_ssse3_sp_only
+ je .filter_block2d_bil_var_ssse3_sp_only
shl rax, 4 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
@@ -55,7 +55,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je filter_block2d_bil_var_ssse3_fp_only
+ je .filter_block2d_bil_var_ssse3_fp_only
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
@@ -88,7 +88,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
lea rsi, [rsi + r8]
%endif
-filter_block2d_bil_var_ssse3_loop:
+.filter_block2d_bil_var_ssse3_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
@@ -142,15 +142,15 @@ filter_block2d_bil_var_ssse3_loop:
%endif
sub rcx, 1
- jnz filter_block2d_bil_var_ssse3_loop
+ jnz .filter_block2d_bil_var_ssse3_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_var_ssse3_sp_only:
+.filter_block2d_bil_var_ssse3_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
- je filter_block2d_bil_var_ssse3_full_pixel
+ je .filter_block2d_bil_var_ssse3_full_pixel
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
@@ -169,7 +169,7 @@ filter_block2d_bil_var_ssse3_sp_only:
lea rsi, [rsi + rax]
-filter_block2d_bil_sp_only_loop:
+.filter_block2d_bil_sp_only_loop:
movdqu xmm3, XMMWORD PTR [rsi]
movdqa xmm2, xmm1
movdqa xmm0, xmm3
@@ -209,11 +209,11 @@ filter_block2d_bil_sp_only_loop:
%endif
sub rcx, 1
- jnz filter_block2d_bil_sp_only_loop
+ jnz .filter_block2d_bil_sp_only_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_var_ssse3_full_pixel:
+.filter_block2d_bil_var_ssse3_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
@@ -221,7 +221,7 @@ filter_block2d_bil_var_ssse3_full_pixel:
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0
-filter_block2d_bil_full_pixel_loop:
+.filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi]
punpcklbw xmm1, xmm0
movq xmm2, QWORD PTR [rsi+8]
@@ -244,11 +244,11 @@ filter_block2d_bil_full_pixel_loop:
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rdx] ;src_pixels_per_line
sub rcx, 1
- jnz filter_block2d_bil_full_pixel_loop
+ jnz .filter_block2d_bil_full_pixel_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_var_ssse3_fp_only:
+.filter_block2d_bil_var_ssse3_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
@@ -260,7 +260,7 @@ filter_block2d_bil_var_ssse3_fp_only:
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
-filter_block2d_bil_fp_only_loop:
+.filter_block2d_bil_fp_only_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
@@ -298,11 +298,11 @@ filter_block2d_bil_fp_only_loop:
%endif
sub rcx, 1
- jnz filter_block2d_bil_fp_only_loop
+ jnz .filter_block2d_bil_fp_only_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_variance:
+.filter_block2d_bil_variance:
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm5, xmm5
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index af6c4d27e..4b41b5436 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -140,6 +140,8 @@ extern prototype_getmbss(vp8_get_mb_ss_sse2);
extern prototype_variance(vp8_mse16x16_wmt);
extern prototype_variance2(vp8_get8x8var_sse2);
extern prototype_variance2(vp8_get16x16var_sse2);
+extern prototype_ssimpf(vp8_ssim_parms_8x8_sse2)
+extern prototype_ssimpf(vp8_ssim_parms_16x16_sse2)
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_sad4x4
@@ -208,6 +210,14 @@ extern prototype_variance2(vp8_get16x16var_sse2);
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_wmt
+#if ARCH_X86_64
+#undef vp8_ssimpf_8x8
+#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_sse2
+
+#undef vp8_ssimpf_16x16
+#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_sse2
+#endif
+
#endif
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index badb9f044..36b7b7194 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -111,29 +111,6 @@ void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
#endif
-#if HAVE_SSSE3
-#if CONFIG_INTERNAL_STATS
-#if ARCH_X86_64
-typedef void ssimpf
-(
- unsigned char *s,
- int sp,
- unsigned char *r,
- int rp,
- unsigned long *sum_s,
- unsigned long *sum_r,
- unsigned long *sum_sq_s,
- unsigned long *sum_sq_r,
- unsigned long *sum_sxr
-);
-
-extern ssimpf vp8_ssim_parms_16x16_sse3;
-extern ssimpf vp8_ssim_parms_8x8_sse3;
-#endif
-#endif
-#endif
-
-
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -246,6 +223,13 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
#if !(CONFIG_REALTIME_ONLY)
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
#endif
+
+#if CONFIG_INTERNAL_STATS
+#if ARCH_X86_64
+ cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse2;
+ cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_sse2;
+#endif
+#endif
}
#endif
@@ -280,14 +264,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
-
-#if CONFIG_INTERNAL_STATS
-#if ARCH_X86_64
- cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3;
- cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3;
-#endif
-#endif
-
}
#endif