summaryrefslogtreecommitdiff
path: root/vp8/encoder/x86
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r--vp8/encoder/x86/sad_sse2.asm80
-rw-r--r--vp8/encoder/x86/sad_sse3.asm66
-rw-r--r--vp8/encoder/x86/variance_x86.h8
-rw-r--r--vp8/encoder/x86/x86_csystemdependent.c2
4 files changed, 156 insertions, 0 deletions
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 04ee72f72..1011c9553 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -328,3 +328,83 @@ x16x8sad_wmt_early_exit:
UNSHADOW_ARGS
pop rbp
ret
+
+;void vp8_copy32xn_sse2(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse2)
+sym(vp8_copy32xn_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;dst_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;dst_stride
+ movsxd rcx, dword ptr arg(4) ;height
+
+block_copy_sse2_loopx4:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ movdqu xmm2, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqu xmm4, XMMWORD PTR [rsi]
+ movdqu xmm5, XMMWORD PTR [rsi + 16]
+ movdqu xmm6, XMMWORD PTR [rsi + rax]
+ movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ movdqa XMMWORD PTR [rdi + rdx], xmm2
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
+
+ lea rdi, [rdi+rdx*2]
+
+ movdqa XMMWORD PTR [rdi], xmm4
+ movdqa XMMWORD PTR [rdi + 16], xmm5
+ movdqa XMMWORD PTR [rdi + rdx], xmm6
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
+
+ lea rdi, [rdi+rdx*2]
+
+ sub rcx, 4
+ cmp rcx, 4
+ jge block_copy_sse2_loopx4
+
+ cmp rcx, 0
+ je copy_is_done
+
+block_copy_sse2_loop:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ lea rsi, [rsi+rax]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ lea rdi, [rdi+rdx]
+
+ sub rcx, 1
+ jne block_copy_sse2_loop
+
+copy_is_done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 2dbcc7dc9..1c41c322a 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -20,6 +20,7 @@
%define ret_var rbx
%define result_ptr arg(4)
%define max_err arg(4)
+ %define height dword ptr arg(4)
push rbp
mov rbp, rsp
push rsi
@@ -42,6 +43,7 @@
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
%define max_err [rsp+xmm_stack_space+8+4*8]
+ %define height [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
%define src_stride rsi
@@ -51,6 +53,7 @@
%define ret_var r10
%define result_ptr r8
%define max_err r8
+ %define height r8
%endif
%endif
@@ -65,6 +68,7 @@
%define ret_var
%define result_ptr
%define max_err
+ %define height
%if ABI_IS_32BIT
pop rbx
@@ -632,6 +636,67 @@ sym(vp8_sad16x16_sse3):
STACK_FRAME_DESTROY_X3
+;void vp8_copy32xn_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse3)
+sym(vp8_copy32xn_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+block_copy_sse3_loopx4:
+ lea end_ptr, [src_ptr+src_stride*2]
+
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
+ movdqu xmm4, XMMWORD PTR [end_ptr]
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16]
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
+
+ lea src_ptr, [src_ptr+src_stride*4]
+
+ lea end_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+ movdqa XMMWORD PTR [end_ptr], xmm4
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+ lea ref_ptr, [ref_ptr+ref_stride*4]
+
+ sub height, 4
+ cmp height, 4
+ jge block_copy_sse3_loopx4
+
+ ;Check to see if there is more rows need to be copied.
+ cmp height, 0
+ je copy_is_done
+
+block_copy_sse3_loop:
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ lea src_ptr, [src_ptr+src_stride]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ lea ref_ptr, [ref_ptr+ref_stride]
+
+ sub height, 1
+ jne block_copy_sse3_loop
+
+copy_is_done:
+ STACK_FRAME_DESTROY_X3
+
;void vp8_sad16x16x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
@@ -892,3 +957,4 @@ sym(vp8_sad4x4x4d_sse3):
STACK_FRAME_DESTROY_X4
+
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 0ee8eb7e5..af6c4d27e 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -121,6 +121,7 @@ extern prototype_sad(vp8_sad8x8_wmt);
extern prototype_sad(vp8_sad8x16_wmt);
extern prototype_sad(vp8_sad16x8_wmt);
extern prototype_sad(vp8_sad16x16_wmt);
+extern prototype_sad(vp8_copy32xn_sse2);
extern prototype_variance(vp8_variance4x4_wmt);
extern prototype_variance(vp8_variance8x8_wmt);
extern prototype_variance(vp8_variance8x16_wmt);
@@ -156,6 +157,9 @@ extern prototype_variance2(vp8_get16x16var_sse2);
#undef vp8_variance_sad16x16
#define vp8_variance_sad16x16 vp8_sad16x16_wmt
+#undef vp8_variance_copy32xn
+#define vp8_variance_copy32xn vp8_copy32xn_sse2
+
#undef vp8_variance_var4x4
#define vp8_variance_var4x4 vp8_variance4x4_wmt
@@ -222,6 +226,7 @@ extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3);
extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3);
extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3);
extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
+extern prototype_sad(vp8_copy32xn_sse3);
#if !CONFIG_RUNTIME_CPU_DETECT
@@ -258,6 +263,9 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
#undef vp8_variance_sad4x4x4d
#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3
+#undef vp8_variance_copy32xn
+#define vp8_variance_copy32xn vp8_copy32xn_sse3
+
#endif
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 9a324ec12..badb9f044 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -203,6 +203,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt;
cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt;
cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt;
+ cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse2;
cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt;
cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt;
@@ -263,6 +264,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3;
cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3;
cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
+ cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse3;
cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
cpi->rtcd.search.refining_search = vp8_refining_search_sadx4;
}