diff options
Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r-- | vp8/encoder/x86/sad_sse2.asm | 80 | ||||
-rw-r--r-- | vp8/encoder/x86/sad_sse3.asm | 66 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_x86.h | 8 | ||||
-rw-r--r-- | vp8/encoder/x86/x86_csystemdependent.c | 2 |
4 files changed, 156 insertions, 0 deletions
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 04ee72f72..1011c9553 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -328,3 +328,83 @@ x16x8sad_wmt_early_exit:
     UNSHADOW_ARGS
     pop         rbp
     ret
+; Copy `height` rows of 32 bytes each from src_ptr to dst_ptr.
+;void vp8_copy32xn_sse2(
+;    unsigned char *src_ptr,     (rows are read with unaligned loads)
+;    int  src_stride,            (byte step between source rows)
+;    unsigned char *dst_ptr,     (rows are written with aligned stores)
+;    int  dst_stride,            (byte step between destination rows)
+;    int  height);               (number of rows to copy)
+global sym(vp8_copy32xn_sse2)
+sym(vp8_copy32xn_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7                                              ; undone by RESTORE_XMM in the epilog
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi,        arg(0) ;src_ptr
+    mov         rdi,        arg(2) ;dst_ptr
+    ; the int arguments are loaded with movsxd, i.e. sign-extended
+    movsxd      rax,        dword ptr arg(1) ;src_stride
+    movsxd      rdx,        dword ptr arg(3) ;dst_stride
+    movsxd      rcx,        dword ptr arg(4) ;height
+; NOTE(review): the 4-row loop below runs once before height is tested, so height < 4 still copies 4 rows -- confirm callers never pass that.
+block_copy_sse2_loopx4:
+    movdqu      xmm0,       XMMWORD PTR [rsi]               ; row 0, bytes 0-15
+    movdqu      xmm1,       XMMWORD PTR [rsi + 16]          ; row 0, bytes 16-31
+    movdqu      xmm2,       XMMWORD PTR [rsi + rax]         ; row 1, bytes 0-15
+    movdqu      xmm3,       XMMWORD PTR [rsi + rax + 16]    ; row 1, bytes 16-31
+
+    lea         rsi,        [rsi+rax*2]                     ; source -> row 2
+
+    movdqu      xmm4,       XMMWORD PTR [rsi]               ; row 2, bytes 0-15
+    movdqu      xmm5,       XMMWORD PTR [rsi + 16]          ; row 2, bytes 16-31
+    movdqu      xmm6,       XMMWORD PTR [rsi + rax]         ; row 3, bytes 0-15
+    movdqu      xmm7,       XMMWORD PTR [rsi + rax + 16]    ; row 3, bytes 16-31
+
+    lea         rsi,        [rsi+rax*2]                     ; source -> first row of the next group
+
+    movdqa      XMMWORD PTR [rdi], xmm0                     ; all eight loads are done before the first store
+    movdqa      XMMWORD PTR [rdi + 16], xmm1
+    movdqa      XMMWORD PTR [rdi + rdx], xmm2
+    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm3
+
+    lea         rdi,        [rdi+rdx*2]                     ; destination -> row 2
+
+    movdqa      XMMWORD PTR [rdi], xmm4
+    movdqa      XMMWORD PTR [rdi + 16], xmm5
+    movdqa      XMMWORD PTR [rdi + rdx], xmm6
+    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm7
+
+    lea         rdi,        [rdi+rdx*2]                     ; destination -> first row of the next group
+
+    sub         rcx,        4                               ; four rows copied
+    cmp         rcx,        4
+    jge         block_copy_sse2_loopx4                      ; repeat while at least four rows remain
+
+    cmp         rcx,        0                               ; only an exact zero skips the tail loop
+    je          copy_is_done
+    ; copy the remaining rows one at a time
+block_copy_sse2_loop:
+    movdqu      xmm0,       XMMWORD PTR [rsi]
+    movdqu      xmm1,       XMMWORD PTR [rsi + 16]
+    lea         rsi,    [rsi+rax]
+
+    movdqa      XMMWORD PTR [rdi], xmm0
+    movdqa      XMMWORD PTR [rdi + 16], xmm1
+    lea         rdi,    [rdi+rdx]
+
+    sub         rcx,        1                               ; sets ZF once the last row has been copied
+    jne         block_copy_sse2_loop
+
+copy_is_done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index
2dbcc7dc9..1c41c322a 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -20,6 +20,7 @@
     %define     ret_var     rbx
     %define     result_ptr  arg(4)
     %define     max_err     arg(4)
+    %define     height      dword ptr arg(4)
     push        rbp
     mov         rbp,        rsp
     push        rsi
@@ -42,6 +43,7 @@
     %define     ret_var     r11
     %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
     %define     max_err     [rsp+xmm_stack_space+8+4*8]
+    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
   %else
     %define     src_ptr     rdi
     %define     src_stride  rsi
@@ -51,6 +53,7 @@
     %define     ret_var     r10
     %define     result_ptr  r8
     %define     max_err     r8
+    %define     height      r8
   %endif
 %endif
 
@@ -65,6 +68,7 @@
   %define     ret_var
   %define     result_ptr
   %define     max_err
+  %define     height
 
 %if ABI_IS_32BIT
     pop         rbx
@@ -632,6 +636,67 @@ sym(vp8_sad16x16_sse3):
 
     STACK_FRAME_DESTROY_X3
 
+;void vp8_copy32xn_sse3(
+;    unsigned char *src_ptr,     (rows are read with unaligned loads)
+;    int  src_stride,            (byte step between source rows)
+;    unsigned char *dst_ptr,     (rows are written with aligned stores)
+;    int  dst_stride,            (byte step between destination rows)
+;    int  height);               (number of rows to copy)
+global sym(vp8_copy32xn_sse3)
+sym(vp8_copy32xn_sse3):
+; Copies `height` rows of 32 bytes each; the destination is addressed through ref_ptr/ref_stride.
+    STACK_FRAME_CREATE_X3
+; NOTE(review): the 4-row loop below runs once before height is tested, so height < 4 still copies 4 rows -- confirm callers never pass that.
+block_copy_sse3_loopx4:
+    lea         end_ptr,    [src_ptr+src_stride*2]              ; end_ptr -> source row 2
+    ; load four source rows (8 x 16 bytes) before storing anything
+    movdqu      xmm0,       XMMWORD PTR [src_ptr]
+    movdqu      xmm1,       XMMWORD PTR [src_ptr + 16]
+    movdqu      xmm2,       XMMWORD PTR [src_ptr + src_stride]
+    movdqu      xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
+    movdqu      xmm4,       XMMWORD PTR [end_ptr]
+    movdqu      xmm5,       XMMWORD PTR [end_ptr + 16]
+    movdqu      xmm6,       XMMWORD PTR [end_ptr + src_stride]
+    movdqu      xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
+
+    lea         src_ptr,    [src_ptr+src_stride*4]              ; advance the source by four rows
+
+    lea         end_ptr,    [ref_ptr+ref_stride*2]              ; end_ptr -> destination row 2
+
+    movdqa      XMMWORD PTR [ref_ptr], xmm0
+    movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
+    movdqa      XMMWORD PTR [ref_ptr + ref_stride], xmm2
+    movdqa      XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+    movdqa      XMMWORD PTR [end_ptr], xmm4
+    movdqa      XMMWORD PTR [end_ptr + 16], xmm5
+    movdqa      XMMWORD PTR [end_ptr + ref_stride], xmm6
+    movdqa      XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+    lea         ref_ptr,    [ref_ptr+ref_stride*4]              ; advance the destination by four rows
+    ; height is a 32-bit int: both memory-operand definitions carry `dword ptr` so these immediates have a size
+    sub         height,     4
+    cmp         height,     4
+    jge         block_copy_sse3_loopx4                          ; repeat while at least four rows remain
+    ; NOTE(review): where height is defined as r8 all 64 bits are compared -- confirm the upper half of the int argument is clean.
+    ;Check whether any rows remain to be copied.
+    cmp         height,     0
+    je          copy_is_done
+    ; copy the remaining rows one at a time
+block_copy_sse3_loop:
+    movdqu      xmm0,       XMMWORD PTR [src_ptr]
+    movdqu      xmm1,       XMMWORD PTR [src_ptr + 16]
+    lea         src_ptr,    [src_ptr+src_stride]
+
+    movdqa      XMMWORD PTR [ref_ptr], xmm0
+    movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
+    lea         ref_ptr,    [ref_ptr+ref_stride]
+
+    sub         height,     1                                   ; sets ZF once the last row has been copied
+    jne         block_copy_sse3_loop
+
+copy_is_done:
+    STACK_FRAME_DESTROY_X3
+
 ;void vp8_sad16x16x4d_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
@@ -892,3 +957,4 @@ sym(vp8_sad4x4x4d_sse3):
 
     STACK_FRAME_DESTROY_X4
 
+
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 0ee8eb7e5..af6c4d27e 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -121,6 +121,7 @@ extern prototype_sad(vp8_sad8x8_wmt);
 extern prototype_sad(vp8_sad8x16_wmt);
 extern prototype_sad(vp8_sad16x8_wmt);
 extern prototype_sad(vp8_sad16x16_wmt);
+extern prototype_sad(vp8_copy32xn_sse2); /* NOTE(review): the asm header documents a void routine taking int height; prototype_sad is not visible here -- confirm it fits */
 extern prototype_variance(vp8_variance4x4_wmt);
 extern prototype_variance(vp8_variance8x8_wmt);
 extern prototype_variance(vp8_variance8x16_wmt);
@@ -156,6 +157,9 @@ extern prototype_variance2(vp8_get16x16var_sse2);
 
 #undef  vp8_variance_sad16x16
 #define vp8_variance_sad16x16 vp8_sad16x16_wmt
 
+#undef  vp8_variance_copy32xn
+#define vp8_variance_copy32xn vp8_copy32xn_sse2
+
 #undef  vp8_variance_var4x4
 #define vp8_variance_var4x4 vp8_variance4x4_wmt
@@ -222,6 +226,7 @@ extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3);
 extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3);
 extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3);
 extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
+extern prototype_sad(vp8_copy32xn_sse3); /* NOTE(review): same prototype question as the sse2 declaration -- confirm */
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 
@@ -258,6 +263,9 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
 
 #undef  vp8_variance_sad4x4x4d
 #define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3
 
+#undef  vp8_variance_copy32xn
+#define vp8_variance_copy32xn vp8_copy32xn_sse3
+
 #endif
 #endif
diff
--git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 9a324ec12..badb9f044 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -203,6 +203,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt; cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt; cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt; + cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse2; cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt; cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt; @@ -263,6 +264,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3; cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3; cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3; + cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse3; cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4; cpi->rtcd.search.refining_search = vp8_refining_search_sadx4; } |