diff options
author | John Koleszar <jkoleszar@google.com> | 2013-02-27 12:23:45 -0800 |
---|---|---|
committer | Gerrit Code Review <gerrit@gerrit.golo.chromium.org> | 2013-02-27 12:23:45 -0800 |
commit | 5ac141187a38175eb4981a370678e0f44964d324 (patch) | |
tree | 6a7a38efd5e48fa085cee04b0d749ff2fc3aa23d /vp9/encoder | |
parent | d6ff6fe2edb774ae979447aa394eeade35e21d18 (diff) | |
parent | 7ad8dbe417b03619855892e32726b75de9e9da1a (diff) | |
download | libvpx-5ac141187a38175eb4981a370678e0f44964d324.tar libvpx-5ac141187a38175eb4981a370678e0f44964d324.tar.gz libvpx-5ac141187a38175eb4981a370678e0f44964d324.tar.bz2 libvpx-5ac141187a38175eb4981a370678e0f44964d324.zip |
Merge "Remove unused vp9_copy32xn" into experimental
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/vp9_onyx_if.c | 8 | ||||
-rw-r--r-- | vp9/encoder/vp9_sad_c.c | 58 | ||||
-rw-r--r-- | vp9/encoder/vp9_variance.h | 7 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_sad4d_sse2.asm (renamed from vp9/encoder/x86/vp9_sad4d_sse2_yasm.asm) | 0 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_sad_sse2.asm | 254 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_sad_sse2_yasm.asm | 182 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_sad_sse3.asm | 61 |
7 files changed, 172 insertions, 398 deletions
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index ced6eddca..ab3d19936 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -1683,14 +1683,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4, NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) -#if ARCH_X86 || ARCH_X86_64 - cpi->fn_ptr[BLOCK_16X16].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_16X8].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_8X16].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_8X8].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_4X4].copymem = vp9_copy32xn; -#endif - cpi->full_search_sad = vp9_full_search_sad; cpi->diamond_search_sad = vp9_diamond_search_sad; cpi->refining_search_sad = vp9_refining_search_sad; diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c index 84121f79c..f9c2f03ab 100644 --- a/vp9/encoder/vp9_sad_c.c +++ b/vp9/encoder/vp9_sad_c.c @@ -484,61 +484,3 @@ void vp9_sad4x4x4d_c(const uint8_t *src_ptr, sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); } - -/* Copy 2 macroblocks to a buffer */ -void vp9_copy32xn_c(uint8_t *src_ptr, - int src_stride, - uint8_t *dst_ptr, - int dst_stride, - int height) { - int r; - - for (r = 0; r < height; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst_ptr[0] = src_ptr[0]; - dst_ptr[1] = src_ptr[1]; - dst_ptr[2] = src_ptr[2]; - dst_ptr[3] = src_ptr[3]; - dst_ptr[4] = src_ptr[4]; - dst_ptr[5] = src_ptr[5]; - dst_ptr[6] = src_ptr[6]; - dst_ptr[7] = src_ptr[7]; - dst_ptr[8] = src_ptr[8]; - dst_ptr[9] = src_ptr[9]; - dst_ptr[10] = src_ptr[10]; - dst_ptr[11] = src_ptr[11]; - dst_ptr[12] = src_ptr[12]; - dst_ptr[13] = src_ptr[13]; - dst_ptr[14] = src_ptr[14]; - dst_ptr[15] = src_ptr[15]; - dst_ptr[16] = src_ptr[16]; - dst_ptr[17] = src_ptr[17]; - dst_ptr[18] = src_ptr[18]; - dst_ptr[19] = src_ptr[19]; - dst_ptr[20] = src_ptr[20]; - dst_ptr[21] = src_ptr[21]; - dst_ptr[22] = src_ptr[22]; - dst_ptr[23] = src_ptr[23]; - dst_ptr[24] = src_ptr[24]; - dst_ptr[25] = src_ptr[25]; - dst_ptr[26] = src_ptr[26]; - dst_ptr[27] = src_ptr[27]; - dst_ptr[28] = src_ptr[28]; - dst_ptr[29] = src_ptr[29]; - dst_ptr[30] = src_ptr[30]; - dst_ptr[31] = src_ptr[31]; -#else - ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0]; - ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1]; - ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2]; - ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3]; - ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4]; - ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5]; - ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6]; - ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7]; -#endif - src_ptr += src_stride; - dst_ptr += dst_stride; - - } -} diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index 3f0af0855..eb903bf94 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -19,12 +19,6 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int ref_stride, unsigned int max_sad); -typedef void (*vp9_copy32xn_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride, - int n); - typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -79,7 +73,6 @@ typedef struct variance_vtable { vp9_sad_multi_fn_t sdx3f; vp9_sad_multi1_fn_t sdx8f; vp9_sad_multi_d_fn_t sdx4df; - vp9_copy32xn_fn_t copymem; } vp9_variance_fn_ptr_t; #endif // VP9_ENCODER_VP9_VARIANCE_H_ diff --git a/vp9/encoder/x86/vp9_sad4d_sse2_yasm.asm b/vp9/encoder/x86/vp9_sad4d_sse2.asm index 3716d91ec..3716d91ec 100644 --- a/vp9/encoder/x86/vp9_sad4d_sse2_yasm.asm +++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm index c6b7d4cbd..ea482e071 100644 --- a/vp9/encoder/x86/vp9_sad_sse2.asm +++ b/vp9/encoder/x86/vp9_sad_sse2.asm @@ -8,85 +8,175 @@ ; be found in the AUTHORS file in the root of the source tree. ; - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_copy32xn_sse2( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; int height); -global sym(vp9_copy32xn_sse2) PRIVATE -sym(vp9_copy32xn_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;dst_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;dst_stride - movsxd rcx, dword ptr arg(4) ;height - -.block_copy_sse2_loopx4: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - movdqu xmm2, XMMWORD PTR [rsi + rax] - movdqu xmm3, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqu xmm4, XMMWORD PTR [rsi] - movdqu xmm5, XMMWORD PTR [rsi + 16] - movdqu xmm6, XMMWORD PTR [rsi + rax] - movdqu xmm7, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm1 - movdqa XMMWORD PTR [rdi + rdx], xmm2 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 - - lea rdi, [rdi+rdx*2] - - movdqa XMMWORD PTR [rdi], xmm4 - movdqa XMMWORD PTR [rdi + 16], xmm5 - movdqa XMMWORD PTR [rdi + rdx], xmm6 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 - - lea rdi, [rdi+rdx*2] - - sub rcx, 4 - cmp rcx, 4 - jge .block_copy_sse2_loopx4 - - cmp rcx, 0 - je .copy_is_done - -.block_copy_sse2_loop: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - lea rsi, [rsi+rax] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm1 - lea rdi, [rdi+rdx] - - sub rcx, 1 - jne .block_copy_sse2_loop - -.copy_is_done: - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +INIT_XMM sse2 +cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov n_rowsd, 64 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +INIT_XMM sse2 +cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov n_rowsd, 16 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1 +cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 + +; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1 +cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 + +; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +INIT_MMX sse +cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + movd m0, [refq] + movd m1, [refq+ref_strideq] + movd m2, [srcq] + movd m3, [srcq+src_strideq] + lea refq, [refq+ref_strideq*2] + lea srcq, [srcq+src_strideq*2] + movd m4, [refq] + movd m5, [refq+ref_strideq] + movd m6, [srcq] + movd m7, [srcq+src_strideq] + punpckldq m0, m1 + punpckldq m2, m3 + punpckldq m4, m5 + punpckldq m6, m7 + psadbw m0, m2 + psadbw m4, m6 + paddd m0, m4 + movd eax, m0 + RET diff --git a/vp9/encoder/x86/vp9_sad_sse2_yasm.asm b/vp9/encoder/x86/vp9_sad_sse2_yasm.asm deleted file mode 100644 index ea482e071..000000000 --- a/vp9/encoder/x86/vp9_sad_sse2_yasm.asm +++ /dev/null @@ -1,182 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -INIT_XMM sse2 -cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - mov n_rowsd, 64 - pxor m0, m0 -.loop: - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+32] - movu m4, [refq+48] - psadbw m1, [srcq] - psadbw m2, [srcq+16] - psadbw m3, [srcq+32] - psadbw m4, [srcq+48] - paddd m1, m2 - paddd m3, m4 - add refq, ref_strideq - paddd m0, m1 - add srcq, src_strideq - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET - -; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -INIT_XMM sse2 -cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - mov n_rowsd, 16 - pxor m0, m0 - -.loop: - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+ref_strideq] - movu m4, [refq+ref_strideq+16] - psadbw m1, [srcq] - psadbw m2, [srcq+16] - psadbw m3, [srcq+src_strideq] - psadbw m4, [srcq+src_strideq+16] - paddd m1, m2 - paddd m3, m4 - lea refq, [refq+ref_strideq*2] - paddd m0, m1 - lea srcq, [srcq+src_strideq*2] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET - -; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD16XN 1 -cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] - mov n_rowsd, %1/4 - pxor m0, m0 - -.loop: - movu m1, [refq] - movu m2, [refq+ref_strideq] - movu m3, [refq+ref_strideq*2] - movu m4, [refq+ref_stride3q] - psadbw m1, [srcq] - psadbw m2, [srcq+src_strideq] - psadbw m3, [srcq+src_strideq*2] - psadbw m4, [srcq+src_stride3q] - paddd m1, m2 - paddd m3, m4 - lea refq, [refq+ref_strideq*4] - paddd m0, m1 - lea srcq, [srcq+src_strideq*4] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD16XN 16 ; sad16x16_sse2 -SAD16XN 8 ; sad16x8_sse2 - -; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD8XN 1 -cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] - mov n_rowsd, %1/4 - pxor m0, m0 - -.loop: - movh m1, [refq] - movhps m1, [refq+ref_strideq] - movh m2, [refq+ref_strideq*2] - movhps m2, [refq+ref_stride3q] - movh m3, [srcq] - movhps m3, [srcq+src_strideq] - movh m4, [srcq+src_strideq*2] - movhps m4, [srcq+src_stride3q] - psadbw m1, m3 - psadbw m2, m4 - lea refq, [refq+ref_strideq*4] - paddd m0, m1 - lea srcq, [srcq+src_strideq*4] - paddd m0, m2 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD8XN 16 ; sad8x16_sse2 -SAD8XN 8 ; sad8x8_sse2 - -; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -INIT_MMX sse -cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - movd m0, [refq] - movd m1, [refq+ref_strideq] - movd m2, [srcq] - movd m3, [srcq+src_strideq] - lea refq, [refq+ref_strideq*2] - lea srcq, [srcq+src_strideq*2] - movd m4, [refq] - movd m5, [refq+ref_strideq] - movd m6, [srcq] - movd m7, [srcq+src_strideq] - punpckldq m0, m1 - punpckldq m2, m3 - punpckldq m4, m5 - punpckldq m6, m7 - psadbw m0, m2 - psadbw m4, m6 - paddd m0, m4 - movd eax, m0 - RET diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm index 75e9d0ca4..5d8417270 100644 --- a/vp9/encoder/x86/vp9_sad_sse3.asm +++ b/vp9/encoder/x86/vp9_sad_sse3.asm @@ -376,64 +376,3 @@ sym(vp9_sad4x4x3_sse3): movd [rcx+8], mm7 STACK_FRAME_DESTROY_X3 - -;void vp9_copy32xn_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; int height); -global sym(vp9_copy32xn_sse3) PRIVATE -sym(vp9_copy32xn_sse3): - - STACK_FRAME_CREATE_X3 - -.block_copy_sse3_loopx4: - lea end_ptr, [src_ptr+src_stride*2] - - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] - movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] - movdqu xmm4, XMMWORD PTR [end_ptr] - movdqu xmm5, XMMWORD PTR [end_ptr + 16] - movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] - movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] - - lea src_ptr, [src_ptr+src_stride*4] - - lea end_ptr, [ref_ptr+ref_stride*2] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 - movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 - movdqa XMMWORD PTR [end_ptr], xmm4 - movdqa XMMWORD PTR [end_ptr + 16], xmm5 - movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 - movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 - - lea ref_ptr, [ref_ptr+ref_stride*4] - - sub height, 4 - cmp height, 4 - jge .block_copy_sse3_loopx4 - - ;Check to see if there is more rows need to be copied. - cmp height, 0 - je .copy_is_done - -.block_copy_sse3_loop: - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - lea src_ptr, [src_ptr+src_stride] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - lea ref_ptr, [ref_ptr+ref_stride] - - sub height, 1 - jne .block_copy_sse3_loop - -.copy_is_done: - STACK_FRAME_DESTROY_X3 |