aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/i386/i686/multiarch/memcpy-ssse3.S')
-rw-r--r-- sysdeps/i386/i686/multiarch/memcpy-ssse3.S 3162
1 files changed, 0 insertions, 3162 deletions
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 53e8a6ca1d..0000000000
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3162 +0,0 @@
-/* memcpy with SSSE3
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc) \
- && (defined SHARED \
- || defined USE_AS_MEMMOVE \
- || !defined USE_MULTIARCH)
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
-# ifndef MEMCPY /* The defaults below apply only when MEMCPY is not already defined. */
-# define MEMCPY __memcpy_ssse3 /* Symbol name used by ENTRY (MEMCPY). */
-# define MEMCPY_CHK __memcpy_chk_ssse3 /* Symbol name used by ENTRY (MEMCPY_CHK). */
-# endif
-
-# ifdef USE_AS_BCOPY /* Argument order: source first, then destination, then length. */
-# define SRC PARMS /* Offsets are used relative to %esp, e.g. SRC(%esp). */
-# define DEST SRC+4
-# define LEN DEST+4
-# else /* Argument order: destination first, then source, then length. */
-# define DEST PARMS /* Offsets are used relative to %esp, e.g. DEST(%esp). */
-# define SRC DEST+4
-# define LEN SRC+4
-# endif
-
-# define CFI_PUSH(REG) /* Unwind (CFI) bookkeeping for a 4-byte push of REG; built only from cfi_* macros defined outside this file. */ \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
-
-# define CFI_POP(REG) /* Unwind (CFI) bookkeeping for the matching 4-byte pop of REG. */ \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
-
-# define PUSH(REG) pushl REG; CFI_PUSH (REG) /* Push REG and record the push for the unwinder. */
-# define POP(REG) popl REG; CFI_POP (REG) /* Pop REG and record the pop for the unwinder. */
-
-# ifdef SHARED
-# define PARMS 8 /* Preserve EBX. Arguments start above the return address and the saved EBX. */
-# define ENTRANCE PUSH (%ebx); /* Save EBX; the SHARED code paths overwrite it via SETUP_PIC_REG(bx). */
-# define RETURN_END POP (%ebx); ret /* Restore EBX and return. */
-# define RETURN RETURN_END; CFI_PUSH (%ebx) /* Return, then re-declare EBX as pushed for the code placed after the ret. */
-# define JMPTBL(I, B) I - B /* Table entries are stored as offsets relative to B. */
-
-/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register that contains the
- index into the jump table. SCALE is the scale of INDEX. EBX is clobbered. */
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- /* We first load PC into EBX. */ \
- SETUP_PIC_REG(bx); \
- /* Get the address of the jump table. */ \
- addl $(TABLE - .), %ebx; \
- /* Get the entry and convert the relative offset to the \
- absolute address. */ \
- addl (%ebx, INDEX, SCALE), %ebx; \
- /* We loaded the jump table. Go. */ \
- jmp *%ebx
-# else
-
-# define PARMS 4 /* Arguments start directly above the return address. */
-# define ENTRANCE /* Nothing to save in the non-SHARED build. */
-# define RETURN_END ret
-# define RETURN RETURN_END
-# define JMPTBL(I, B) I /* Table entries are stored as I itself; B is ignored. */
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- absolute offsets. INDEX is a register that contains the index into the
- jump table. SCALE is the scale of INDEX. */
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- jmp *TABLE(, INDEX, SCALE)
-# endif
-
- .section .text.ssse3,"ax",@progbits
-# if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK) /* Checked entry: compares two stack arguments before any copying is done. */
- movl 12(%esp), %eax /* eax = third stack argument (presumably the byte count -- confirm against the caller) */
- cmpl %eax, 16(%esp) /* flags from (fourth stack argument) - eax; presumably the destination object size */
- jb HIDDEN_JUMPTARGET (__chk_fail) /* fourth argument below the third (unsigned): branch to __chk_fail */
-END (MEMCPY_CHK) /* No ret here: when the check passes, execution runs on into whatever is assembled next. */
-# endif
-ENTRY (MEMCPY) /* Main entry: loads the three arguments and picks a path by direction and size. */
- ENTRANCE /* Pushes %ebx in SHARED builds; expands to nothing otherwise. */
- movl LEN(%esp), %ecx /* ecx = byte count */
- movl SRC(%esp), %eax /* eax = source pointer */
- movl DEST(%esp), %edx /* edx = destination pointer */
-
-# ifdef USE_AS_MEMMOVE
- cmp %eax, %edx /* flags from dst - src */
- jb L(copy_forward) /* dst below src (unsigned): take the forward path */
- je L(fwd_write_0bytes) /* dst == src; target is outside this excerpt */
- cmp $32, %ecx
- jae L(memmove_bwd) /* dst above src and count >= 32: test for overlap */
- jmp L(bk_write_less32bytes_2) /* dst above src and count < 32; target is outside this excerpt */
-
- .p2align 4
-L(memmove_bwd):
- add %ecx, %eax /* eax = src + count */
- cmp %eax, %edx /* flags from dst - (src + count) */
- movl SRC(%esp), %eax /* reload src; mov leaves the flags intact */
- jb L(copy_backward) /* dst < src + count: the regions overlap; target is outside this excerpt */
-
-L(copy_forward):
-# endif
- cmp $48, %ecx
- jae L(48bytesormore) /* count >= 48 */
-
-L(fwd_write_less32bytes): /* Reached with count < 48, despite the "32" in the label name. */
-# ifndef USE_AS_MEMMOVE
- cmp %dl, %al /* compares only the low bytes of src and dst */
- jb L(bk_write) /* low byte of src below low byte of dst: use the other table */
-# endif
- add %ecx, %edx /* edx = dst + count */
- add %ecx, %eax /* eax = src + count */
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* dispatch on count, with eax/edx at the ends */
-# ifndef USE_AS_MEMMOVE
- .p2align 4
-L(bk_write):
- BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) /* dispatch on count, with eax/edx still at the starts */
-# endif
-
- .p2align 4
-L(48bytesormore): /* count >= 48: handle the first 16 bytes, align dst to 16, then dispatch on src alignment. */
-# ifndef USE_AS_MEMMOVE
- movlpd (%eax), %xmm0 /* copy the first 16 bytes right away, as two 8-byte halves */
- movlpd 8(%eax), %xmm1
- movlpd %xmm0, (%edx)
- movlpd %xmm1, 8(%edx)
-# else
- movdqu (%eax), %xmm0 /* only load the first 16 bytes; the shl_N paths store them later */
-# endif
- PUSH (%edi) /* save edi; it is used as scratch from here on */
- movl %edx, %edi /* edi = original dst */
- and $-16, %edx
- add $16, %edx /* edx = first 16-byte boundary strictly above dst */
- sub %edx, %edi /* edi = dst - aligned dst, in [-16, -1] */
- add %edi, %ecx /* count -= bytes skipped to reach the boundary */
- sub %edi, %eax /* src += the same number of bytes */
-
-# ifdef SHARED_CACHE_SIZE_HALF
- cmp $SHARED_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = base for the @GOTOFF access below */
- cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_shared_cache_size_half, %ecx
-# endif
-# endif
-
- mov %eax, %edi /* mov keeps the flags of the cmp above */
- jae L(large_page) /* count >= the shared-cache threshold; target is outside this excerpt */
- and $0xf, %edi /* edi = src offset within its 16-byte block */
- jz L(shl_0) /* src is 16-byte aligned as well */
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) /* dispatch on the src misalignment (1..15) */
-
- .p2align 4
-L(shl_0): /* src and dst are both 16-byte aligned here; count >= 32 (>= 48 at entry, minus at most 16). */
-# ifdef USE_AS_MEMMOVE
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movdqu %xmm0, (%edi) /* store the first 16 bytes loaded at L(48bytesormore) */
-# endif
- xor %edi, %edi /* edi = running byte offset */
- cmp $127, %ecx
- ja L(shl_0_gobble) /* count > 127: use the 128-byte loops */
- lea -32(%ecx), %ecx /* bias count so each "sub $32" borrows once fewer than 32 bytes would remain */
-
- .p2align 4
-L(shl_0_loop): /* Straight-line 32-byte aligned copies; no backward branch to this label is visible in this excerpt. */
- movdqa (%eax, %edi), %xmm0
- movdqa 16(%eax, %edi), %xmm1
- sub $32, %ecx /* flags survive to the jb: the SSE moves and lea do not modify them */
- movdqa %xmm0, (%edx, %edi)
- movdqa %xmm1, 16(%edx, %edi)
- lea 32(%edi), %edi
- jb L(shl_0_end) /* fewer than 32 bytes left */
-
- movdqa (%eax, %edi), %xmm0
- movdqa 16(%eax, %edi), %xmm1
- sub $32, %ecx
- movdqa %xmm0, (%edx, %edi)
- movdqa %xmm1, 16(%edx, %edi)
- lea 32(%edi), %edi
- jb L(shl_0_end) /* fewer than 32 bytes left */
-
- movdqa (%eax, %edi), %xmm0
- movdqa 16(%eax, %edi), %xmm1
- sub $32, %ecx
- movdqa %xmm0, (%edx, %edi)
- movdqa %xmm1, 16(%edx, %edi)
- lea 32(%edi), %edi
- jb L(shl_0_end) /* fewer than 32 bytes left */
-
- movdqa (%eax, %edi), %xmm0
- movdqa 16(%eax, %edi), %xmm1
- sub $32, %ecx
- movdqa %xmm0, (%edx, %edi)
- movdqa %xmm1, 16(%edx, %edi)
- lea 32(%edi), %edi
-
-L(shl_0_end):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */
- add %ecx, %edi /* edi = bytes copied so far + bytes still to copy */
- add %edi, %edx /* edx = end of dst */
- add %edi, %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_0_gobble): /* Aligned src and dst, count > 127: choose between the plain and the prefetching 128-byte loop. */
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = base for the @GOTOFF access below */
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- POP (%edi) /* edi is not needed on these paths; popl leaves the flags intact */
- lea -128(%ecx), %ecx /* bias count for the 128-byte loops; lea leaves the flags intact */
- jae L(shl_0_gobble_mem_loop) /* count >= the data-cache threshold: use the prefetching loop */
-
- .p2align 4
-L(shl_0_gobble_cache_loop): /* 128 bytes per iteration through xmm0..xmm7, all aligned accesses. */
- movdqa (%eax), %xmm0
- movdqa 0x10(%eax), %xmm1
- movdqa 0x20(%eax), %xmm2
- movdqa 0x30(%eax), %xmm3
- movdqa 0x40(%eax), %xmm4
- movdqa 0x50(%eax), %xmm5
- movdqa 0x60(%eax), %xmm6
- movdqa 0x70(%eax), %xmm7
- lea 0x80(%eax), %eax
- sub $128, %ecx /* flags survive to the jae: the stores and lea below do not modify them */
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- movdqa %xmm2, 0x20(%edx)
- movdqa %xmm3, 0x30(%edx)
- movdqa %xmm4, 0x40(%edx)
- movdqa %xmm5, 0x50(%edx)
- movdqa %xmm6, 0x60(%edx)
- movdqa %xmm7, 0x70(%edx)
- lea 0x80(%edx), %edx
-
- jae L(shl_0_gobble_cache_loop) /* no borrow: at least 128 more bytes remain */
- cmp $-0x40, %ecx /* ecx is negative here; ecx + 128 is the number of bytes left */
- lea 0x80(%ecx), %ecx /* ecx = bytes left (0..127); lea leaves the flags intact */
- jl L(shl_0_cache_less_64bytes) /* signed compare: fewer than 64 bytes left */
-
- movdqa (%eax), %xmm0 /* copy 64 bytes */
- sub $0x40, %ecx
- movdqa 0x10(%eax), %xmm1
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- movdqa 0x20(%eax), %xmm0
- movdqa 0x30(%eax), %xmm1
- add $0x40, %eax
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm1, 0x30(%edx)
- add $0x40, %edx
-
-L(shl_0_cache_less_64bytes):
- cmp $0x20, %ecx
- jb L(shl_0_cache_less_32bytes)
- movdqa (%eax), %xmm0 /* copy 32 bytes */
- sub $0x20, %ecx
- movdqa 0x10(%eax), %xmm1
- add $0x20, %eax
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- add $0x20, %edx
-
-L(shl_0_cache_less_32bytes):
- cmp $0x10, %ecx
- jb L(shl_0_cache_less_16bytes)
- sub $0x10, %ecx
- movdqa (%eax), %xmm0 /* copy 16 bytes */
- add $0x10, %eax
- movdqa %xmm0, (%edx)
- add $0x10, %edx
-
-L(shl_0_cache_less_16bytes):
- add %ecx, %edx /* edx = end of dst */
- add %ecx, %eax /* eax = end of src */
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* tail copy of the last ecx (< 16) bytes */
-
- .p2align 4
-L(shl_0_gobble_mem_loop): /* Like the cache loop, 128 aligned bytes per iteration, but with prefetches ahead of src and dst. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x280(%eax)
- prefetcht0 0x1c0(%edx)
-
- movdqa (%eax), %xmm0
- movdqa 0x10(%eax), %xmm1
- movdqa 0x20(%eax), %xmm2
- movdqa 0x30(%eax), %xmm3
- movdqa 0x40(%eax), %xmm4
- movdqa 0x50(%eax), %xmm5
- movdqa 0x60(%eax), %xmm6
- movdqa 0x70(%eax), %xmm7
- lea 0x80(%eax), %eax
- sub $0x80, %ecx /* flags survive to the jae: the stores and lea below do not modify them */
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- movdqa %xmm2, 0x20(%edx)
- movdqa %xmm3, 0x30(%edx)
- movdqa %xmm4, 0x40(%edx)
- movdqa %xmm5, 0x50(%edx)
- movdqa %xmm6, 0x60(%edx)
- movdqa %xmm7, 0x70(%edx)
- lea 0x80(%edx), %edx
-
- jae L(shl_0_gobble_mem_loop) /* no borrow: at least 128 more bytes remain */
- cmp $-0x40, %ecx /* ecx is negative here; ecx + 128 is the number of bytes left */
- lea 0x80(%ecx), %ecx /* ecx = bytes left (0..127); lea leaves the flags intact */
- jl L(shl_0_mem_less_64bytes) /* signed compare: fewer than 64 bytes left */
-
- movdqa (%eax), %xmm0 /* copy 64 bytes */
- sub $0x40, %ecx
- movdqa 0x10(%eax), %xmm1
-
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
-
- movdqa 0x20(%eax), %xmm0
- movdqa 0x30(%eax), %xmm1
- add $0x40, %eax
-
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm1, 0x30(%edx)
- add $0x40, %edx
-
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %ecx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%eax), %xmm0 /* copy 32 bytes */
- sub $0x20, %ecx
- movdqa 0x10(%eax), %xmm1
- add $0x20, %eax
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- add $0x20, %edx
-
-L(shl_0_mem_less_32bytes):
- cmp $0x10, %ecx
- jb L(shl_0_mem_less_16bytes)
- sub $0x10, %ecx
- movdqa (%eax), %xmm0 /* copy 16 bytes */
- add $0x10, %eax
- movdqa %xmm0, (%edx)
- add $0x10, %edx
-
-L(shl_0_mem_less_16bytes):
- add %ecx, %edx /* edx = end of dst */
- add %ecx, %eax /* eax = end of src */
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) /* tail copy of the last ecx (< 16) bytes */
-
- .p2align 4
-L(shl_1): /* src is 1 byte past a 16-byte boundary (the movaps below needs eax - 1 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -1(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -1(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_1_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl1LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $1. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 15(%eax), %xmm2 /* eax is 1 past a boundary, so eax + 15 is aligned */
- movaps 31(%eax), %xmm3
- movaps 47(%eax), %xmm4
- movaps 63(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $1, %xmm4, %xmm5
- palignr $1, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $1, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $1, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl1LoopStart) /* continue while more than 64 bytes remain */
-
-L(Shl1LoopLeave):
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 15(%eax), %xmm2 /* one more 32-byte step */
- movaps 31(%eax), %xmm3
- palignr $1, %xmm2, %xmm3
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_1_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -1(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_1_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $1, %xmm2, %xmm3
- palignr $1, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_1_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $1, %xmm2, %xmm3
- palignr $1, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_1_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_1_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 1(%edi, %eax), %eax /* eax = end of src (re-adds the 1 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_2): /* src is 2 bytes past a 16-byte boundary (the movaps below needs eax - 2 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -2(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -2(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_2_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl2LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $2. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 14(%eax), %xmm2 /* eax is 2 past a boundary, so eax + 14 is aligned */
- movaps 30(%eax), %xmm3
- movaps 46(%eax), %xmm4
- movaps 62(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $2, %xmm4, %xmm5
- palignr $2, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $2, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $2, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl2LoopStart) /* continue while more than 64 bytes remain */
-
-L(Shl2LoopLeave):
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 14(%eax), %xmm2 /* one more 32-byte step */
- movaps 30(%eax), %xmm3
- palignr $2, %xmm2, %xmm3
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_2_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -2(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_2_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $2, %xmm2, %xmm3
- palignr $2, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_2_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $2, %xmm2, %xmm3
- palignr $2, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_2_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_2_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 2(%edi, %eax), %eax /* eax = end of src (re-adds the 2 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_3): /* src is 3 bytes past a 16-byte boundary (the movaps below needs eax - 3 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -3(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -3(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_3_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl3LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $3. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 13(%eax), %xmm2 /* eax is 3 past a boundary, so eax + 13 is aligned */
- movaps 29(%eax), %xmm3
- movaps 45(%eax), %xmm4
- movaps 61(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $3, %xmm4, %xmm5
- palignr $3, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $3, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $3, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl3LoopStart) /* continue while more than 64 bytes remain */
-
-L(Shl3LoopLeave):
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 13(%eax), %xmm2 /* one more 32-byte step */
- movaps 29(%eax), %xmm3
- palignr $3, %xmm2, %xmm3
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_3_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -3(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_3_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $3, %xmm2, %xmm3
- palignr $3, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jb L(sh_3_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $3, %xmm2, %xmm3
- palignr $3, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jae L(sh_3_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_3_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 3(%edi, %eax), %eax /* eax = end of src (re-adds the 3 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_4): /* src is 4 bytes past a 16-byte boundary (the movaps below needs eax - 4 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -4(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -4(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_4_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl4LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $4. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 12(%eax), %xmm2 /* eax is 4 past a boundary, so eax + 12 is aligned */
- movaps 28(%eax), %xmm3
- movaps 44(%eax), %xmm4
- movaps 60(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $4, %xmm4, %xmm5
- palignr $4, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $4, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $4, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl4LoopStart) /* continue while more than 64 bytes remain */
-
-L(Shl4LoopLeave):
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 12(%eax), %xmm2 /* one more 32-byte step */
- movaps 28(%eax), %xmm3
- palignr $4, %xmm2, %xmm3
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_4_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -4(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_4_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $4, %xmm2, %xmm3
- palignr $4, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jb L(sh_4_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $4, %xmm2, %xmm3
- palignr $4, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jae L(sh_4_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_4_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 4(%edi, %eax), %eax /* eax = end of src (re-adds the 4 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_5): /* src is 5 bytes past a 16-byte boundary (the movaps below needs eax - 5 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -5(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -5(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_5_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl5LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $5. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 11(%eax), %xmm2 /* eax is 5 past a boundary, so eax + 11 is aligned */
- movaps 27(%eax), %xmm3
- movaps 43(%eax), %xmm4
- movaps 59(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $5, %xmm4, %xmm5
- palignr $5, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $5, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $5, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl5LoopStart) /* continue while more than 64 bytes remain */
-
-L(Shl5LoopLeave):
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 11(%eax), %xmm2 /* one more 32-byte step */
- movaps 27(%eax), %xmm3
- palignr $5, %xmm2, %xmm3
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_5_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -5(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_5_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $5, %xmm2, %xmm3
- palignr $5, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jb L(sh_5_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $5, %xmm2, %xmm3
- palignr $5, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jae L(sh_5_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_5_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 5(%edi, %eax), %eax /* eax = end of src (re-adds the 5 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_6): /* src is 6 bytes past a 16-byte boundary (the movaps below needs eax - 6 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -6(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -6(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_6_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl6LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $6. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 10(%eax), %xmm2 /* eax is 6 past a boundary, so eax + 10 is aligned */
- movaps 26(%eax), %xmm3
- movaps 42(%eax), %xmm4
- movaps 58(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $6, %xmm4, %xmm5
- palignr $6, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $6, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $6, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl6LoopStart) /* continue while more than 64 bytes remain */
-
-L(Shl6LoopLeave):
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 10(%eax), %xmm2 /* one more 32-byte step */
- movaps 26(%eax), %xmm3
- palignr $6, %xmm2, %xmm3
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_6_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -6(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_6_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $6, %xmm2, %xmm3
- palignr $6, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jb L(sh_6_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $6, %xmm2, %xmm3
- palignr $6, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jae L(sh_6_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_6_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 6(%edi, %eax), %eax /* eax = end of src (re-adds the 6 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_7): /* src is 7 bytes past a 16-byte boundary (the movaps below needs eax - 7 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -7(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -7(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_7_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl7LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $7. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 9(%eax), %xmm2 /* eax is 7 past a boundary, so eax + 9 is aligned */
- movaps 25(%eax), %xmm3
- movaps 41(%eax), %xmm4
- movaps 57(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $7, %xmm4, %xmm5
- palignr $7, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $7, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $7, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl7LoopStart) /* continue while more than 64 bytes remain */
-
-L(Shl7LoopLeave):
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 9(%eax), %xmm2 /* one more 32-byte step */
- movaps 25(%eax), %xmm3
- palignr $7, %xmm2, %xmm3
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_7_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -7(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_7_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $7, %xmm2, %xmm3
- palignr $7, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_7_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $7, %xmm2, %xmm3
- palignr $7, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_7_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_7_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 7(%edi, %eax), %eax /* eax = end of src (re-adds the 7 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_8): /* src is 8 bytes past a 16-byte boundary (the movaps below needs eax - 8 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -8(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -8(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_8_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl8LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $8. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 8(%eax), %xmm2 /* eax is 8 past a boundary, so eax + 8 is aligned */
- movaps 24(%eax), %xmm3
- movaps 40(%eax), %xmm4
- movaps 56(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $8, %xmm4, %xmm5
- palignr $8, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $8, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $8, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl8LoopStart) /* continue while more than 64 bytes remain */
-
-L(LoopLeave8): /* Same role as the L(ShlNLoopLeave) labels of the sibling sections; only the name differs. */
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 8(%eax), %xmm2 /* one more 32-byte step */
- movaps 24(%eax), %xmm3
- palignr $8, %xmm2, %xmm3
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_8_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -8(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_8_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $8, %xmm2, %xmm3
- palignr $8, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_8_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $8, %xmm2, %xmm3
- palignr $8, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_8_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_8_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 8(%edi, %eax), %eax /* eax = end of src (re-adds the 8 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_9): /* src is 9 bytes past a 16-byte boundary (the movaps below needs eax - 9 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -9(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -9(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_9_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl9LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $9. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 7(%eax), %xmm2 /* eax is 9 past a boundary, so eax + 7 is aligned */
- movaps 23(%eax), %xmm3
- movaps 39(%eax), %xmm4
- movaps 55(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $9, %xmm4, %xmm5
- palignr $9, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $9, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $9, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl9LoopStart) /* continue while more than 64 bytes remain */
-
-L(Shl9LoopLeave):
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 7(%eax), %xmm2 /* one more 32-byte step */
- movaps 23(%eax), %xmm3
- palignr $9, %xmm2, %xmm3
- palignr $9, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_9_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -9(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_9_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $9, %xmm2, %xmm3
- palignr $9, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_9_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $9, %xmm2, %xmm3
- palignr $9, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_9_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_9_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 9(%edi, %eax), %eax /* eax = end of src (re-adds the 9 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_10): /* src is 10 bytes past a 16-byte boundary (the movaps below needs eax - 10 aligned); dst is aligned. */
-# ifndef USE_AS_MEMMOVE
- movaps -10(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
-# else
- movl DEST+4(%esp), %edi /* original dst; +4 accounts for the pushed edi */
- movaps -10(%eax), %xmm1 /* xmm1 = aligned block holding the first source byte */
- movdqu %xmm0, (%edi) /* store the 16 head bytes kept in xmm0 */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_10_no_prefetch) /* count below the data-cache threshold */
-
- lea -64(%ecx), %ecx /* bias count for the 64-byte loop */
-
- .p2align 4
-L(Shl10LoopStart): /* 64 bytes per iteration: four aligned loads, realigned with palignr $10. */
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 6(%eax), %xmm2 /* eax is 10 past a boundary, so eax + 6 is aligned */
- movaps 22(%eax), %xmm3
- movaps 38(%eax), %xmm4
- movaps 54(%eax), %xmm5
- movaps %xmm5, %xmm7 /* keep the last block as carry for the next iteration */
- palignr $10, %xmm4, %xmm5
- palignr $10, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $10, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $10, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1 /* the carry becomes the new previous block */
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl10LoopStart) /* continue while more than 64 bytes remain */
-
-L(Shl10LoopLeave):
- add $32, %ecx /* ecx = bytes left - 32 */
- jle L(shl_end_0) /* 32 or fewer bytes left; target is outside this excerpt */
-
- movaps 6(%eax), %xmm2 /* one more 32-byte step */
- movaps 22(%eax), %xmm3
- palignr $10, %xmm2, %xmm3
- palignr $10, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* edx = end of dst */
- lea 32(%eax, %ecx), %eax /* eax = end of src */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(sh_10_no_prefetch):
- lea -32(%ecx), %ecx /* bias count for the 32-byte steps */
- lea -10(%eax), %eax /* round src down to its 16-byte boundary */
- xor %edi, %edi /* edi = running byte offset */
-
- .p2align 4
-L(sh_10_no_prefetch_loop): /* Two 32-byte steps per iteration; the carry block alternates between xmm1 and xmm4. */
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* flags survive to the jb: the SSE instructions and lea do not modify them */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* carry for the second step */
- palignr $10, %xmm2, %xmm3
- palignr $10, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_10_end_no_prefetch_loop) /* fewer than 32 bytes left */
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1 /* carry for the next iteration's first step */
- palignr $10, %xmm2, %xmm3
- palignr $10, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_10_no_prefetch_loop) /* at least 32 bytes left */
-
-L(sh_10_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy (< 32) */
- add %ecx, %edi
- add %edi, %edx /* edx = end of dst */
- lea 10(%edi, %eax), %eax /* eax = end of src (re-adds the 10 removed above) */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail copy, dispatched on ecx */
-
- CFI_PUSH (%edi) /* CFI only: the code that follows is entered with edi still pushed */
-
- .p2align 4
-L(shl_11):
-# ifndef USE_AS_MEMMOVE
- movaps -11(%eax), %xmm1
-# else
- movl DEST+4(%esp), %edi
- movaps -11(%eax), %xmm1
- movdqu %xmm0, (%edi)
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_11_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl11LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 5(%eax), %xmm2
- movaps 21(%eax), %xmm3
- movaps 37(%eax), %xmm4
- movaps 53(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $11, %xmm4, %xmm5
- palignr $11, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $11, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $11, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl11LoopStart)
-
-L(Shl11LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 5(%eax), %xmm2
- movaps 21(%eax), %xmm3
- palignr $11, %xmm2, %xmm3
- palignr $11, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(sh_11_no_prefetch): copy loop without prefetch, entered from L(shl_11) with %xmm1 already loaded; aligned movdqa loads are merged with palignr $11.  */
- .p2align 4
-L(sh_11_no_prefetch):
- lea -32(%ecx), %ecx /* Bias the count by 32.  */
- lea -11(%eax), %eax /* Step the source back by 11; the movdqa loads below need 16-byte aligned addresses.  */
- xor %edi, %edi /* %edi = running byte offset, starting at 0.  */
-
- .p2align 4
-L(sh_11_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* These flags reach the jb below: the SSE moves, palignr and lea do not write EFLAGS.  */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* Keep the raw vector as carry-in for the second half.  */
- palignr $11, %xmm2, %xmm3 /* dst = bytes 11..26 of dst:src.  */
- palignr $11, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_11_end_no_prefetch_loop) /* Leave once the sub above borrowed.  */
-
- movdqa 16(%eax, %edi), %xmm2 /* Second half: same step with xmm4 as carry-in and xmm1 as carry-out.  */
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $11, %xmm2, %xmm3
- palignr $11, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_11_no_prefetch_loop) /* Continue while the sub did not borrow.  */
-
-L(sh_11_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* Add back the 32 subtracted on entry.  */
- add %ecx, %edi /* %edi = bytes stored by the loop + %ecx.  */
- add %edi, %edx
- lea 11(%edi, %eax), %eax /* Also re-applies the 11 removed on entry.  */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(shl_12): forward copy that reads the source with aligned loads at (%eax - 12) + 16k and re-joins them with palignr $12.  NOTE(review): entry state (%xmm0, %ecx, alignment of %edx) comes from outside this chunk.  */
- .p2align 4
-L(shl_12):
-# ifndef USE_AS_MEMMOVE
- movaps -12(%eax), %xmm1 /* Aligned load; requires (%eax - 12) to be 16-byte aligned.  */
-# else
- movl DEST+4(%esp), %edi /* Destination argument; the +4 presumably skips the pushed %edi -- confirm against PARMS.  */
- movaps -12(%eax), %xmm1
- movdqu %xmm0, (%edi) /* Unaligned store of %xmm0 to the destination start; %xmm0 is loaded outside this chunk.  */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx) /* NOTE(review): overwrites %ebx; the macro and any save/restore of %ebx are outside this chunk.  */
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_12_no_prefetch) /* %ecx below the threshold (unsigned): use the loop without prefetch.  */
-
- lea -64(%ecx), %ecx /* Bias the count by 64.  */
-
- .p2align 4
-L(Shl12LoopStart):
- prefetcht0 0x1c0(%eax) /* Prefetch 0x1c0 bytes ahead of both pointers.  */
- prefetcht0 0x1c0(%edx)
- movaps 4(%eax), %xmm2 /* Four aligned loads at offsets 4, 20, 36, 52 (= 16k - 12).  */
- movaps 20(%eax), %xmm3
- movaps 36(%eax), %xmm4
- movaps 52(%eax), %xmm5
- movaps %xmm5, %xmm7 /* Keep the last raw vector; it becomes xmm1 for the next iteration.  */
- palignr $12, %xmm4, %xmm5 /* Each palignr: dst = bytes 12..27 of dst:src.  */
- palignr $12, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $12, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $12, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl12LoopStart) /* Loop while %ecx was above 64 before the sub (unsigned).  */
-
-L(Shl12LoopLeave):
- add $32, %ecx /* Sets the flags tested by the jle below.  */
- jle L(shl_end_0) /* Result <= 0 (signed): skip the 32-byte step.  */
-
- movaps 4(%eax), %xmm2
- movaps 20(%eax), %xmm3
- palignr $12, %xmm2, %xmm3
- palignr $12, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* Advance both pointers by 32 + %ecx.  */
- lea 32(%eax, %ecx), %eax
- POP (%edi) /* popl %edi plus the matching CFI_POP.  */
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(sh_12_no_prefetch): copy loop without prefetch, entered from L(shl_12) with %xmm1 already loaded; aligned movdqa loads are merged with palignr $12.  */
- .p2align 4
-L(sh_12_no_prefetch):
- lea -32(%ecx), %ecx /* Bias the count by 32.  */
- lea -12(%eax), %eax /* Step the source back by 12; the movdqa loads below need 16-byte aligned addresses.  */
- xor %edi, %edi /* %edi = running byte offset, starting at 0.  */
-
- .p2align 4
-L(sh_12_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* These flags reach the jb below: the SSE moves, palignr and lea do not write EFLAGS.  */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* Keep the raw vector as carry-in for the second half.  */
- palignr $12, %xmm2, %xmm3 /* dst = bytes 12..27 of dst:src.  */
- palignr $12, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_12_end_no_prefetch_loop) /* Leave once the sub above borrowed.  */
-
- movdqa 16(%eax, %edi), %xmm2 /* Second half: same step with xmm4 as carry-in and xmm1 as carry-out.  */
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $12, %xmm2, %xmm3
- palignr $12, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_12_no_prefetch_loop) /* Continue while the sub did not borrow.  */
-
-L(sh_12_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* Add back the 32 subtracted on entry.  */
- add %ecx, %edi /* %edi = bytes stored by the loop + %ecx.  */
- add %edi, %edx
- lea 12(%edi, %eax), %eax /* Also re-applies the 12 removed on entry.  */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(shl_13): forward copy that reads the source with aligned loads at (%eax - 13) + 16k and re-joins them with palignr $13.  NOTE(review): entry state (%xmm0, %ecx, alignment of %edx) comes from outside this chunk.  */
- .p2align 4
-L(shl_13):
-# ifndef USE_AS_MEMMOVE
- movaps -13(%eax), %xmm1 /* Aligned load; requires (%eax - 13) to be 16-byte aligned.  */
-# else
- movl DEST+4(%esp), %edi /* Destination argument; the +4 presumably skips the pushed %edi -- confirm against PARMS.  */
- movaps -13(%eax), %xmm1
- movdqu %xmm0, (%edi) /* Unaligned store of %xmm0 to the destination start; %xmm0 is loaded outside this chunk.  */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx) /* NOTE(review): overwrites %ebx; the macro and any save/restore of %ebx are outside this chunk.  */
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_13_no_prefetch) /* %ecx below the threshold (unsigned): use the loop without prefetch.  */
-
- lea -64(%ecx), %ecx /* Bias the count by 64.  */
-
- .p2align 4
-L(Shl13LoopStart):
- prefetcht0 0x1c0(%eax) /* Prefetch 0x1c0 bytes ahead of both pointers.  */
- prefetcht0 0x1c0(%edx)
- movaps 3(%eax), %xmm2 /* Four aligned loads at offsets 3, 19, 35, 51 (= 16k - 13).  */
- movaps 19(%eax), %xmm3
- movaps 35(%eax), %xmm4
- movaps 51(%eax), %xmm5
- movaps %xmm5, %xmm7 /* Keep the last raw vector; it becomes xmm1 for the next iteration.  */
- palignr $13, %xmm4, %xmm5 /* Each palignr: dst = bytes 13..28 of dst:src.  */
- palignr $13, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $13, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $13, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl13LoopStart) /* Loop while %ecx was above 64 before the sub (unsigned).  */
-
-L(Shl13LoopLeave):
- add $32, %ecx /* Sets the flags tested by the jle below.  */
- jle L(shl_end_0) /* Result <= 0 (signed): skip the 32-byte step.  */
-
- movaps 3(%eax), %xmm2
- movaps 19(%eax), %xmm3
- palignr $13, %xmm2, %xmm3
- palignr $13, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* Advance both pointers by 32 + %ecx.  */
- lea 32(%eax, %ecx), %eax
- POP (%edi) /* popl %edi plus the matching CFI_POP.  */
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(sh_13_no_prefetch): copy loop without prefetch, entered from L(shl_13) with %xmm1 already loaded; aligned movdqa loads are merged with palignr $13.  */
- .p2align 4
-L(sh_13_no_prefetch):
- lea -32(%ecx), %ecx /* Bias the count by 32.  */
- lea -13(%eax), %eax /* Step the source back by 13; the movdqa loads below need 16-byte aligned addresses.  */
- xor %edi, %edi /* %edi = running byte offset, starting at 0.  */
-
- .p2align 4
-L(sh_13_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* These flags reach the jb below: the SSE moves, palignr and lea do not write EFLAGS.  */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* Keep the raw vector as carry-in for the second half.  */
- palignr $13, %xmm2, %xmm3 /* dst = bytes 13..28 of dst:src.  */
- palignr $13, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_13_end_no_prefetch_loop) /* Leave once the sub above borrowed.  */
-
- movdqa 16(%eax, %edi), %xmm2 /* Second half: same step with xmm4 as carry-in and xmm1 as carry-out.  */
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $13, %xmm2, %xmm3
- palignr $13, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_13_no_prefetch_loop) /* Continue while the sub did not borrow.  */
-
-L(sh_13_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* Add back the 32 subtracted on entry.  */
- add %ecx, %edi /* %edi = bytes stored by the loop + %ecx.  */
- add %edi, %edx
- lea 13(%edi, %eax), %eax /* Also re-applies the 13 removed on entry.  */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(shl_14): forward copy that reads the source with aligned loads at (%eax - 14) + 16k and re-joins them with palignr $14.  NOTE(review): entry state (%xmm0, %ecx, alignment of %edx) comes from outside this chunk.  */
- .p2align 4
-L(shl_14):
-# ifndef USE_AS_MEMMOVE
- movaps -14(%eax), %xmm1 /* Aligned load; requires (%eax - 14) to be 16-byte aligned.  */
-# else
- movl DEST+4(%esp), %edi /* Destination argument; the +4 presumably skips the pushed %edi -- confirm against PARMS.  */
- movaps -14(%eax), %xmm1
- movdqu %xmm0, (%edi) /* Unaligned store of %xmm0 to the destination start; %xmm0 is loaded outside this chunk.  */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx) /* NOTE(review): overwrites %ebx; the macro and any save/restore of %ebx are outside this chunk.  */
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_14_no_prefetch) /* %ecx below the threshold (unsigned): use the loop without prefetch.  */
-
- lea -64(%ecx), %ecx /* Bias the count by 64.  */
-
- .p2align 4
-L(Shl14LoopStart):
- prefetcht0 0x1c0(%eax) /* Prefetch 0x1c0 bytes ahead of both pointers.  */
- prefetcht0 0x1c0(%edx)
- movaps 2(%eax), %xmm2 /* Four aligned loads at offsets 2, 18, 34, 50 (= 16k - 14).  */
- movaps 18(%eax), %xmm3
- movaps 34(%eax), %xmm4
- movaps 50(%eax), %xmm5
- movaps %xmm5, %xmm7 /* Keep the last raw vector; it becomes xmm1 for the next iteration.  */
- palignr $14, %xmm4, %xmm5 /* Each palignr: dst = bytes 14..29 of dst:src.  */
- palignr $14, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $14, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $14, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl14LoopStart) /* Loop while %ecx was above 64 before the sub (unsigned).  */
-
-L(Shl14LoopLeave):
- add $32, %ecx /* Sets the flags tested by the jle below.  */
- jle L(shl_end_0) /* Result <= 0 (signed): skip the 32-byte step.  */
-
- movaps 2(%eax), %xmm2
- movaps 18(%eax), %xmm3
- palignr $14, %xmm2, %xmm3
- palignr $14, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* Advance both pointers by 32 + %ecx.  */
- lea 32(%eax, %ecx), %eax
- POP (%edi) /* popl %edi plus the matching CFI_POP.  */
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(sh_14_no_prefetch): copy loop without prefetch, entered from L(shl_14) with %xmm1 already loaded; aligned movdqa loads are merged with palignr $14.  */
- .p2align 4
-L(sh_14_no_prefetch):
- lea -32(%ecx), %ecx /* Bias the count by 32.  */
- lea -14(%eax), %eax /* Step the source back by 14; the movdqa loads below need 16-byte aligned addresses.  */
- xor %edi, %edi /* %edi = running byte offset, starting at 0.  */
-
- .p2align 4
-L(sh_14_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* These flags reach the jb below: the SSE moves, palignr and lea do not write EFLAGS.  */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* Keep the raw vector as carry-in for the second half.  */
- palignr $14, %xmm2, %xmm3 /* dst = bytes 14..29 of dst:src.  */
- palignr $14, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_14_end_no_prefetch_loop) /* Leave once the sub above borrowed.  */
-
- movdqa 16(%eax, %edi), %xmm2 /* Second half: same step with xmm4 as carry-in and xmm1 as carry-out.  */
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $14, %xmm2, %xmm3
- palignr $14, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_14_no_prefetch_loop) /* Continue while the sub did not borrow.  */
-
-L(sh_14_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* Add back the 32 subtracted on entry.  */
- add %ecx, %edi /* %edi = bytes stored by the loop + %ecx.  */
- add %edi, %edx
- lea 14(%edi, %eax), %eax /* Also re-applies the 14 removed on entry.  */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(shl_15): forward copy that reads the source with aligned loads at (%eax - 15) + 16k and re-joins them with palignr $15.  NOTE(review): entry state (%xmm0, %ecx, alignment of %edx) comes from outside this chunk.  */
- .p2align 4
-L(shl_15):
-# ifndef USE_AS_MEMMOVE
- movaps -15(%eax), %xmm1 /* Aligned load; requires (%eax - 15) to be 16-byte aligned.  */
-# else
- movl DEST+4(%esp), %edi /* Destination argument; the +4 presumably skips the pushed %edi -- confirm against PARMS.  */
- movaps -15(%eax), %xmm1
- movdqu %xmm0, (%edi) /* Unaligned store of %xmm0 to the destination start; %xmm0 is loaded outside this chunk.  */
-# endif
-# ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-# else
-# ifdef SHARED
- SETUP_PIC_REG(bx) /* NOTE(review): overwrites %ebx; the macro and any save/restore of %ebx are outside this chunk.  */
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-# endif
- jb L(sh_15_no_prefetch) /* %ecx below the threshold (unsigned): use the loop without prefetch.  */
-
- lea -64(%ecx), %ecx /* Bias the count by 64.  */
-
- .p2align 4
-L(Shl15LoopStart):
- prefetcht0 0x1c0(%eax) /* Prefetch 0x1c0 bytes ahead of both pointers.  */
- prefetcht0 0x1c0(%edx)
- movaps 1(%eax), %xmm2 /* Four aligned loads at offsets 1, 17, 33, 49 (= 16k - 15).  */
- movaps 17(%eax), %xmm3
- movaps 33(%eax), %xmm4
- movaps 49(%eax), %xmm5
- movaps %xmm5, %xmm7 /* Keep the last raw vector; it becomes xmm1 for the next iteration.  */
- palignr $15, %xmm4, %xmm5 /* Each palignr: dst = bytes 15..30 of dst:src.  */
- palignr $15, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $15, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $15, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl15LoopStart) /* Loop while %ecx was above 64 before the sub (unsigned).  */
-
-L(Shl15LoopLeave):
- add $32, %ecx /* Sets the flags tested by the jle below.  */
- jle L(shl_end_0) /* Result <= 0 (signed): skip the 32-byte step.  */
-
- movaps 1(%eax), %xmm2
- movaps 17(%eax), %xmm3
- palignr $15, %xmm2, %xmm3
- palignr $15, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx /* Advance both pointers by 32 + %ecx.  */
- lea 32(%eax, %ecx), %eax
- POP (%edi) /* popl %edi plus the matching CFI_POP.  */
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(sh_15_no_prefetch): copy loop without prefetch, entered from L(shl_15) with %xmm1 already loaded; aligned movdqa loads are merged with palignr $15.  */
- .p2align 4
-L(sh_15_no_prefetch):
- lea -32(%ecx), %ecx /* Bias the count by 32.  */
- lea -15(%eax), %eax /* Step the source back by 15; the movdqa loads below need 16-byte aligned addresses.  */
- xor %edi, %edi /* %edi = running byte offset, starting at 0.  */
-
- .p2align 4
-L(sh_15_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx /* These flags reach the jb below: the SSE moves, palignr and lea do not write EFLAGS.  */
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4 /* Keep the raw vector as carry-in for the second half.  */
- palignr $15, %xmm2, %xmm3 /* dst = bytes 15..30 of dst:src.  */
- palignr $15, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_15_end_no_prefetch_loop) /* Leave once the sub above borrowed.  */
-
- movdqa 16(%eax, %edi), %xmm2 /* Second half: same step with xmm4 as carry-in and xmm1 as carry-out.  */
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $15, %xmm2, %xmm3
- palignr $15, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_15_no_prefetch_loop) /* Continue while the sub did not borrow.  */
-
-L(sh_15_end_no_prefetch_loop):
- lea 32(%ecx), %ecx /* Add back the 32 subtracted on entry.  */
- add %ecx, %edi /* %edi = bytes stored by the loop + %ecx.  */
- add %edi, %edx
- lea 15(%edi, %eax), %eax /* Also re-applies the 15 removed on entry.  */
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: the code below runs with %edi pushed, after the POP just above retracted it.  */
- /* L(shl_end_0): shared exit taken by the `jle' in each L(ShlNLoopLeave) when no further 32-byte step is done.  */
- .p2align 4
-L(shl_end_0):
- lea 32(%ecx), %ecx /* Add 32 to the (non-positive) count left by the caller's `add $32'.  */
- lea (%edx, %ecx), %edx /* Move both pointers forward by the resulting %ecx.  */
- lea (%eax, %ecx), %eax
- POP (%edi) /* popl %edi plus the matching CFI_POP.  */
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- .p2align 4 /* Tail chain for 44/36/28/20/12/4 bytes; %eax and %edx point just past the end, hence the negative offsets.  */
-L(fwd_write_44bytes): /* Entering at L(fwd_write_Nbytes) copies the last N bytes: 8 per movq step, then falls through.  */
- movq -44(%eax), %xmm0
- movq %xmm0, -44(%edx)
-L(fwd_write_36bytes):
- movq -36(%eax), %xmm0
- movq %xmm0, -36(%edx)
-L(fwd_write_28bytes):
- movq -28(%eax), %xmm0
- movq %xmm0, -28(%edx)
-L(fwd_write_20bytes):
- movq -20(%eax), %xmm0
- movq %xmm0, -20(%edx)
-L(fwd_write_12bytes):
- movq -12(%eax), %xmm0
- movq %xmm0, -12(%edx)
-L(fwd_write_4bytes):
- movl -4(%eax), %ecx /* Final 4 bytes.  */
- movl %ecx, -4(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Tail chain for 40/32/24/16/8/0 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_40bytes): /* Each label copies 8 bytes with movq and falls through to the next smaller size.  */
- movq -40(%eax), %xmm0
- movq %xmm0, -40(%edx)
-L(fwd_write_32bytes):
- movq -32(%eax), %xmm0
- movq %xmm0, -32(%edx)
-L(fwd_write_24bytes):
- movq -24(%eax), %xmm0
- movq %xmm0, -24(%edx)
-L(fwd_write_16bytes):
- movq -16(%eax), %xmm0
- movq %xmm0, -16(%edx)
-L(fwd_write_8bytes):
- movq -8(%eax), %xmm0
- movq %xmm0, -8(%edx)
-L(fwd_write_0bytes): /* Nothing left to copy: only the return value is set up.  */
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* 5-byte tail: two overlapping 4-byte moves covering bytes -5..-1.  */
-L(fwd_write_5bytes):
- movl -5(%eax), %ecx
- movl -4(%eax), %eax /* Overwrites the source pointer; it is not read again before the return value is set.  */
- movl %ecx, -5(%edx)
- movl %eax, -4(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Tail chain for 45/37/29/21/13 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_45bytes): /* Each label copies 8 bytes with movq and falls through to the next smaller size.  */
- movq -45(%eax), %xmm0
- movq %xmm0, -45(%edx)
-L(fwd_write_37bytes):
- movq -37(%eax), %xmm0
- movq %xmm0, -37(%edx)
-L(fwd_write_29bytes):
- movq -29(%eax), %xmm0
- movq %xmm0, -29(%edx)
-L(fwd_write_21bytes):
- movq -21(%eax), %xmm0
- movq %xmm0, -21(%edx)
-L(fwd_write_13bytes):
- movq -13(%eax), %xmm0
- movq %xmm0, -13(%edx)
- movl -5(%eax), %ecx /* Last 5 bytes: a 4-byte move followed by a 1-byte move.  */
- movl %ecx, -5(%edx)
- movzbl -1(%eax), %ecx
- movb %cl, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Tail chain for 41/33/25/17/9/1 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_41bytes): /* Each label copies 8 bytes with movq and falls through to the next smaller size.  */
- movq -41(%eax), %xmm0
- movq %xmm0, -41(%edx)
-L(fwd_write_33bytes):
- movq -33(%eax), %xmm0
- movq %xmm0, -33(%edx)
-L(fwd_write_25bytes):
- movq -25(%eax), %xmm0
- movq %xmm0, -25(%edx)
-L(fwd_write_17bytes):
- movq -17(%eax), %xmm0
- movq %xmm0, -17(%edx)
-L(fwd_write_9bytes):
- movq -9(%eax), %xmm0
- movq %xmm0, -9(%edx)
-L(fwd_write_1bytes):
- movzbl -1(%eax), %ecx /* Final single byte.  */
- movb %cl, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Tail chain for 46/38/30/22/14/6 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_46bytes): /* Each label copies 8 bytes with movq and falls through to the next smaller size.  */
- movq -46(%eax), %xmm0
- movq %xmm0, -46(%edx)
-L(fwd_write_38bytes):
- movq -38(%eax), %xmm0
- movq %xmm0, -38(%edx)
-L(fwd_write_30bytes):
- movq -30(%eax), %xmm0
- movq %xmm0, -30(%edx)
-L(fwd_write_22bytes):
- movq -22(%eax), %xmm0
- movq %xmm0, -22(%edx)
-L(fwd_write_14bytes):
- movq -14(%eax), %xmm0
- movq %xmm0, -14(%edx)
-L(fwd_write_6bytes):
- movl -6(%eax), %ecx /* Last 6 bytes: a 4-byte move followed by a 2-byte move.  */
- movl %ecx, -6(%edx)
- movzwl -2(%eax), %ecx
- movw %cx, -2(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Tail chain for 42/34/26/18/10/2 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_42bytes): /* Each label copies 8 bytes with movq and falls through to the next smaller size.  */
- movq -42(%eax), %xmm0
- movq %xmm0, -42(%edx)
-L(fwd_write_34bytes):
- movq -34(%eax), %xmm0
- movq %xmm0, -34(%edx)
-L(fwd_write_26bytes):
- movq -26(%eax), %xmm0
- movq %xmm0, -26(%edx)
-L(fwd_write_18bytes):
- movq -18(%eax), %xmm0
- movq %xmm0, -18(%edx)
-L(fwd_write_10bytes):
- movq -10(%eax), %xmm0
- movq %xmm0, -10(%edx)
-L(fwd_write_2bytes):
- movzwl -2(%eax), %ecx /* Final 2 bytes.  */
- movw %cx, -2(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Tail chain for 47/39/31/23/15/7 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_47bytes): /* Each label copies 8 bytes with movq and falls through to the next smaller size.  */
- movq -47(%eax), %xmm0
- movq %xmm0, -47(%edx)
-L(fwd_write_39bytes):
- movq -39(%eax), %xmm0
- movq %xmm0, -39(%edx)
-L(fwd_write_31bytes):
- movq -31(%eax), %xmm0
- movq %xmm0, -31(%edx)
-L(fwd_write_23bytes):
- movq -23(%eax), %xmm0
- movq %xmm0, -23(%edx)
-L(fwd_write_15bytes):
- movq -15(%eax), %xmm0
- movq %xmm0, -15(%edx)
-L(fwd_write_7bytes):
- movl -7(%eax), %ecx /* Last 7 bytes: 4 + 2 + 1.  */
- movl %ecx, -7(%edx)
- movzwl -3(%eax), %ecx
- movzbl -1(%eax), %eax /* Overwrites the source pointer with the last byte; the pointer is not read again.  */
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Tail chain for 43/35/27/19/11/3 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_43bytes): /* Each label copies 8 bytes with movq and falls through to the next smaller size.  */
- movq -43(%eax), %xmm0
- movq %xmm0, -43(%edx)
-L(fwd_write_35bytes):
- movq -35(%eax), %xmm0
- movq %xmm0, -35(%edx)
-L(fwd_write_27bytes):
- movq -27(%eax), %xmm0
- movq %xmm0, -27(%edx)
-L(fwd_write_19bytes):
- movq -19(%eax), %xmm0
- movq %xmm0, -19(%edx)
-L(fwd_write_11bytes):
- movq -11(%eax), %xmm0
- movq %xmm0, -11(%edx)
-L(fwd_write_3bytes):
- movzwl -3(%eax), %ecx /* Last 3 bytes: 2 + 1.  */
- movzbl -1(%eax), %eax /* Overwrites the source pointer with the last byte; the pointer is not read again.  */
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 40/24/8/0 bytes, listed in L(table_48bytes_fwd_align); offsets are negative from the end pointers.  */
-L(fwd_write_40bytes_align): /* movdqa needs (%eax - 40) and (%edx - 40) to be 16-byte aligned.  */
- movdqa -40(%eax), %xmm0
- movdqa %xmm0, -40(%edx)
-L(fwd_write_24bytes_align):
- movdqa -24(%eax), %xmm0
- movdqa %xmm0, -24(%edx)
-L(fwd_write_8bytes_align):
- movq -8(%eax), %xmm0 /* Final 8 bytes.  */
- movq %xmm0, -8(%edx)
-L(fwd_write_0bytes_align):
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 32/16 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_32bytes_align): /* movdqa needs (%eax - 32) and (%edx - 32) to be 16-byte aligned.  */
- movdqa -32(%eax), %xmm0
- movdqa %xmm0, -32(%edx)
-L(fwd_write_16bytes_align):
- movdqa -16(%eax), %xmm0
- movdqa %xmm0, -16(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* 5-byte tail for the aligned table: two overlapping 4-byte moves covering bytes -5..-1.  */
-L(fwd_write_5bytes_align):
- movl -5(%eax), %ecx
- movl -4(%eax), %eax /* Overwrites the source pointer; it is not read again before the return value is set.  */
- movl %ecx, -5(%edx)
- movl %eax, -4(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 45/29/13 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_45bytes_align): /* movdqa needs (%eax - 45) and (%edx - 45) to be 16-byte aligned.  */
- movdqa -45(%eax), %xmm0
- movdqa %xmm0, -45(%edx)
-L(fwd_write_29bytes_align):
- movdqa -29(%eax), %xmm0
- movdqa %xmm0, -29(%edx)
-L(fwd_write_13bytes_align):
- movq -13(%eax), %xmm0 /* Last 13 bytes: 8 + 4 + 1.  */
- movq %xmm0, -13(%edx)
- movl -5(%eax), %ecx
- movl %ecx, -5(%edx)
- movzbl -1(%eax), %ecx
- movb %cl, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 37/21 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_37bytes_align): /* movdqa needs (%eax - 37) and (%edx - 37) to be 16-byte aligned.  */
- movdqa -37(%eax), %xmm0
- movdqa %xmm0, -37(%edx)
-L(fwd_write_21bytes_align):
- movdqa -21(%eax), %xmm0
- movdqa %xmm0, -21(%edx)
- movl -5(%eax), %ecx /* Last 5 bytes: 4 + 1.  */
- movl %ecx, -5(%edx)
- movzbl -1(%eax), %ecx
- movb %cl, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 41/25/9/1 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_41bytes_align): /* movdqa needs (%eax - 41) and (%edx - 41) to be 16-byte aligned.  */
- movdqa -41(%eax), %xmm0
- movdqa %xmm0, -41(%edx)
-L(fwd_write_25bytes_align):
- movdqa -25(%eax), %xmm0
- movdqa %xmm0, -25(%edx)
-L(fwd_write_9bytes_align):
- movq -9(%eax), %xmm0
- movq %xmm0, -9(%edx)
-L(fwd_write_1bytes_align):
- movzbl -1(%eax), %ecx /* Final single byte.  */
- movb %cl, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 33/17 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_33bytes_align): /* movdqa needs (%eax - 33) and (%edx - 33) to be 16-byte aligned.  */
- movdqa -33(%eax), %xmm0
- movdqa %xmm0, -33(%edx)
-L(fwd_write_17bytes_align):
- movdqa -17(%eax), %xmm0
- movdqa %xmm0, -17(%edx)
- movzbl -1(%eax), %ecx /* Final single byte.  */
- movb %cl, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 46/30/14/6 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_46bytes_align): /* movdqa needs (%eax - 46) and (%edx - 46) to be 16-byte aligned.  */
- movdqa -46(%eax), %xmm0
- movdqa %xmm0, -46(%edx)
-L(fwd_write_30bytes_align):
- movdqa -30(%eax), %xmm0
- movdqa %xmm0, -30(%edx)
-L(fwd_write_14bytes_align):
- movq -14(%eax), %xmm0
- movq %xmm0, -14(%edx)
-L(fwd_write_6bytes_align):
- movl -6(%eax), %ecx /* Last 6 bytes: 4 + 2.  */
- movl %ecx, -6(%edx)
- movzwl -2(%eax), %ecx
- movw %cx, -2(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 38/22 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_38bytes_align): /* movdqa needs (%eax - 38) and (%edx - 38) to be 16-byte aligned.  */
- movdqa -38(%eax), %xmm0
- movdqa %xmm0, -38(%edx)
-L(fwd_write_22bytes_align):
- movdqa -22(%eax), %xmm0
- movdqa %xmm0, -22(%edx)
- movl -6(%eax), %ecx /* Last 6 bytes: 4 + 2.  */
- movl %ecx, -6(%edx)
- movzwl -2(%eax), %ecx
- movw %cx, -2(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 42/26/10/2 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_42bytes_align): /* movdqa needs (%eax - 42) and (%edx - 42) to be 16-byte aligned.  */
- movdqa -42(%eax), %xmm0
- movdqa %xmm0, -42(%edx)
-L(fwd_write_26bytes_align):
- movdqa -26(%eax), %xmm0
- movdqa %xmm0, -26(%edx)
-L(fwd_write_10bytes_align):
- movq -10(%eax), %xmm0
- movq %xmm0, -10(%edx)
-L(fwd_write_2bytes_align):
- movzwl -2(%eax), %ecx /* Final 2 bytes.  */
- movw %cx, -2(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 34/18 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_34bytes_align): /* movdqa needs (%eax - 34) and (%edx - 34) to be 16-byte aligned.  */
- movdqa -34(%eax), %xmm0
- movdqa %xmm0, -34(%edx)
-L(fwd_write_18bytes_align):
- movdqa -18(%eax), %xmm0
- movdqa %xmm0, -18(%edx)
- movzwl -2(%eax), %ecx /* Final 2 bytes.  */
- movw %cx, -2(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 47/31/15/7 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_47bytes_align): /* movdqa needs (%eax - 47) and (%edx - 47) to be 16-byte aligned.  */
- movdqa -47(%eax), %xmm0
- movdqa %xmm0, -47(%edx)
-L(fwd_write_31bytes_align):
- movdqa -31(%eax), %xmm0
- movdqa %xmm0, -31(%edx)
-L(fwd_write_15bytes_align):
- movq -15(%eax), %xmm0
- movq %xmm0, -15(%edx)
-L(fwd_write_7bytes_align):
- movl -7(%eax), %ecx /* Last 7 bytes: 4 + 2 + 1.  */
- movl %ecx, -7(%edx)
- movzwl -3(%eax), %ecx
- movzbl -1(%eax), %eax /* Overwrites the source pointer with the last byte; the pointer is not read again.  */
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 39/23 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_39bytes_align): /* movdqa needs (%eax - 39) and (%edx - 39) to be 16-byte aligned.  */
- movdqa -39(%eax), %xmm0
- movdqa %xmm0, -39(%edx)
-L(fwd_write_23bytes_align):
- movdqa -23(%eax), %xmm0
- movdqa %xmm0, -23(%edx)
- movl -7(%eax), %ecx /* Last 7 bytes: 4 + 2 + 1.  */
- movl %ecx, -7(%edx)
- movzwl -3(%eax), %ecx
- movzbl -1(%eax), %eax /* Overwrites the source pointer with the last byte; the pointer is not read again.  */
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 43/27/11/3 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_43bytes_align): /* movdqa needs (%eax - 43) and (%edx - 43) to be 16-byte aligned.  */
- movdqa -43(%eax), %xmm0
- movdqa %xmm0, -43(%edx)
-L(fwd_write_27bytes_align):
- movdqa -27(%eax), %xmm0
- movdqa %xmm0, -27(%edx)
-L(fwd_write_11bytes_align):
- movq -11(%eax), %xmm0
- movq %xmm0, -11(%edx)
-L(fwd_write_3bytes_align):
- movzwl -3(%eax), %ecx /* Last 3 bytes: 2 + 1.  */
- movzbl -1(%eax), %eax /* Overwrites the source pointer with the last byte; the pointer is not read again.  */
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 35/19 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_35bytes_align): /* movdqa needs (%eax - 35) and (%edx - 35) to be 16-byte aligned.  */
- movdqa -35(%eax), %xmm0
- movdqa %xmm0, -35(%edx)
-L(fwd_write_19bytes_align):
- movdqa -19(%eax), %xmm0
- movdqa %xmm0, -19(%edx)
- movzwl -3(%eax), %ecx /* Last 3 bytes: 2 + 1.  */
- movzbl -1(%eax), %eax /* Overwrites the source pointer with the last byte; the pointer is not read again.  */
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 44/28/12/4 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_44bytes_align): /* movdqa needs (%eax - 44) and (%edx - 44) to be 16-byte aligned.  */
- movdqa -44(%eax), %xmm0
- movdqa %xmm0, -44(%edx)
-L(fwd_write_28bytes_align):
- movdqa -28(%eax), %xmm0
- movdqa %xmm0, -28(%edx)
-L(fwd_write_12bytes_align):
- movq -12(%eax), %xmm0
- movq %xmm0, -12(%edx)
-L(fwd_write_4bytes_align):
- movl -4(%eax), %ecx /* Final 4 bytes.  */
- movl %ecx, -4(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Aligned tail chain for 36/20 bytes; offsets are negative from the end pointers %eax/%edx.  */
-L(fwd_write_36bytes_align): /* movdqa needs (%eax - 36) and (%edx - 36) to be 16-byte aligned.  */
- movdqa -36(%eax), %xmm0
- movdqa %xmm0, -36(%edx)
-L(fwd_write_20bytes_align):
- movdqa -20(%eax), %xmm0
- movdqa %xmm0, -20(%edx)
- movl -4(%eax), %ecx /* Final 4 bytes.  */
- movl %ecx, -4(%edx)
-# ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
- movl %edx, %eax /* mempcpy build: return %edx, the address just past the last byte stored.  */
-# else
- movl DEST(%esp), %eax /* Otherwise return the destination argument reloaded from the stack.  */
-# endif
-# endif
- RETURN_END /* Macro defined outside this chunk; differs from RETURN in a way not visible here.  */
-
- CFI_PUSH (%edi) /* Unwind annotations only: L(large_page) is entered with %edi still pushed.  */
- /* L(large_page): forward copy using unaligned loads and non-temporal (movntdq) stores, 128 bytes per loop pass.  NOTE(review): the condition that selects this path is outside this chunk.  */
- .p2align 4
-L(large_page):
- movdqu (%eax), %xmm1 /* First 16 source bytes, unaligned load.  */
-# ifdef USE_AS_MEMMOVE
- movl DEST+4(%esp), %edi /* Destination argument; the +4 presumably skips the pushed %edi -- confirm against PARMS.  */
- movdqu %xmm0, (%edi) /* Unaligned store of %xmm0 to the destination start; %xmm0 is loaded outside this chunk.  */
-# endif
- lea 16(%eax), %eax
- movntdq %xmm1, (%edx) /* movntdq requires a 16-byte aligned destination address.  */
- lea 16(%edx), %edx
- lea -0x90(%ecx), %ecx /* 0x10 for the 16 bytes just stored plus 0x80, which is added back after the loop.  */
- POP (%edi) /* %edi is not used by the rest of this path.  */
-
- .p2align 4
-L(large_page_loop):
- movdqu (%eax), %xmm0 /* Load 0x80 bytes into xmm0..xmm7.  */
- movdqu 0x10(%eax), %xmm1
- movdqu 0x20(%eax), %xmm2
- movdqu 0x30(%eax), %xmm3
- movdqu 0x40(%eax), %xmm4
- movdqu 0x50(%eax), %xmm5
- movdqu 0x60(%eax), %xmm6
- movdqu 0x70(%eax), %xmm7
- lea 0x80(%eax), %eax
-
- sub $0x80, %ecx /* These flags reach the jae below: movntdq and lea do not write EFLAGS.  */
- movntdq %xmm0, (%edx)
- movntdq %xmm1, 0x10(%edx)
- movntdq %xmm2, 0x20(%edx)
- movntdq %xmm3, 0x30(%edx)
- movntdq %xmm4, 0x40(%edx)
- movntdq %xmm5, 0x50(%edx)
- movntdq %xmm6, 0x60(%edx)
- movntdq %xmm7, 0x70(%edx)
- lea 0x80(%edx), %edx
- jae L(large_page_loop) /* Continue while the sub did not borrow.  */
- cmp $-0x40, %ecx /* Compare before un-biasing; the lea below keeps these flags for the jl.  */
- lea 0x80(%ecx), %ecx /* Add back the 0x80 subtracted before the loop.  */
- jl L(large_page_less_64bytes) /* Pre-lea %ecx < -0x40 (signed), i.e. fewer than 0x40 bytes left.  */
-
- movdqu (%eax), %xmm0 /* One 64-byte step.  */
- movdqu 0x10(%eax), %xmm1
- movdqu 0x20(%eax), %xmm2
- movdqu 0x30(%eax), %xmm3
- lea 0x40(%eax), %eax
-
- movntdq %xmm0, (%edx)
- movntdq %xmm1, 0x10(%edx)
- movntdq %xmm2, 0x20(%edx)
- movntdq %xmm3, 0x30(%edx)
- lea 0x40(%edx), %edx
- sub $0x40, %ecx
-L(large_page_less_64bytes):
- cmp $32, %ecx
- jb L(large_page_less_32bytes) /* Fewer than 32 bytes left (unsigned): skip the 32-byte step.  */
- movdqu (%eax), %xmm0
- movdqu 0x10(%eax), %xmm1
- lea 0x20(%eax), %eax
- movntdq %xmm0, (%edx)
- movntdq %xmm1, 0x10(%edx)
- lea 0x20(%edx), %edx
- sub $0x20, %ecx
-L(large_page_less_32bytes):
- add %ecx, %edx /* Move both pointers to the end; the tail writers use negative offsets.  */
- add %ecx, %eax
- sfence /* Order the non-temporal stores before the ordinary stores that follow.  */
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* Macro defined outside this chunk; presumably dispatches on %ecx through the table.  */
-
- .p2align 4 /* Backward tail chain for 44/36/28/20/12/4/0 bytes; %eax and %edx point at the start of the remaining bytes, hence the non-negative offsets.  */
-L(bk_write_44bytes): /* Entering at L(bk_write_Nbytes) copies N bytes from the highest address downwards, 8 per movq step.  */
- movq 36(%eax), %xmm0
- movq %xmm0, 36(%edx)
-L(bk_write_36bytes):
- movq 28(%eax), %xmm0
- movq %xmm0, 28(%edx)
-L(bk_write_28bytes):
- movq 20(%eax), %xmm0
- movq %xmm0, 20(%edx)
-L(bk_write_20bytes):
- movq 12(%eax), %xmm0
- movq %xmm0, 12(%edx)
-L(bk_write_12bytes):
- movq 4(%eax), %xmm0
- movq %xmm0, 4(%edx)
-L(bk_write_4bytes):
- movl (%eax), %ecx /* Lowest 4 bytes.  */
- movl %ecx, (%edx)
-L(bk_write_0bytes):
-# ifndef USE_AS_BCOPY
- movl DEST(%esp), %eax /* Return value: the destination argument ...  */
-# ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx /* ... plus the length argument in the mempcpy build.  */
- add %ecx, %eax
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Backward tail chain for 40/32/24/16/8 bytes; offsets are non-negative from the start pointers %eax/%edx.  */
-L(bk_write_40bytes): /* Each label copies 8 bytes with movq, highest address first, and falls through.  */
- movq 32(%eax), %xmm0
- movq %xmm0, 32(%edx)
-L(bk_write_32bytes):
- movq 24(%eax), %xmm0
- movq %xmm0, 24(%edx)
-L(bk_write_24bytes):
- movq 16(%eax), %xmm0
- movq %xmm0, 16(%edx)
-L(bk_write_16bytes):
- movq 8(%eax), %xmm0
- movq %xmm0, 8(%edx)
-L(bk_write_8bytes):
- movq (%eax), %xmm0
- movq %xmm0, (%edx)
-# ifndef USE_AS_BCOPY
- movl DEST(%esp), %eax /* Return value: the destination argument ...  */
-# ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx /* ... plus the length argument in the mempcpy build.  */
- add %ecx, %eax
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Backward tail chain for 45/37/29/21/13/5/1 bytes; offsets are non-negative from the start pointers %eax/%edx.  */
-L(bk_write_45bytes): /* Each label copies 8 bytes with movq, highest address first, and falls through.  */
- movq 37(%eax), %xmm0
- movq %xmm0, 37(%edx)
-L(bk_write_37bytes):
- movq 29(%eax), %xmm0
- movq %xmm0, 29(%edx)
-L(bk_write_29bytes):
- movq 21(%eax), %xmm0
- movq %xmm0, 21(%edx)
-L(bk_write_21bytes):
- movq 13(%eax), %xmm0
- movq %xmm0, 13(%edx)
-L(bk_write_13bytes):
- movq 5(%eax), %xmm0
- movq %xmm0, 5(%edx)
-L(bk_write_5bytes):
- movl 1(%eax), %ecx /* Bytes 1..4.  */
- movl %ecx, 1(%edx)
-L(bk_write_1bytes):
- movzbl (%eax), %ecx /* Byte 0.  */
- movb %cl, (%edx)
-# ifndef USE_AS_BCOPY
- movl DEST(%esp), %eax /* Return value: the destination argument ...  */
-# ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx /* ... plus the length argument in the mempcpy build.  */
- add %ecx, %eax
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Backward tail chain for 41/33/25/17/9 bytes; offsets are non-negative from the start pointers %eax/%edx.  */
-L(bk_write_41bytes): /* Each label copies 8 bytes with movq, highest address first, and falls through.  */
- movq 33(%eax), %xmm0
- movq %xmm0, 33(%edx)
-L(bk_write_33bytes):
- movq 25(%eax), %xmm0
- movq %xmm0, 25(%edx)
-L(bk_write_25bytes):
- movq 17(%eax), %xmm0
- movq %xmm0, 17(%edx)
-L(bk_write_17bytes):
- movq 9(%eax), %xmm0
- movq %xmm0, 9(%edx)
-L(bk_write_9bytes):
- movq 1(%eax), %xmm0
- movq %xmm0, 1(%edx)
- movzbl (%eax), %ecx /* Byte 0.  */
- movb %cl, (%edx)
-# ifndef USE_AS_BCOPY
- movl DEST(%esp), %eax /* Return value: the destination argument ...  */
-# ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx /* ... plus the length argument in the mempcpy build.  */
- add %ecx, %eax
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Backward tail chain for 46/38/30/22/14/6 bytes; offsets are non-negative from the start pointers %eax/%edx.  */
-L(bk_write_46bytes): /* Each label copies 8 bytes with movq, highest address first, and falls through.  */
- movq 38(%eax), %xmm0
- movq %xmm0, 38(%edx)
-L(bk_write_38bytes):
- movq 30(%eax), %xmm0
- movq %xmm0, 30(%edx)
-L(bk_write_30bytes):
- movq 22(%eax), %xmm0
- movq %xmm0, 22(%edx)
-L(bk_write_22bytes):
- movq 14(%eax), %xmm0
- movq %xmm0, 14(%edx)
-L(bk_write_14bytes):
- movq 6(%eax), %xmm0
- movq %xmm0, 6(%edx)
-L(bk_write_6bytes):
- movl 2(%eax), %ecx /* Bytes 2..5, then bytes 0..1.  */
- movl %ecx, 2(%edx)
- movzwl (%eax), %ecx
- movw %cx, (%edx)
-# ifndef USE_AS_BCOPY
- movl DEST(%esp), %eax /* Return value: the destination argument ...  */
-# ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx /* ... plus the length argument in the mempcpy build.  */
- add %ecx, %eax
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Backward tail chain for 42/34/26/18/10/2 bytes; offsets are non-negative from the start pointers %eax/%edx.  */
-L(bk_write_42bytes): /* Each label copies 8 bytes with movq, highest address first, and falls through.  */
- movq 34(%eax), %xmm0
- movq %xmm0, 34(%edx)
-L(bk_write_34bytes):
- movq 26(%eax), %xmm0
- movq %xmm0, 26(%edx)
-L(bk_write_26bytes):
- movq 18(%eax), %xmm0
- movq %xmm0, 18(%edx)
-L(bk_write_18bytes):
- movq 10(%eax), %xmm0
- movq %xmm0, 10(%edx)
-L(bk_write_10bytes):
- movq 2(%eax), %xmm0
- movq %xmm0, 2(%edx)
-L(bk_write_2bytes):
- movzwl (%eax), %ecx /* Bytes 0..1.  */
- movw %cx, (%edx)
-# ifndef USE_AS_BCOPY
- movl DEST(%esp), %eax /* Return value: the destination argument ...  */
-# ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx /* ... plus the length argument in the mempcpy build.  */
- add %ecx, %eax
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Backward tail chain for 47/39/31/23/15/7 bytes; offsets are non-negative from the start pointers %eax/%edx.  */
-L(bk_write_47bytes): /* Each label copies 8 bytes with movq, highest address first, and falls through.  */
- movq 39(%eax), %xmm0
- movq %xmm0, 39(%edx)
-L(bk_write_39bytes):
- movq 31(%eax), %xmm0
- movq %xmm0, 31(%edx)
-L(bk_write_31bytes):
- movq 23(%eax), %xmm0
- movq %xmm0, 23(%edx)
-L(bk_write_23bytes):
- movq 15(%eax), %xmm0
- movq %xmm0, 15(%edx)
-L(bk_write_15bytes):
- movq 7(%eax), %xmm0
- movq %xmm0, 7(%edx)
-L(bk_write_7bytes):
- movl 3(%eax), %ecx /* Bytes 3..6, then 1..2, then 0.  */
- movl %ecx, 3(%edx)
- movzwl 1(%eax), %ecx
- movw %cx, 1(%edx)
- movzbl (%eax), %eax /* Overwrites the source pointer with byte 0; the pointer is not read again.  */
- movb %al, (%edx)
-# ifndef USE_AS_BCOPY
- movl DEST(%esp), %eax /* Return value: the destination argument ...  */
-# ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx /* ... plus the length argument in the mempcpy build.  */
- add %ecx, %eax
-# endif
-# endif
- RETURN /* Macro defined outside this chunk.  */
-
- .p2align 4 /* Backward tail chain for 43/35/27/19/11/3 bytes; offsets are non-negative from the start pointers %eax/%edx.  */
-L(bk_write_43bytes): /* Each label copies 8 bytes with movq, highest address first, and falls through.  */
- movq 35(%eax), %xmm0
- movq %xmm0, 35(%edx)
-L(bk_write_35bytes):
- movq 27(%eax), %xmm0
- movq %xmm0, 27(%edx)
-L(bk_write_27bytes):
- movq 19(%eax), %xmm0
- movq %xmm0, 19(%edx)
-L(bk_write_19bytes):
- movq 11(%eax), %xmm0
- movq %xmm0, 11(%edx)
-L(bk_write_11bytes):
- movq 3(%eax), %xmm0
- movq %xmm0, 3(%edx)
-L(bk_write_3bytes):
- movzwl 1(%eax), %ecx /* Bytes 1..2, then byte 0.  */
- movw %cx, 1(%edx)
- movzbl (%eax), %eax /* Overwrites the source pointer with byte 0; the pointer is not read again.  */
- movb %al, (%edx)
-# ifndef USE_AS_BCOPY
- movl DEST(%esp), %eax /* Return value: the destination argument ...  */
-# ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx /* ... plus the length argument in the mempcpy build.  */
- add %ecx, %eax
-# endif
-# endif
- RETURN_END /* Macro defined outside this chunk; differs from RETURN in a way not visible here.  */
-
-
- .pushsection .rodata.ssse3,"a",@progbits /* Jump tables live in a read-only data section; the matching .popsection is past the end of this chunk.  */
- .p2align 2 /* 4-byte alignment for the .int entries.  */
-L(table_48bytes_fwd): /* Entry N selects L(fwd_write_Nbytes), N = 0..47; used by BRANCH_TO_JMPTBL_ENTRY with scale 4.  JMPTBL is defined outside this chunk (presumably target minus table base -- confirm).  */
- .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
-
- .p2align 2 /* Align the table to a 4-byte boundary. */
-L(table_48bytes_fwd_align): /* Jump table with 48 entries: entry N names the
-   handler L(fwd_write_Nbytes_align) for N = 0..47.  Every entry is built with
-   JMPTBL against this table's own label.  NOTE(review): JMPTBL and the code
-   that indexes this table are outside this section; presumably N is the
-   number of tail bytes left to copy -- confirm at the dispatch site. */
- .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
-
- .p2align 2 /* Align the table to a 4-byte boundary. */
-L(shl_table): /* Jump table with 16 entries: entry N names the handler
-   L(shl_N) for N = 0..15, each built with JMPTBL against this table's own
-   label.  NOTE(review): the dispatch site is outside this section;
-   presumably N is a 0..15 byte misalignment relative to a 16-byte
-   boundary -- confirm where the table is indexed. */
- .int JMPTBL (L(shl_0), L(shl_table))
- .int JMPTBL (L(shl_1), L(shl_table))
- .int JMPTBL (L(shl_2), L(shl_table))
- .int JMPTBL (L(shl_3), L(shl_table))
- .int JMPTBL (L(shl_4), L(shl_table))
- .int JMPTBL (L(shl_5), L(shl_table))
- .int JMPTBL (L(shl_6), L(shl_table))
- .int JMPTBL (L(shl_7), L(shl_table))
- .int JMPTBL (L(shl_8), L(shl_table))
- .int JMPTBL (L(shl_9), L(shl_table))
- .int JMPTBL (L(shl_10), L(shl_table))
- .int JMPTBL (L(shl_11), L(shl_table))
- .int JMPTBL (L(shl_12), L(shl_table))
- .int JMPTBL (L(shl_13), L(shl_table))
- .int JMPTBL (L(shl_14), L(shl_table))
- .int JMPTBL (L(shl_15), L(shl_table))
-
- .p2align 2 /* Align the table to a 4-byte boundary. */
-L(table_48_bytes_bwd): /* Jump table with 48 entries: entry N names the
-   handler L(bk_write_Nbytes) for N = 0..47, each built with JMPTBL against
-   this table's own label.  The backward-copy code under USE_AS_MEMMOVE
-   dispatches through it with BRANCH_TO_JMPTBL_ENTRY, using the byte count
-   in ECX as the index and a scale of 4.  NOTE(review): both macros are
-   defined outside this section -- confirm the entry encoding there. */
- .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
-
- .popsection
-
-# ifdef USE_AS_MEMMOVE
- .p2align 4
-L(copy_backward): /* Copy ECX bytes starting at the highest address and
-   working downwards.  The code below loads through EDI and stores through
-   EDX, so on entry EAX holds the source start, EDX the destination start
-   and ECX the byte count.  NOTE(review): presumably selected for an
-   overlapping memmove whose destination lies above the source -- confirm
-   at the branch that targets this label, which is outside this section. */
- PUSH (%edi) /* EDI becomes the source cursor; restored by the POP below. */
- movl %eax, %edi
- lea (%ecx,%edx,1),%edx /* EDX = one past the end of the destination. */
- lea (%ecx,%edi,1),%edi /* EDI = one past the end of the source. */
- testl $0x3, %edx
- jnz L(bk_align) /* Destination end is not 4-byte aligned. */
-
-L(bk_aligned_4): /* The destination end in EDX is 4-byte aligned here. */
- cmp $64, %ecx
- jae L(bk_write_more64bytes)
-
-L(bk_write_64bytesless): /* Fewer than 64 bytes remain. */
- cmp $32, %ecx
- jb L(bk_write_less32bytes)
-
-L(bk_write_more32bytes):
- /* Copy one 32-byte chunk as four 8-byte moves, highest address first. */
- sub $32, %ecx
- movq -8(%edi), %xmm0
- movq %xmm0, -8(%edx)
- movq -16(%edi), %xmm0
- movq %xmm0, -16(%edx)
- movq -24(%edi), %xmm0
- movq %xmm0, -24(%edx)
- movq -32(%edi), %xmm0
- movq %xmm0, -32(%edx)
- sub $32, %edx /* Move both end cursors below the chunk just copied. */
- sub $32, %edi
-
-L(bk_write_less32bytes): /* Turn the end cursors back into start pointers
-   for the ECX bytes that remain: EAX = source, EDX = destination. */
- movl %edi, %eax
- sub %ecx, %edx
- sub %ecx, %eax
- POP (%edi)
-L(bk_write_less32bytes_2):
- BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) /* Dispatch on the
-   remaining byte count in ECX through the backward tail table; the macro is
-   defined outside this section. */
-
- CFI_PUSH (%edi) /* Unwind annotation only, no instruction is emitted: the
-   code that follows is entered from branches taken before the POP above,
-   so EDI is still saved on the stack there. */
-
- .p2align 4
-L(bk_align): /* The destination end in EDX is not 4-byte aligned.  Copy one
-   byte and/or two bytes from the top so that it becomes 4-byte aligned,
-   then rejoin L(bk_aligned_4). */
- cmp $8, %ecx
- jbe L(bk_write_less32bytes) /* 8 bytes or fewer: skip the alignment step. */
- testl $1, %edx
- /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
-    then (EDX & 2) must be != 0. */
- jz L(bk_got2)
- sub $1, %edi /* Step source cursor, count and destination cursor by one. */
- sub $1, %ecx
- sub $1, %edx
- movzbl (%edi), %eax /* EAX is scratch here; its entry value was moved to EDI. */
- movb %al, (%edx)
-
- testl $2, %edx
- jz L(bk_aligned_4) /* Already 4-byte aligned after the single byte. */
-
-L(bk_got2): /* Copy two bytes; EDX is 4-byte aligned afterwards. */
- sub $2, %edi
- sub $2, %ecx
- sub $2, %edx
- movzwl (%edi), %eax
- movw %ax, (%edx)
- jmp L(bk_aligned_4)
-
- .p2align 4
-L(bk_write_more64bytes): /* At least 64 bytes remain and the destination end
-   in EDX is 4-byte aligned (entered from L(bk_aligned_4)).  Bring EDX to a
-   16-byte boundary, then copy 64-byte blocks downwards. */
- /* Check alignment of last byte. */
- testl $15, %edx
- jz L(bk_ssse3_cpy_pre)
-
-/* EDX is aligned 4 bytes, but not 16 bytes. */
-L(bk_ssse3_align): /* Copy 4 bytes at a time, at most three times; a 4-byte
-   aligned address is at most 12 bytes above a 16-byte boundary, so the
-   third step needs no further test. */
- sub $4, %edi
- sub $4, %ecx
- sub $4, %edx
- movl (%edi), %eax
- movl %eax, (%edx)
-
- testl $15, %edx
- jz L(bk_ssse3_cpy_pre)
-
- sub $4, %edi
- sub $4, %ecx
- sub $4, %edx
- movl (%edi), %eax
- movl %eax, (%edx)
-
- testl $15, %edx
- jz L(bk_ssse3_cpy_pre)
-
- sub $4, %edi
- sub $4, %ecx
- sub $4, %edx
- movl (%edi), %eax
- movl %eax, (%edx)
-
-L(bk_ssse3_cpy_pre): /* EDX is 16-byte aligned.  Up to 12 bytes were consumed
-   above, so fewer than 64 may remain; along the paths visible in this
-   section at least 52 do, which is enough for the 32-byte chunk copy. */
- cmp $64, %ecx
- jb L(bk_write_more32bytes)
-
- .p2align 4
-L(bk_ssse3_cpy): /* Main loop: 64 bytes per iteration, highest 16-byte chunk
-   first.  Loads use movdqu because the source alignment is unknown; stores
-   use movdqa, relying on EDX being 16-byte aligned. */
- sub $64, %edi
- sub $64, %ecx
- sub $64, %edx
- movdqu 0x30(%edi), %xmm3
- movdqa %xmm3, 0x30(%edx)
- movdqu 0x20(%edi), %xmm2
- movdqa %xmm2, 0x20(%edx)
- movdqu 0x10(%edi), %xmm1
- movdqa %xmm1, 0x10(%edx)
- movdqu (%edi), %xmm0
- movdqa %xmm0, (%edx)
- cmp $64, %ecx
- jae L(bk_ssse3_cpy) /* Repeat while a full 64-byte block remains. */
- jmp L(bk_write_64bytesless) /* Fewer than 64 bytes left: finish with the tail code. */
-
-# endif
-
-END (MEMCPY)
-
-#endif