diff options
Diffstat (limited to 'sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S | 376 |
1 file changed, 376 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S new file mode 100644 index 0000000000..3cac1e33cd --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S @@ -0,0 +1,376 @@ +/* memcpy with AVX + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#if !defined NOT_IN_libc \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" +#ifndef MEMCPY +# define MEMCPY __memcpy_avx_unaligned +# define MEMCPY_CHK __memcpy_chk_avx_unaligned +#endif + + .section .text.avx,"ax",@progbits +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %rdi, %rax +#ifdef USE_AS_MEMPCPY + add %rdx, %rax +#endif + cmp $256, %rdx + jae L(256bytesormore) + cmp $16, %dl + jb L(less_16bytes) + cmp $128, %dl + jb L(less_128bytes) + vmovdqu (%rsi), %xmm0 + lea (%rsi, %rdx), %rcx + vmovdqu 0x10(%rsi), %xmm1 + vmovdqu 0x20(%rsi), %xmm2 + vmovdqu 0x30(%rsi), %xmm3 + vmovdqu 0x40(%rsi), %xmm4 + vmovdqu 0x50(%rsi), %xmm5 + vmovdqu 0x60(%rsi), %xmm6 + vmovdqu 0x70(%rsi), %xmm7 + vmovdqu -0x80(%rcx), %xmm8 + vmovdqu -0x70(%rcx), %xmm9 + vmovdqu -0x60(%rcx), %xmm10 + vmovdqu -0x50(%rcx), %xmm11 + vmovdqu -0x40(%rcx), %xmm12 + vmovdqu -0x30(%rcx), %xmm13 + vmovdqu -0x20(%rcx), %xmm14 + vmovdqu -0x10(%rcx), %xmm15 + lea (%rdi, %rdx), %rdx + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, 0x10(%rdi) + vmovdqu %xmm2, 0x20(%rdi) + vmovdqu %xmm3, 0x30(%rdi) + vmovdqu %xmm4, 0x40(%rdi) + vmovdqu %xmm5, 0x50(%rdi) + vmovdqu %xmm6, 0x60(%rdi) + vmovdqu %xmm7, 0x70(%rdi) + vmovdqu %xmm8, -0x80(%rdx) + vmovdqu %xmm9, -0x70(%rdx) + vmovdqu %xmm10, -0x60(%rdx) + vmovdqu %xmm11, -0x50(%rdx) + vmovdqu %xmm12, -0x40(%rdx) + vmovdqu %xmm13, -0x30(%rdx) + vmovdqu %xmm14, -0x20(%rdx) + vmovdqu %xmm15, -0x10(%rdx) + ret + .p2align 4 +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %xmm0 + lea (%rsi, %rdx), %rcx + vmovdqu 0x10(%rsi), %xmm1 + vmovdqu 0x20(%rsi), %xmm2 + lea (%rdi, %rdx), %rdx + vmovdqu 0x30(%rsi), %xmm3 + vmovdqu -0x40(%rcx), %xmm4 + vmovdqu -0x30(%rcx), %xmm5 + vmovdqu -0x20(%rcx), %xmm6 + vmovdqu -0x10(%rcx), %xmm7 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, 
0x10(%rdi) + vmovdqu %xmm2, 0x20(%rdi) + vmovdqu %xmm3, 0x30(%rdi) + vmovdqu %xmm4, -0x40(%rdx) + vmovdqu %xmm5, -0x30(%rdx) + vmovdqu %xmm6, -0x20(%rdx) + vmovdqu %xmm7, -0x10(%rdx) + ret + + .p2align 4 +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %xmm0 + vmovdqu 0x10(%rsi), %xmm1 + vmovdqu -0x20(%rsi, %rdx), %xmm6 + vmovdqu -0x10(%rsi, %rdx), %xmm7 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, 0x10(%rdi) + vmovdqu %xmm6, -0x20(%rdi, %rdx) + vmovdqu %xmm7, -0x10(%rdi, %rdx) + ret + + .p2align 4 +L(less_32bytes): + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rsi, %rdx), %xmm7 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm7, -0x10(%rdi, %rdx) + ret + + .p2align 4 +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + movq -0x08(%rsi, %rdx), %rcx + movq (%rsi), %rsi + movq %rsi, (%rdi) + movq %rcx, -0x08(%rdi, %rdx) + ret + + .p2align 4 +L(less_8bytes): + cmp $4, %dl + jb L(less_4bytes) + mov -0x04(%rsi, %rdx), %ecx + mov (%rsi), %esi + mov %esi, (%rdi) + mov %ecx, -0x04(%rdi, %rdx) + ret + +L(less_4bytes): + cmp $1, %dl + jbe L(less_2bytes) + mov -0x02(%rsi, %rdx), %cx + mov (%rsi), %si + mov %si, (%rdi) + mov %cx, -0x02(%rdi, %rdx) + ret + +L(less_2bytes): + jb L(less_0bytes) + mov (%rsi), %cl + mov %cl, (%rdi) +L(less_0bytes): + ret + + .p2align 4 +L(256bytesormore): +#ifdef USE_AS_MEMMOVE + mov %rdi, %rcx + sub %rsi, %rcx + cmp %rdx, %rcx + jc L(copy_backward) +#endif + cmp $2048, %rdx + jae L(gobble_data_movsb) + mov %rax, %r8 + lea (%rsi, %rdx), %rcx + mov %rdi, %r10 + vmovdqu -0x80(%rcx), %xmm5 + vmovdqu -0x70(%rcx), %xmm6 + mov $0x80, %rax + and $-32, %rdi + add $32, %rdi + vmovdqu -0x60(%rcx), %xmm7 + vmovdqu -0x50(%rcx), %xmm8 + mov %rdi, %r11 + sub %r10, %r11 + vmovdqu -0x40(%rcx), %xmm9 + vmovdqu -0x30(%rcx), %xmm10 + sub %r11, %rdx + vmovdqu -0x20(%rcx), %xmm11 + vmovdqu -0x10(%rcx), %xmm12 + vmovdqu (%rsi), %ymm4 + add %r11, %rsi + sub %eax, %edx +L(goble_128_loop): + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu 0x40(%rsi), %ymm2 
+ vmovdqu 0x60(%rsi), %ymm3 + add %rax, %rsi + vmovdqa %ymm0, (%rdi) + vmovdqa %ymm1, 0x20(%rdi) + vmovdqa %ymm2, 0x40(%rdi) + vmovdqa %ymm3, 0x60(%rdi) + add %rax, %rdi + sub %eax, %edx + jae L(goble_128_loop) + add %eax, %edx + add %rdi, %rdx + vmovdqu %ymm4, (%r10) + vzeroupper + vmovdqu %xmm5, -0x80(%rdx) + vmovdqu %xmm6, -0x70(%rdx) + vmovdqu %xmm7, -0x60(%rdx) + vmovdqu %xmm8, -0x50(%rdx) + vmovdqu %xmm9, -0x40(%rdx) + vmovdqu %xmm10, -0x30(%rdx) + vmovdqu %xmm11, -0x20(%rdx) + vmovdqu %xmm12, -0x10(%rdx) + mov %r8, %rax + ret + + .p2align 4 +L(gobble_data_movsb): +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %rcx +#else + mov __x86_shared_cache_size_half(%rip), %rcx +#endif + shl $3, %rcx + cmp %rcx, %rdx + jae L(gobble_big_data_fwd) + mov %rdx, %rcx + mov %rdx, %rcx + rep movsb + ret + + .p2align 4 +L(gobble_big_data_fwd): + lea (%rsi, %rdx), %rcx + vmovdqu (%rsi), %ymm4 + vmovdqu -0x80(%rsi,%rdx), %xmm5 + vmovdqu -0x70(%rcx), %xmm6 + vmovdqu -0x60(%rcx), %xmm7 + vmovdqu -0x50(%rcx), %xmm8 + vmovdqu -0x40(%rcx), %xmm9 + vmovdqu -0x30(%rcx), %xmm10 + vmovdqu -0x20(%rcx), %xmm11 + vmovdqu -0x10(%rcx), %xmm12 + mov %rdi, %r8 + and $-32, %rdi + add $32, %rdi + mov %rdi, %r10 + sub %r8, %r10 + sub %r10, %rdx + add %r10, %rsi + lea (%rdi, %rdx), %rcx + add $-0x80, %rdx +L(gobble_mem_fwd_loop): + prefetchnta 0x1c0(%rsi) + prefetchnta 0x280(%rsi) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu 0x40(%rsi), %ymm2 + vmovdqu 0x60(%rsi), %ymm3 + sub $-0x80, %rsi + vmovntdq %ymm0, (%rdi) + vmovntdq %ymm1, 0x20(%rdi) + vmovntdq %ymm2, 0x40(%rdi) + vmovntdq %ymm3, 0x60(%rdi) + sub $-0x80, %rdi + add $-0x80, %rdx + jb L(gobble_mem_fwd_loop) + sfence + vmovdqu %ymm4, (%r8) + vzeroupper + vmovdqu %xmm5, -0x80(%rcx) + vmovdqu %xmm6, -0x70(%rcx) + vmovdqu %xmm7, -0x60(%rcx) + vmovdqu %xmm8, -0x50(%rcx) + vmovdqu %xmm9, -0x40(%rcx) + vmovdqu %xmm10, -0x30(%rcx) + vmovdqu %xmm11, -0x20(%rcx) + vmovdqu %xmm12, -0x10(%rcx) + ret + +#ifdef 
USE_AS_MEMMOVE + .p2align 4 +L(copy_backward): +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %rcx +#else + mov __x86_shared_cache_size_half(%rip), %rcx +#endif + shl $3, %rcx + vmovdqu (%rsi), %xmm5 + vmovdqu 0x10(%rsi), %xmm6 + add %rdx, %rdi + vmovdqu 0x20(%rsi), %xmm7 + vmovdqu 0x30(%rsi), %xmm8 + lea -0x20(%rdi), %r10 + mov %rdi, %r11 + vmovdqu 0x40(%rsi), %xmm9 + vmovdqu 0x50(%rsi), %xmm10 + and $0x1f, %r11 + vmovdqu 0x60(%rsi), %xmm11 + vmovdqu 0x70(%rsi), %xmm12 + xor %r11, %rdi + add %rdx, %rsi + vmovdqu -0x20(%rsi), %ymm4 + sub %r11, %rsi + sub %r11, %rdx + cmp %rcx, %rdx + ja L(gobble_big_data_bwd) + add $-0x80, %rdx +L(gobble_mem_bwd_llc): + vmovdqu -0x20(%rsi), %ymm0 + vmovdqu -0x40(%rsi), %ymm1 + vmovdqu -0x60(%rsi), %ymm2 + vmovdqu -0x80(%rsi), %ymm3 + lea -0x80(%rsi), %rsi + vmovdqa %ymm0, -0x20(%rdi) + vmovdqa %ymm1, -0x40(%rdi) + vmovdqa %ymm2, -0x60(%rdi) + vmovdqa %ymm3, -0x80(%rdi) + lea -0x80(%rdi), %rdi + add $-0x80, %rdx + jb L(gobble_mem_bwd_llc) + vmovdqu %ymm4, (%r10) + vzeroupper + vmovdqu %xmm5, (%rax) + vmovdqu %xmm6, 0x10(%rax) + vmovdqu %xmm7, 0x20(%rax) + vmovdqu %xmm8, 0x30(%rax) + vmovdqu %xmm9, 0x40(%rax) + vmovdqu %xmm10, 0x50(%rax) + vmovdqu %xmm11, 0x60(%rax) + vmovdqu %xmm12, 0x70(%rax) + ret + + .p2align 4 +L(gobble_big_data_bwd): + add $-0x80, %rdx +L(gobble_mem_bwd_loop): + prefetchnta -0x1c0(%rsi) + prefetchnta -0x280(%rsi) + vmovdqu -0x20(%rsi), %ymm0 + vmovdqu -0x40(%rsi), %ymm1 + vmovdqu -0x60(%rsi), %ymm2 + vmovdqu -0x80(%rsi), %ymm3 + lea -0x80(%rsi), %rsi + vmovntdq %ymm0, -0x20(%rdi) + vmovntdq %ymm1, -0x40(%rdi) + vmovntdq %ymm2, -0x60(%rdi) + vmovntdq %ymm3, -0x80(%rdi) + lea -0x80(%rdi), %rdi + add $-0x80, %rdx + jb L(gobble_mem_bwd_loop) + sfence + vmovdqu %ymm4, (%r10) + vzeroupper + vmovdqu %xmm5, (%rax) + vmovdqu %xmm6, 0x10(%rax) + vmovdqu %xmm7, 0x20(%rax) + vmovdqu %xmm8, 0x30(%rax) + vmovdqu %xmm9, 0x40(%rax) + vmovdqu %xmm10, 0x50(%rax) + vmovdqu %xmm11, 0x60(%rax) + vmovdqu %xmm12, 
0x70(%rax) + ret +#endif +END (MEMCPY) +#endif |