about summary refs log tree commit diff
path: root/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/x86_64/fpu/svml_d_wrapper_impl.h')
-rw-r--r-- sysdeps/x86_64/fpu/svml_d_wrapper_impl.h | 202
1 file changed, 78 insertions, 124 deletions
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index bd93b8edfa..5c0ff897c0 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -194,39 +194,39 @@
/* AVX512 ISA version as wrapper to AVX2 ISA version. */
.macro WRAPPER_IMPL_AVX512 callee
- pushq %rbp
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
+ movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x04
- .byte 0x24
- call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x20
- call HIDDEN_JUMPTARGET(\callee)
- movq %rbp, %rsp
+ andq $-64, %rsp
+ subq $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+ vmovupd (%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 64(%rsp)
+ vmovupd 32(%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x01
+ movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
- popq %rbp
+ popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
@@ -234,61 +234,50 @@
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
.macro WRAPPER_IMPL_AVX512_ff callee
- pushq %rbp
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
+ movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x4c
- .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x4c
- .byte 0x24
- .byte 0x40
- call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x4c
- .byte 0x24
- .byte 0x60
- call HIDDEN_JUMPTARGET(\callee)
- movq %rbp, %rsp
+ andq $-64, %rsp
+ subq $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x4c
+ .byte 0x24
+ .byte 0x01
+ vmovupd (%rsp), %ymm0
+ vmovupd 64(%rsp), %ymm1
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 128(%rsp)
+ vmovupd 32(%rsp), %ymm0
+ vmovupd 96(%rsp), %ymm1
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x02
+ movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
- popq %rbp
+ popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
@@ -310,61 +299,26 @@
cfi_rel_offset (%r13, 0)
subq $176, %rsp
movq %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
+/* Below is encoding for vmovups %zmm0, (%rsp). */
.byte 0x62
.byte 0xf1
.byte 0x7c
.byte 0x48
- .byte 0x29
+ .byte 0x11
.byte 0x04
.byte 0x24
movq %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x04
- .byte 0x24
+ vmovupd (%rsp), %ymm0
call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x20
+ vmovupd 32(%rsp), %ymm0
lea 64(%rsp), %rdi
lea 96(%rsp), %rsi
call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x40
-/* Below is encoding for vmovapd 96(%rsp), %ymm1. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x4c
- .byte 0x24
- .byte 0x60
-/* Below is encoding for vmovapd %ymm0, 32(%r12). */
- .byte 0xc4
- .byte 0xc1
- .byte 0x7d
- .byte 0x29
- .byte 0x44
- .byte 0x24
- .byte 0x20
-/* Below is encoding for vmovapd %ymm1, 32(%r13). */
- .byte 0xc4
- .byte 0xc1
- .byte 0x7d
- .byte 0x29
- .byte 0x4d
- .byte 0x20
+ vmovupd 64(%rsp), %ymm0
+ vmovupd 96(%rsp), %ymm1
+ vmovupd %ymm0, 32(%r12)
+ vmovupd %ymm1, 32(%r13)
+ vzeroupper
addq $176, %rsp
popq %r13
cfi_adjust_cfa_offset (-8)