diff options
author | Ulrich Drepper <drepper@redhat.com> | 2010-09-02 23:36:25 -0700 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2010-09-02 23:36:25 -0700 |
commit | 0959ffc97b738c489087bcf45578c1580a87e66d (patch) | |
tree | ac76fbfa5e53376a579a3220a4a7873624e4a296 /sysdeps/x86_64 | |
parent | ece298407076558531796450af39199aa0b34bef (diff) | |
download | glibc-0959ffc97b738c489087bcf45578c1580a87e66d.tar glibc-0959ffc97b738c489087bcf45578c1580a87e66d.tar.gz glibc-0959ffc97b738c489087bcf45578c1580a87e66d.tar.bz2 glibc-0959ffc97b738c489087bcf45578c1580a87e66d.zip |
Update x86-64 mpn routines from GMP 5.0.1.
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r-- | sysdeps/x86_64/add_n.S | 99 | ||||
-rw-r--r-- | sysdeps/x86_64/addmul_1.S | 115 | ||||
-rw-r--r-- | sysdeps/x86_64/lshift.S | 127 | ||||
-rw-r--r-- | sysdeps/x86_64/mul_1.S | 119 | ||||
-rw-r--r-- | sysdeps/x86_64/rshift.S | 129 | ||||
-rw-r--r-- | sysdeps/x86_64/sub_n.S | 28 | ||||
-rw-r--r-- | sysdeps/x86_64/submul_1.S | 32 |
7 files changed, 467 insertions, 182 deletions
diff --git a/sysdeps/x86_64/add_n.S b/sysdeps/x86_64/add_n.S index 7883f6c840..f0b4c3f78c 100644 --- a/sysdeps/x86_64/add_n.S +++ b/sysdeps/x86_64/add_n.S @@ -1,6 +1,6 @@ -/* Add two limb vectors of the same length > 0 and store sum in a third - limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. +/* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + Copyright (C) 2006, 2007 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -21,22 +21,81 @@ #include "sysdep.h" #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define vp %rdx +#define n %rcx +#define cy %r8 + +#ifndef func +# define func __mpn_add_n +# define ADCSBB adc +#endif + .text -ENTRY (__mpn_add_n) - leaq (%rsi,%rcx,8), %rsi - leaq (%rdi,%rcx,8), %rdi - leaq (%rdx,%rcx,8), %rdx - negq %rcx - xorl %eax, %eax # clear cy - .p2align 2 -L(loop): - movq (%rsi,%rcx,8), %rax - movq (%rdx,%rcx,8), %r10 - adcq %r10, %rax - movq %rax, (%rdi,%rcx,8) - incq %rcx - jne L(loop) - movq %rcx, %rax # zero %rax - adcq %rax, %rax +ENTRY (func) + xor %r8, %r8 + mov (up), %r10 + mov (vp), %r11 + + lea -8(up,n,8), up + lea -8(vp,n,8), vp + lea -16(rp,n,8), rp + mov %ecx, %eax + neg n + and $3, %eax + je L(b00) + add %rax, n /* clear low rcx bits for jrcxz */ + cmp $2, %eax + jl L(b01) + je L(b10) + +L(b11): shr %r8 /* set cy */ + jmp L(e11) + +L(b00): shr %r8 /* set cy */ + mov %r10, %r8 + mov %r11, %r9 + lea 4(n), n + jmp L(e00) + +L(b01): shr %r8 /* set cy */ + jmp L(e01) + +L(b10): shr %r8 /* set cy */ + mov %r10, %r8 + mov %r11, %r9 + jmp L(e10) + +L(end): ADCSBB %r11, %r10 + mov %r10, 8(rp) + mov %ecx, %eax /* clear eax, ecx contains 0 */ + adc %eax, %eax ret -END (__mpn_add_n) + + .p2align 4 +L(top): + mov -24(up,n,8), %r8 + mov -24(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -24(rp,n,8) +L(e00): + mov -16(up,n,8), %r10 + mov -16(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -16(rp,n,8) +L(e11): + mov -8(up,n,8), %r8 + mov -8(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -8(rp,n,8) +L(e10): + mov (up,n,8), %r10 + mov (vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, (rp,n,8) +L(e01): + jrcxz L(end) + lea 4(n), n + jmp L(top) +END (func) diff --git a/sysdeps/x86_64/addmul_1.S b/sysdeps/x86_64/addmul_1.S index bdb5226a33..e997896703 100644 --- a/sysdeps/x86_64/addmul_1.S +++ b/sysdeps/x86_64/addmul_1.S @@ -1,6 +1,6 @@ -/* AMD64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add +/* x86-64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add the result to a second limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -21,26 +21,95 @@ #include "sysdep.h" #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define n %rdx +#define v0 %rcx + +#ifndef func +# define func __mpn_addmul_1 +# define ADDSUB add +#endif + .text -ENTRY (__mpn_addmul_1) - movq %rdx, %r11 - leaq (%rsi,%rdx,8), %rsi - leaq (%rdi,%rdx,8), %rdi - negq %r11 - xorl %r8d, %r8d - xorl %r10d, %r10d - .p2align 2 -L(loop): - movq (%rsi,%r11,8), %rax - mulq %rcx - addq (%rdi,%r11,8), %rax - adcq %r10, %rdx - addq %r8, %rax - movq %r10, %r8 - movq %rax, (%rdi,%r11,8) - adcq %rdx, %r8 - incq %r11 - jne L(loop) - movq %r8, %rax +ENTRY (func) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + + bt $0, %ebx + jc L(odd) + + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + mul %rcx + add $2, %rbx + jns L(n2) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + jmp L(mid) + +L(odd): add $1, %rbx + jns L(n1) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + mul %rcx + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + jmp L(e) + + .p2align 4 +L(top): mul %rcx + ADDSUB %r8, %r10 + lea (%rax), %r8 + mov (up,%rbx,8), %rax + adc %r9, %r11 + mov %r10, -8(rp,%rbx,8) + mov (rp,%rbx,8), %r10 + lea (%rdx), %r9 + adc $0, %rbp +L(mid): mul %rcx + ADDSUB %r11, %r10 + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + adc %rbp, %r8 + mov %r10, (rp,%rbx,8) + mov 8(rp,%rbx,8), %r10 + lea (%rdx), %rbp + adc $0, %r9 +L(e): add $2, %rbx + js L(top) + + mul %rcx + ADDSUB %r8, %r10 + adc %r9, %r11 + mov %r10, -8(rp) + adc $0, %rbp +L(n2): mov (rp), %r10 + ADDSUB %r11, %r10 + adc %rbp, %rax + mov %r10, (rp) + adc $0, %rdx +L(n1): mov 8(rp), %r10 + ADDSUB %rax, %r10 + mov %r10, 8(rp) + mov %ebx, %eax /* zero rax */ + adc %rdx, %rax + pop %rbp + pop %rbx ret -END (__mpn_addmul_1) +END (func) diff --git a/sysdeps/x86_64/lshift.S b/sysdeps/x86_64/lshift.S index 5ac66f0a36..f89d3e09b3 100644 --- a/sysdeps/x86_64/lshift.S +++ b/sysdeps/x86_64/lshift.S @@ -1,5 +1,5 @@ -/* AMD64 __mpn_lshift -- - Copyright 2004, 2006 Free Software Foundation, Inc. +/* x86-64 __mpn_lshift -- + Copyright (C) 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -20,41 +20,98 @@ #include "sysdep.h" #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define n %rdx +#define cnt %cl .text ENTRY (__mpn_lshift) - movq -8(%rsi,%rdx,8), %mm7 - movd %ecx, %mm1 - movl $64, %eax - subl %ecx, %eax - movd %eax, %mm0 - movq %mm7, %mm3 - psrlq %mm0, %mm7 - movd %mm7, %rax - subq $2, %rdx - jl L(endo) - .p2align 2 -L(loop): - movq (%rsi,%rdx,8), %mm6 - movq %mm6, %mm2 - psrlq %mm0, %mm6 - psllq %mm1, %mm3 - por %mm6, %mm3 - movq %mm3, 8(%rdi,%rdx,8) - je L(ende) - movq -8(%rsi,%rdx,8), %mm7 - movq %mm7, %mm3 - psrlq %mm0, %mm7 - psllq %mm1, %mm2 - por %mm7, %mm2 - movq %mm2, (%rdi,%rdx,8) - subq $2, %rdx - jge L(loop) -L(endo): - movq %mm3, %mm2 -L(ende): - psllq %mm1, %mm2 - movq %mm2, (%rdi) - emms + lea -8(rp,n,8), rp + lea -8(up,n,8), up + + mov %edx, %eax + and $3, %eax + jne L(nb00) +L(b00): /* n = 4, 8, 12, ... */ + mov (up), %r10 + mov -8(up), %r11 + xor %eax, %eax + shld %cl, %r10, %rax + mov -16(up), %r8 + lea 24(rp), rp + sub $4, n + jmp L(00) + +L(nb00):/* n = 1, 5, 9, ... */ + cmp $2, %eax + jae L(nb01) +L(b01): mov (up), %r9 + xor %eax, %eax + shld %cl, %r9, %rax + sub $2, n + jb L(le1) + mov -8(up), %r10 + mov -16(up), %r11 + lea -8(up), up + lea 16(rp), rp + jmp L(01) +L(le1): shl %cl, %r9 + mov %r9, (rp) + ret + +L(nb01):/* n = 2, 6, 10, ... */ + jne L(b11) +L(b10): mov (up), %r8 + mov -8(up), %r9 + xor %eax, %eax + shld %cl, %r8, %rax + sub $3, n + jb L(le2) + mov -16(up), %r10 + lea -16(up), up + lea 8(rp), rp + jmp L(10) +L(le2): shld %cl, %r9, %r8 + mov %r8, (rp) + shl %cl, %r9 + mov %r9, -8(rp) + ret + + .p2align 4 /* performance critical! */ +L(b11): /* n = 3, 7, 11, ... */ + mov (up), %r11 + mov -8(up), %r8 + xor %eax, %eax + shld %cl, %r11, %rax + mov -16(up), %r9 + lea -24(up), up + sub $4, n + jb L(end) + + .p2align 4 +L(top): shld %cl, %r8, %r11 + mov (up), %r10 + mov %r11, (rp) +L(10): shld %cl, %r9, %r8 + mov -8(up), %r11 + mov %r8, -8(rp) +L(01): shld %cl, %r10, %r9 + mov -16(up), %r8 + mov %r9, -16(rp) +L(00): shld %cl, %r11, %r10 + mov -24(up), %r9 + mov %r10, -24(rp) + add $-32, up + lea -32(rp), rp + sub $4, n + jnc L(top) + +L(end): shld %cl, %r8, %r11 + mov %r11, (rp) + shld %cl, %r9, %r8 + mov %r8, -8(rp) + shl %cl, %r9 + mov %r9, -16(rp) ret END (__mpn_lshift) diff --git a/sysdeps/x86_64/mul_1.S b/sysdeps/x86_64/mul_1.S index 978916b72c..676afd1755 100644 --- a/sysdeps/x86_64/mul_1.S +++ b/sysdeps/x86_64/mul_1.S @@ -1,6 +1,6 @@ /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store the result in a second limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -21,22 +21,109 @@ #include <sysdep.h> #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define n_param %rdx +#define vl %rcx + +#define n %r11 + .text ENTRY (__mpn_mul_1) - movq %rdx, %r11 - leaq (%rsi,%rdx,8), %rsi - leaq (%rdi,%rdx,8), %rdi - negq %r11 - xorl %r8d, %r8d -L(loop): - movq (%rsi,%r11,8), %rax - mulq %rcx - addq %r8, %rax - movl $0, %r8d - adcq %rdx, %r8 - movq %rax, (%rdi,%r11,8) - incq %r11 - jne L(loop) - movq %r8, %rax + push %rbx + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbx, 0) + xor %r10, %r10 + mov (up), %rax /* read first u limb early */ + mov n_param, %rbx /* move away n from rdx, mul uses it */ + mul vl + mov %rbx, %r11 + + add %r10, %rax + adc $0, %rdx + + and $3, %ebx + jz L(b0) + cmp $2, %ebx + jz L(b2) + jg L(b3) + +L(b1): dec n + jne L(gt1) + mov %rax, (rp) + jmp L(ret) +L(gt1): lea 8(up,n,8), up + lea -8(rp,n,8), rp + neg n + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (up,n,8), %rax + mov %rdx, %r8 + jmp L(L1) + +L(b0): lea (up,n,8), up + lea -16(rp,n,8), rp + neg n + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp L(L0) + +L(b3): lea -8(up,n,8), up + lea -24(rp,n,8), rp + neg n + mov %rax, %rbx + mov %rdx, %r10 + jmp L(L3) + +L(b2): lea -16(up,n,8), up + lea -32(rp,n,8), rp + neg n + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(up,n,8), %rax + mov %rdx, %r9 + jmp L(L2) + + .p2align 4 +L(top): mov %r10, (rp,n,8) + add %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 + mov $0, %r10d +L(L1): mul vl + mov %r9, 8(rp,n,8) + add %rax, %r8 + adc %rdx, %rbx +L(L0): mov 8(up,n,8), %rax + mul vl + mov %r8, 16(rp,n,8) + add %rax, %rbx + adc %rdx, %r10 +L(L3): mov 16(up,n,8), %rax + mul vl + mov %rbx, 24(rp,n,8) + mov $0, %r8d # zero + mov %r8, %rbx # zero + add %rax, %r10 + mov 24(up,n,8), %rax + mov %r8, %r9 # zero + adc %rdx, %r9 +L(L2): mul vl + add $4, n + js L(top) + + mov %r10, (rp,n,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(rp,n,8) + add %r8, %rdx +L(ret): mov %rdx, %rax + + pop %rbx + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbx) ret END (__mpn_mul_1) diff --git a/sysdeps/x86_64/rshift.S b/sysdeps/x86_64/rshift.S index ee0c8aa15c..8ff055169a 100644 --- a/sysdeps/x86_64/rshift.S +++ b/sysdeps/x86_64/rshift.S @@ -1,5 +1,5 @@ -/* AMD64 __mpn_rshift -- - Copyright (C) 2004, 2006 Free Software Foundation, Inc. +/* x86-64 __mpn_rshift -- + Copyright (C) 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -20,43 +20,96 @@ #include "sysdep.h" #include "asm-syntax.h" +#define rp %rdi +#define up %rsi +#define n %rdx +#define cnt %cl + .text ENTRY (__mpn_rshift) - movq (%rsi), %mm7 - movd %ecx, %mm1 - movl $64, %eax - subl %ecx, %eax - movd %eax, %mm0 - movq %mm7, %mm3 - psllq %mm0, %mm7 - movd %mm7, %rax - leaq (%rsi,%rdx,8), %rsi - leaq (%rdi,%rdx,8), %rdi - negq %rdx - addq $2, %rdx - jg L(endo) - .p2align 2 -L(loop): - movq -8(%rsi,%rdx,8), %mm6 - movq %mm6, %mm2 - psllq %mm0, %mm6 - psrlq %mm1, %mm3 - por %mm6, %mm3 - movq %mm3, -16(%rdi,%rdx,8) - je L(ende) - movq (%rsi,%rdx,8), %mm7 - movq %mm7, %mm3 - psllq %mm0, %mm7 - psrlq %mm1, %mm2 - por %mm7, %mm2 - movq %mm2, -8(%rdi,%rdx,8) - addq $2, %rdx - jle L(loop) -L(endo): - movq %mm3, %mm2 -L(ende): - psrlq %mm1, %mm2 - movq %mm2, -8(%rdi) - emms + mov %edx, %eax + and $3, %eax + jne L(nb00) +L(b00): /* n = 4, 8, 12, ... */ + mov (up), %r10 + mov 8(up), %r11 + xor %eax, %eax + shrd %cl, %r10, %rax + mov 16(up), %r8 + lea 8(up), up + lea -24(rp), rp + sub $4, n + jmp L(00) + +L(nb00):/* n = 1, 5, 9, ... */ + cmp $2, %eax + jae L(nb01) +L(b01): mov (up), %r9 + xor %eax, %eax + shrd %cl, %r9, %rax + sub $2, n + jb L(le1) + mov 8(up), %r10 + mov 16(up), %r11 + lea 16(up), up + lea -16(rp), rp + jmp L(01) +L(le1): shr %cl, %r9 + mov %r9, (rp) + ret + +L(nb01):/* n = 2, 6, 10, ... */ + jne L(b11) +L(b10): mov (up), %r8 + mov 8(up), %r9 + xor %eax, %eax + shrd %cl, %r8, %rax + sub $3, n + jb L(le2) + mov 16(up), %r10 + lea 24(up), up + lea -8(rp), rp + jmp L(10) +L(le2): shrd %cl, %r9, %r8 + mov %r8, (rp) + shr %cl, %r9 + mov %r9, 8(rp) + ret + + .p2align 4 +L(b11): /* n = 3, 7, 11, ... */ + mov (up), %r11 + mov 8(up), %r8 + xor %eax, %eax + shrd %cl, %r11, %rax + mov 16(up), %r9 + lea 32(up), up + sub $4, n + jb L(end) + + .p2align 4 +L(top): shrd %cl, %r8, %r11 + mov -8(up), %r10 + mov %r11, (rp) +L(10): shrd %cl, %r9, %r8 + mov (up), %r11 + mov %r8, 8(rp) +L(01): shrd %cl, %r10, %r9 + mov 8(up), %r8 + mov %r9, 16(rp) +L(00): shrd %cl, %r11, %r10 + mov 16(up), %r9 + mov %r10, 24(rp) + add $32, up + lea 32(rp), rp + sub $4, n + jnc L(top) + +L(end): shrd %cl, %r8, %r11 + mov %r11, (rp) + shrd %cl, %r9, %r8 + mov %r8, 8(rp) + shr %cl, %r9 + mov %r9, 16(rp) ret END (__mpn_rshift) diff --git a/sysdeps/x86_64/sub_n.S b/sysdeps/x86_64/sub_n.S index 48e1a2e0f4..60c15fc3e1 100644 --- a/sysdeps/x86_64/sub_n.S +++ b/sysdeps/x86_64/sub_n.S @@ -1,6 +1,6 @@ -/* AMD64 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store +/* x86-64 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2006, 2007 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -18,25 +18,7 @@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include "sysdep.h" -#include "asm-syntax.h" +#define func __mpn_sub_n +#define ADCSBB sbb - .text -ENTRY (__mpn_sub_n) - leaq (%rsi,%rcx,8), %rsi - leaq (%rdi,%rcx,8), %rdi - leaq (%rdx,%rcx,8), %rdx - negq %rcx - xorl %eax, %eax # clear cy - .p2align 2 -L(loop): - movq (%rsi,%rcx,8), %rax - movq (%rdx,%rcx,8), %r10 - sbbq %r10, %rax - movq %rax, (%rdi,%rcx,8) - incq %rcx - jne L(loop) - movq %rcx, %rax # zero %rax - adcq %rax, %rax - ret -END (__mpn_sub_n) +#include "add_n.S" diff --git a/sysdeps/x86_64/submul_1.S b/sysdeps/x86_64/submul_1.S index e94c9a7bee..150a92762f 100644 --- a/sysdeps/x86_64/submul_1.S +++ b/sysdeps/x86_64/submul_1.S @@ -1,6 +1,6 @@ -/* AMD64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +/* x86-64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract the result from a second limb vector. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -18,29 +18,7 @@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include "sysdep.h" -#include "asm-syntax.h" +#define func __mpn_submul_1 +#define ADDSUB sub - .text -ENTRY (__mpn_submul_1) - movq %rdx, %r11 - leaq (%rsi,%r11,8), %rsi - leaq (%rdi,%r11,8), %rdi - negq %r11 - xorl %r8d, %r8d - .p2align 3 -L(loop): - movq (%rsi,%r11,8), %rax - movq (%rdi,%r11,8), %r10 - mulq %rcx - subq %r8, %r10 - movl $0, %r8d - adcl %r8d, %r8d - subq %rax, %r10 - adcq %rdx, %r8 - movq %r10, (%rdi,%r11,8) - incq %r11 - jne L(loop) - movq %r8, %rax - ret -END (__mpn_submul_1) +#include "addmul_1.S" |