diff options
Diffstat (limited to 'sysdeps/x86_64/mul_1.S')
-rw-r--r-- | sysdeps/x86_64/mul_1.S | 119 |
1 files changed, 103 insertions, 16 deletions
# Patch summary (commentary only; everything from "diff --git" onward is the patch itself).
#
# Target: sysdeps/x86_64/mul_1.S, routine __mpn_mul_1.
#
# Hunk 1 (-1,6 +1,6): extends the copyright years in the header comment
#   from "2004" to "2003, 2004, 2005, 2007, 2008".
#
# Hunk 2 (-21,22 +21,109): replaces the whole body of __mpn_mul_1.
#   Removed code: one limb per iteration.  Both pointers are advanced to the
#     end of their vectors, the index in %r11 is negated and counted up with
#     incq/jne, and the running high limb is kept in %r8 and returned in %rax.
#   Added code:
#     - cpp aliases for the registers used: rp = %rdi, up = %rsi,
#       n_param = %rdx, vl = %rcx, n = %r11.
#     - %rbx is pushed on entry and popped before ret, with matching
#       cfi_adjust_cfa_offset / cfi_rel_offset / cfi_restore annotations.
#     - n is copied out of %rdx before the first `mul vl`, because mul
#       writes %rdx (see the inline comment on that line).
#     - (up)[0] * vl is computed before any dispatch.
#     - Dispatch on n & 3:  0 -> L(b0),  2 -> L(b2),  3 -> L(b3),
#       1 -> falls through into L(b1).  In L(b1), if n - 1 == 0 the single
#       low limb is stored to (rp) and control jumps straight to L(ret).
#     - Each L(bX)/L(gt1) stub biases up and rp with lea, negates n, seeds
#       the working registers (%r8, %r9, %r10, %rbx) from the first product,
#       and jumps to the matching entry label inside the loop
#       (L(L1), L(L0), L(L3), L(L2)).
#     - The loop at L(top) contains four `mul vl` per pass and runs while
#       `add $4, n` leaves n negative (js L(top)).
#     - After the loop the last two result limbs are stored and the final
#       %rdx is moved to %rax at L(ret) as the return value.
#
# NOTE(review): `add %r10, %rax` / `adc $0, %rdx` execute right after
#   `xor %r10, %r10`, so they add zero on this path -- presumably kept for
#   symmetry with a variant that takes a carry-in; confirm before removing.
# NOTE(review): `mov (up), %rax` runs before n is examined, so the new code
#   appears to assume n >= 1 -- confirm against callers.
# NOTE(review): whitespace inside the lines below (tabs, blank context lines,
#   wrapping of the header comment) was reconstructed from a whitespace-
#   collapsed page dump; verify against the repository before applying.
#
diff --git a/sysdeps/x86_64/mul_1.S b/sysdeps/x86_64/mul_1.S
index 978916b72c..676afd1755 100644
--- a/sysdeps/x86_64/mul_1.S
+++ b/sysdeps/x86_64/mul_1.S
@@ -1,6 +1,6 @@
 /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
    the result in a second limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -21,22 +21,109 @@
 #include <sysdep.h>
 #include "asm-syntax.h"
 
+#define rp	%rdi
+#define up	%rsi
+#define n_param	%rdx
+#define vl	%rcx
+
+#define n	%r11
+
 	.text
 ENTRY (__mpn_mul_1)
-	movq	%rdx, %r11
-	leaq	(%rsi,%rdx,8), %rsi
-	leaq	(%rdi,%rdx,8), %rdi
-	negq	%r11
-	xorl	%r8d, %r8d
-L(loop):
-	movq	(%rsi,%r11,8), %rax
-	mulq	%rcx
-	addq	%r8, %rax
-	movl	$0, %r8d
-	adcq	%rdx, %r8
-	movq	%rax, (%rdi,%r11,8)
-	incq	%r11
-	jne	L(loop)
-	movq	%r8, %rax
+	push	%rbx
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (%rbx, 0)
+	xor	%r10, %r10
+	mov	(up), %rax		/* read first u limb early */
+	mov	n_param, %rbx		/* move away n from rdx, mul uses it */
+	mul	vl
+	mov	%rbx, %r11
+
+	add	%r10, %rax
+	adc	$0, %rdx
+
+	and	$3, %ebx
+	jz	L(b0)
+	cmp	$2, %ebx
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	dec	n
+	jne	L(gt1)
+	mov	%rax, (rp)
+	jmp	L(ret)
+L(gt1):	lea	8(up,n,8), up
+	lea	-8(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	xor	%ebx, %ebx
+	mov	%rax, %r9
+	mov	(up,n,8), %rax
+	mov	%rdx, %r8
+	jmp	L(L1)
+
+L(b0):	lea	(up,n,8), up
+	lea	-16(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	mov	%rax, %r8
+	mov	%rdx, %rbx
+	jmp	L(L0)
+
+L(b3):	lea	-8(up,n,8), up
+	lea	-24(rp,n,8), rp
+	neg	n
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	jmp	L(L3)
+
+L(b2):	lea	-16(up,n,8), up
+	lea	-32(rp,n,8), rp
+	neg	n
+	xor	%r8, %r8
+	xor	%ebx, %ebx
+	mov	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%rdx, %r9
+	jmp	L(L2)
+
+	.p2align 4
+L(top):	mov	%r10, (rp,n,8)
+	add	%rax, %r9
+	mov	(up,n,8), %rax
+	adc	%rdx, %r8
+	mov	$0, %r10d
+L(L1):	mul	vl
+	mov	%r9, 8(rp,n,8)
+	add	%rax, %r8
+	adc	%rdx, %rbx
+L(L0):	mov	8(up,n,8), %rax
+	mul	vl
+	mov	%r8, 16(rp,n,8)
+	add	%rax, %rbx
+	adc	%rdx, %r10
+L(L3):	mov	16(up,n,8), %rax
+	mul	vl
+	mov	%rbx, 24(rp,n,8)
+	mov	$0, %r8d		# zero
+	mov	%r8, %rbx		# zero
+	add	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%r8, %r9		# zero
+	adc	%rdx, %r9
+L(L2):	mul	vl
+	add	$4, n
+	js	L(top)
+
+	mov	%r10, (rp,n,8)
+	add	%rax, %r9
+	adc	%r8, %rdx
+	mov	%r9, 8(rp,n,8)
+	add	%r8, %rdx
+L(ret):	mov	%rdx, %rax
+
+	pop	%rbx
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (%rbx)
 	ret
 END (__mpn_mul_1)
|