Diffstat (limited to 'REORG.TODO/sysdeps/i386/i586')
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/add_n.S       | 143
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/addmul_1.S    |  94
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/bzero.S       |   4
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/init-arch.h   |  19
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/lshift.S      | 255
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/memcopy.h     |  95
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/memcpy.S      | 124
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/mempcpy.S     |   8
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/memset.S      | 121
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/memusage.h    |   1
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/mul_1.S       |  90
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/rshift.S      | 255
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/stpcpy.S      |   8
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/strchr.S      | 348
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/strcpy.S      | 169
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/strlen.S      | 182
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/sub_n.S       | 143
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/submul_1.S    |  94
18 files changed, 2153 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/i586/add_n.S b/REORG.TODO/sysdeps/i386/i586/add_n.S
new file mode 100644
index 0000000000..f73df092f0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/add_n.S
@@ -0,0 +1,143 @@
+/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+ sum in a third limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define S2 S1+4
+#define SIZE S2+4
+
+ .text
+ENTRY (__mpn_add_n)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 12)
+ movl S1(%esp),%esi
+ cfi_rel_offset (esi, 8)
+ movl S2(%esp),%ebx
+ cfi_rel_offset (ebx, 0)
+ movl SIZE(%esp),%ecx
+ movl (%ebx),%ebp
+ cfi_rel_offset (ebp, 4)
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx /* zero carry flag */
+ jz L(end)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
+
+ ALIGN (3)
+L(oop): movl 28(%edi),%eax /* fetch destination cache line */
+ leal 32(%edi),%edi
+
+L(1): movl (%esi),%eax
+ movl 4(%esi),%edx
+ adcl %ebp,%eax
+ movl 4(%ebx),%ebp
+ adcl %ebp,%edx
+ movl 8(%ebx),%ebp
+ movl %eax,-32(%edi)
+ movl %edx,-28(%edi)
+
+L(2): movl 8(%esi),%eax
+ movl 12(%esi),%edx
+ adcl %ebp,%eax
+ movl 12(%ebx),%ebp
+ adcl %ebp,%edx
+ movl 16(%ebx),%ebp
+ movl %eax,-24(%edi)
+ movl %edx,-20(%edi)
+
+L(3): movl 16(%esi),%eax
+ movl 20(%esi),%edx
+ adcl %ebp,%eax
+ movl 20(%ebx),%ebp
+ adcl %ebp,%edx
+ movl 24(%ebx),%ebp
+ movl %eax,-16(%edi)
+ movl %edx,-12(%edi)
+
+L(4): movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ adcl %ebp,%eax
+ movl 28(%ebx),%ebp
+ adcl %ebp,%edx
+ movl 32(%ebx),%ebp
+ movl %eax,-8(%edi)
+ movl %edx,-4(%edi)
+
+ leal 32(%esi),%esi
+ leal 32(%ebx),%ebx
+ decl %ecx
+ jnz L(oop)
+
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+L(end):
+ decl %edx /* test %edx w/o clobbering carry */
+ js L(end2)
+ incl %edx
+L(oop2):
+ leal 4(%edi),%edi
+ movl (%esi),%eax
+ adcl %ebp,%eax
+ movl 4(%ebx),%ebp
+ movl %eax,-4(%edi)
+ leal 4(%esi),%esi
+ leal 4(%ebx),%ebx
+ decl %edx
+ jnz L(oop2)
+L(end2):
+ movl (%esi),%eax
+ adcl %ebp,%eax
+ movl %eax,(%edi)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_add_n)
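
For reference, here is roughly what the unrolled loop above computes, as portable C. This is an editorial sketch, not glibc code: the mpn_add_n_ref name is hypothetical and the limb type is assumed to be the 32-bit unsigned int of i386. The asm gets the carry chain for free from adcl; C has to recompute it per limb.

    typedef unsigned int mp_limb_t;   /* 32-bit limb, as on i386 */

    /* Add the n-limb vectors s1 and s2 (n > 0), store the sum in res,
       and return the final carry -- the contract of __mpn_add_n.  */
    mp_limb_t
    mpn_add_n_ref (mp_limb_t *res, const mp_limb_t *s1,
                   const mp_limb_t *s2, int n)
    {
      mp_limb_t carry = 0;
      for (int i = 0; i < n; i++)
        {
          mp_limb_t a = s1[i];
          mp_limb_t sum = a + s2[i] + carry;
          /* Carry out: with carry-in 1, sum <= a iff we wrapped;
             with carry-in 0, sum < a iff we wrapped.  */
          carry = carry ? (sum <= a) : (sum < a);
          res[i] = sum;
        }
      return carry;
    }

The L(1)..L(4) blocks are eight iterations of this loop per pass, with the initial movl 28(%edi) pulling the destination cache line in before the stores.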
diff --git a/REORG.TODO/sysdeps/i386/i586/addmul_1.S b/REORG.TODO/sysdeps/i386/i586/addmul_1.S
new file mode 100644
index 0000000000..a713192982
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/addmul_1.S
@@ -0,0 +1,94 @@
+/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ the result to a second limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define SIZE S1+4
+#define S2LIMB SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+ .text
+ENTRY (__mpn_addmul_1)
+
+ pushl %res_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %s1_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %s2_limb
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp), %res_ptr
+ cfi_rel_offset (res_ptr, 12)
+ movl S1(%esp), %s1_ptr
+ cfi_rel_offset (s1_ptr, 8)
+ movl SIZE(%esp), %size
+ movl S2LIMB(%esp), %s2_limb
+ cfi_rel_offset (s2_limb, 0)
+ leal (%res_ptr,%size,4), %res_ptr
+ leal (%s1_ptr,%size,4), %s1_ptr
+ negl %size
+ xorl %ebp, %ebp
+ cfi_rel_offset (ebp, 4)
+ ALIGN (3)
+
+L(oop): adcl $0, %ebp
+ movl (%s1_ptr,%size,4), %eax
+
+ mull %s2_limb
+
+ addl %ebp, %eax
+ movl (%res_ptr,%size,4), %ebp
+
+ adcl $0, %edx
+ addl %eax, %ebp
+
+ movl %ebp, (%res_ptr,%size,4)
+ incl %size
+
+ movl %edx, %ebp
+ jnz L(oop)
+
+ adcl $0, %ebp
+ movl %ebp, %eax
+ popl %s2_limb
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s2_limb)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %s1_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s1_ptr)
+ popl %res_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (res_ptr)
+
+ ret
+#undef size
+END (__mpn_addmul_1)
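
As above, a hedged C sketch of the contract (the name and the 64-bit intermediate are editorial assumptions; the asm keeps the carry limb in %ebp and lets mull/adcl do the widening arithmetic):

    typedef unsigned int mp_limb_t;

    /* res[0..n-1] += s1[0..n-1] * limb; return the high limb that
       does not fit -- the contract of __mpn_addmul_1.  */
    mp_limb_t
    mpn_addmul_1_ref (mp_limb_t *res, const mp_limb_t *s1,
                      int n, mp_limb_t limb)
    {
      mp_limb_t carry = 0;
      for (int i = 0; i < n; i++)
        {
          unsigned long long p = (unsigned long long) s1[i] * limb
                                 + res[i] + carry;
          res[i] = (mp_limb_t) p;          /* low half, like %eax */
          carry = (mp_limb_t) (p >> 32);   /* high half, like %edx */
        }
      return carry;
    }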
diff --git a/REORG.TODO/sysdeps/i386/i586/bzero.S b/REORG.TODO/sysdeps/i386/i586/bzero.S
new file mode 100644
index 0000000000..2a106719a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/bzero.S
@@ -0,0 +1,4 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include <sysdeps/i386/i586/memset.S>
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/i586/init-arch.h b/REORG.TODO/sysdeps/i386/i586/init-arch.h
new file mode 100644
index 0000000000..4711212e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define MINIMUM_ISA 586
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/i586/lshift.S b/REORG.TODO/sysdeps/i386/i586/lshift.S
new file mode 100644
index 0000000000..7941c28d9d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/lshift.S
@@ -0,0 +1,255 @@
+/* Pentium optimized __mpn_lshift --
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S RES+4
+#define SIZE S+4
+#define CNT SIZE+4
+
+ .text
+ENTRY (__mpn_lshift)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebp, 0)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 12)
+ movl S(%esp),%esi
+ cfi_rel_offset (esi, 8)
+ movl SIZE(%esp),%ebx
+ cfi_rel_offset (ebx, 0)
+ movl CNT(%esp),%ecx
+
+/* We can use faster code for shift-by-1 under certain conditions. */
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%esi),%eax
+ cmpl %edi,%eax
+ jnc L(special) /* jump if s_ptr + 1 >= res_ptr */
+ leal (%esi,%ebx,4),%eax
+ cmpl %eax,%edi
+ jnc L(special) /* jump if res_ptr >= s_ptr + size */
+
+L(normal):
+ leal -4(%edi,%ebx,4),%edi
+ leal -4(%esi,%ebx,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+ xorl %eax,%eax
+ shldl %cl,%edx,%eax /* compute carry limb */
+ pushl %eax /* push carry limb onto stack */
+ cfi_adjust_cfa_offset (4)
+
+ decl %ebx
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ shrl $3,%ebx
+ jz L(end)
+
+ movl (%edi),%eax /* fetch destination cache line */
+
+ ALIGN (2)
+L(oop): movl -28(%edi),%eax /* fetch destination cache line */
+ movl %edx,%ebp
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ shldl %cl,%eax,%ebp
+ shldl %cl,%edx,%eax
+ movl %ebp,(%edi)
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebp
+ movl -12(%esi),%eax
+ shldl %cl,%ebp,%edx
+ shldl %cl,%eax,%ebp
+ movl %edx,-8(%edi)
+ movl %ebp,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebp
+ shldl %cl,%edx,%eax
+ shldl %cl,%ebp,%edx
+ movl %eax,-16(%edi)
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ shldl %cl,%eax,%ebp
+ shldl %cl,%edx,%eax
+ movl %ebp,-24(%edi)
+ movl %eax,-28(%edi)
+
+ subl $32,%esi
+ subl $32,%edi
+ decl %ebx
+ jnz L(oop)
+
+L(end): popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ andl $7,%ebx
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shldl %cl,%eax,%edx
+ movl %edx,(%edi)
+ movl %eax,%edx
+ subl $4,%esi
+ subl $4,%edi
+ decl %ebx
+ jnz L(oop2)
+
+L(end2):
+ shll %cl,%edx /* compute least significant limb */
+ movl %edx,(%edi) /* store it */
+
+ popl %eax /* pop carry limb */
+ cfi_adjust_cfa_offset (-4)
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+/* This shift-by-1 path loops from the least significant end of the
+   arrays, which is permissible only when source and destination do
+   not overlap the wrong way; since the function is documented to
+   work for overlapping operands, the checks above guard entry here.
+*/
+
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (edi, 12)
+ cfi_rel_offset (esi, 8)
+ cfi_rel_offset (ebp, 4)
+ cfi_rel_offset (ebx, 0)
+L(special):
+ movl (%esi),%edx
+ addl $4,%esi
+
+ decl %ebx
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ shrl $3,%ebx
+
+ addl %edx,%edx
+ incl %ebx
+ decl %ebx
+ jz L(Lend)
+
+ movl (%edi),%eax /* fetch destination cache line */
+
+ ALIGN (2)
+L(Loop):
+ movl 28(%edi),%eax /* fetch destination cache line */
+ movl %edx,%ebp
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ adcl %eax,%eax
+ movl %ebp,(%edi)
+ adcl %edx,%edx
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebp
+ movl 12(%esi),%eax
+ adcl %ebp,%ebp
+ movl %edx,8(%edi)
+ adcl %eax,%eax
+ movl %ebp,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebp
+ adcl %edx,%edx
+ movl %eax,16(%edi)
+ adcl %ebp,%ebp
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ adcl %eax,%eax
+ movl %ebp,24(%edi)
+ adcl %edx,%edx
+ movl %eax,28(%edi)
+
+ leal 32(%esi),%esi /* use leal not to clobber carry */
+ leal 32(%edi),%edi
+ decl %ebx
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ sbbl %eax,%eax /* save carry in %eax */
+ andl $7,%ebx
+ jz L(Lend2)
+ addl %eax,%eax /* restore carry from eax */
+L(Loop2):
+ movl %edx,%ebp
+ movl (%esi),%edx
+ adcl %edx,%edx
+ movl %ebp,(%edi)
+
+ leal 4(%esi),%esi /* use leal not to clobber carry */
+ leal 4(%edi),%edi
+ decl %ebx
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax /* restore carry from eax */
+L(L1): movl %edx,(%edi) /* store last limb */
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_lshift)
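
The L(normal) path above is, in portable terms, the following editorial sketch (assumed name and limb type). Note that it walks from the most significant limb downward, which is why it tolerates res >= src overlap, while the shift-by-1 L(special) path walks upward, doubling each limb with addl/adcl so the carry chains through:

    typedef unsigned int mp_limb_t;

    /* Shift the n-limb vector src left by cnt bits (0 < cnt < 32),
       store the result in res, and return the bits shifted out of
       the most significant limb.  */
    mp_limb_t
    mpn_lshift_ref (mp_limb_t *res, const mp_limb_t *src,
                    int n, unsigned int cnt)
    {
      mp_limb_t high = src[n - 1];
      mp_limb_t retval = high >> (32 - cnt);   /* carry limb */
      for (int i = n - 1; i > 0; i--)
        {
          mp_limb_t low = src[i - 1];
          res[i] = (high << cnt) | (low >> (32 - cnt));   /* shldl */
          high = low;
        }
      res[0] = high << cnt;
      return retval;
    }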
diff --git a/REORG.TODO/sysdeps/i386/i586/memcopy.h b/REORG.TODO/sysdeps/i386/i586/memcopy.h
new file mode 100644
index 0000000000..39f020a746
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memcopy.h
@@ -0,0 +1,95 @@
+/* memcopy.h -- definitions for memory copy functions. Pentium version.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ Contributed by Torbjorn Granlund (tege@sics.se).
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Get the i386 definitions. We will override some of them below. */
+#include <sysdeps/i386/memcopy.h>
+
+/* Written like this, the loop runs on the Pentium pipeline at a
+   sustained rate of 2 instructions/clock, or asymptotically 480
+   Mbytes/second at 60 MHz.  */
+
+#undef WORD_COPY_FWD
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \
+ do \
+ { \
+ asm volatile ("subl $32,%2\n" \
+ "js 2f\n" \
+ "movl 0(%0),%%edx\n" /* alloc dest line */ \
+ "1:\n" \
+ "movl 28(%0),%%eax\n" /* alloc dest line */ \
+ "subl $32,%2\n" /* decr loop count */ \
+ "movl 0(%1),%%eax\n" /* U pipe */ \
+ "movl 4(%1),%%edx\n" /* V pipe */ \
+ "movl %%eax,0(%0)\n" /* U pipe */ \
+ "movl %%edx,4(%0)\n" /* V pipe */ \
+ "movl 8(%1),%%eax\n" \
+ "movl 12(%1),%%edx\n" \
+ "movl %%eax,8(%0)\n" \
+ "movl %%edx,12(%0)\n" \
+ "movl 16(%1),%%eax\n" \
+ "movl 20(%1),%%edx\n" \
+ "movl %%eax,16(%0)\n" \
+ "movl %%edx,20(%0)\n" \
+ "movl 24(%1),%%eax\n" \
+ "movl 28(%1),%%edx\n" \
+ "movl %%eax,24(%0)\n" \
+ "movl %%edx,28(%0)\n" \
+ "leal 32(%1),%1\n" /* update src ptr */ \
+ "leal 32(%0),%0\n" /* update dst ptr */ \
+ "jns 1b\n" \
+ "2: addl $32,%2" : \
+ "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \
+ "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \
+ "ax", "dx"); \
+ } while (0)
+
+#undef WORD_COPY_BWD
+#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes) \
+ do \
+ { \
+ asm volatile ("subl $32,%2\n" \
+ "js 2f\n" \
+ "movl -4(%0),%%edx\n" \
+ "1:\n" \
+ "movl -32(%0),%%eax\n" \
+ "subl $32,%2\n" \
+ "movl -4(%1),%%eax\n" \
+ "movl -8(%1),%%edx\n" \
+ "movl %%eax,-4(%0)\n" \
+ "movl %%edx,-8(%0)\n" \
+ "movl -12(%1),%%eax\n" \
+ "movl -16(%1),%%edx\n" \
+ "movl %%eax,-12(%0)\n" \
+ "movl %%edx,-16(%0)\n" \
+ "movl -20(%1),%%eax\n" \
+ "movl -24(%1),%%edx\n" \
+ "movl %%eax,-20(%0)\n" \
+ "movl %%edx,-24(%0)\n" \
+ "movl -28(%1),%%eax\n" \
+ "movl -32(%1),%%edx\n" \
+ "movl %%eax,-28(%0)\n" \
+ "movl %%edx,-32(%0)\n" \
+ "leal -32(%1),%1\n" \
+ "leal -32(%0),%0\n" \
+ "jns 1b\n" \
+ "2: addl $32,%2" : \
+ "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) : \
+ "0" (dst_ep), "1" (src_ep), "2" (nbytes) : \
+ "ax", "dx"); \
+ } while (0)
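
A plain-C approximation of what WORD_COPY_FWD does, for readers who do not speak constraint syntax. A sketch under the memcopy.h conventions as assumed here: both addresses already word aligned, and the sub-32-byte tail reported back (via nbytes_left) for the caller to finish:

    #include <stddef.h>

    static inline size_t
    word_copy_fwd_sketch (char *dst, const char *src, size_t nbytes)
    {
      while (nbytes >= 32)
        {
          unsigned int *d = (unsigned int *) dst;
          const unsigned int *s = (const unsigned int *) src;
          for (int i = 0; i < 8; i++)   /* the eight paired movl's */
            d[i] = s[i];
          dst += 32;
          src += 32;
          nbytes -= 32;
        }
      return nbytes;   /* becomes nbytes_left */
    }

What the C version cannot express is the scheduling: the asm interleaves the loads and stores so the U and V pipes each retire one instruction per clock, and the leading movl 28(%0) load allocates the destination cache line before the stores hit it.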
diff --git a/REORG.TODO/sysdeps/i386/i586/memcpy.S b/REORG.TODO/sysdeps/i386/i586/memcpy.S
new file mode 100644
index 0000000000..6474a3f653
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memcpy.S
@@ -0,0 +1,124 @@
+/* Highly optimized version for i586.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+8 /* space for 2 saved regs */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+#define LEN SRC+4
+
+ .text
+#if defined PIC && IS_IN (libc)
+ENTRY (__memcpy_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk)
+#endif
+ENTRY (memcpy)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+
+ movl DEST(%esp), %edi
+ cfi_rel_offset (edi, 4)
+ movl SRC(%esp), %esi
+ cfi_rel_offset (esi, 0)
+ movl LEN(%esp), %ecx
+ movl %edi, %eax
+
+ /* We need this in any case. */
+ cld
+
+ /* Cutoff for the big loop is a size of 32 bytes since otherwise
+ the loop will never be entered. */
+ cmpl $32, %ecx
+ jbe L(1)
+
+ negl %eax
+ andl $3, %eax
+ subl %eax, %ecx
+ xchgl %eax, %ecx
+
+ rep; movsb
+
+ movl %eax, %ecx
+ subl $32, %ecx
+ js L(2)
+
+ /* Read ahead to make sure we write in the cache since the stupid
+ i586 designers haven't implemented read-on-write-miss. */
+ movl (%edi), %eax
+L(3): movl 28(%edi), %edx
+
+ /* Now correct the loop counter. Please note that in the following
+ code the flags are not changed anymore. */
+ subl $32, %ecx
+
+ movl (%esi), %eax
+ movl 4(%esi), %edx
+ movl %eax, (%edi)
+ movl %edx, 4(%edi)
+ movl 8(%esi), %eax
+ movl 12(%esi), %edx
+ movl %eax, 8(%edi)
+ movl %edx, 12(%edi)
+ movl 16(%esi), %eax
+ movl 20(%esi), %edx
+ movl %eax, 16(%edi)
+ movl %edx, 20(%edi)
+ movl 24(%esi), %eax
+ movl 28(%esi), %edx
+ movl %eax, 24(%edi)
+ movl %edx, 28(%edi)
+
+ leal 32(%esi), %esi
+ leal 32(%edi), %edi
+
+ jns L(3)
+
+ /* Correct extra loop counter modification. */
+L(2): addl $32, %ecx
+#ifndef USE_AS_MEMPCPY
+ movl DEST(%esp), %eax
+#endif
+
+L(1): rep; movsb
+
+#ifdef USE_AS_MEMPCPY
+ movl %edi, %eax
+#endif
+
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (memcpy)
+#ifndef USE_AS_MEMPCPY
+libc_hidden_builtin_def (memcpy)
+#endif
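
In outline, the entry byte-copies up to 3 bytes to align the destination, streams 32-byte blocks with a read-ahead load of 28(%edi) to allocate the destination cache line, then lets rep movsb mop up the tail. A hedged C rendering (editorial names; the word accesses may be unaligned, which x86 permits though strict ISO C does not):

    #include <stddef.h>
    #include <stdint.h>

    void *
    memcpy_i586_sketch (void *dstv, const void *srcv, size_t len)
    {
      unsigned char *d = dstv;
      const unsigned char *s = srcv;

      if (len > 32)
        {
          size_t head = (-(uintptr_t) d) & 3;   /* 0-3 bytes to align dst */
          len -= head;
          while (head--)
            *d++ = *s++;
          while (len >= 32)                     /* the L(3) block */
            {
              unsigned int *dw = (unsigned int *) d;
              const unsigned int *sw = (const unsigned int *) s;
              for (int i = 0; i < 8; i++)
                dw[i] = sw[i];
              d += 32; s += 32; len -= 32;
            }
        }
      while (len--)                             /* rep movsb tail */
        *d++ = *s++;
      return dstv;
    }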
diff --git a/REORG.TODO/sysdeps/i386/i586/mempcpy.S b/REORG.TODO/sysdeps/i386/i586/mempcpy.S
new file mode 100644
index 0000000000..720a4c0923
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/mempcpy.S
@@ -0,0 +1,8 @@
+#define USE_AS_MEMPCPY
+#define memcpy __mempcpy
+#define __memcpy_chk __mempcpy_chk
+#include <sysdeps/i386/i586/memcpy.S>
+
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/i586/memset.S b/REORG.TODO/sysdeps/i386/i586/memset.S
new file mode 100644
index 0000000000..4f8f1bcf94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memset.S
@@ -0,0 +1,121 @@
+/* memset/bzero -- set memory area to CH/0
+ Highly optimized version for ix86, x>=5.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Torbjorn Granlund, <tege@matematik.su.se>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#define RTN PARMS
+#define DEST RTN
+#ifdef USE_AS_BZERO
+# define LEN DEST+4
+#else
+# define CHR DEST+4
+# define LEN CHR+4
+#endif
+
+ .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk)
+#endif
+ENTRY (memset)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+
+ movl DEST(%esp), %edi
+ cfi_rel_offset (edi, 0)
+ movl LEN(%esp), %edx
+#ifdef USE_AS_BZERO
+ xorl %eax, %eax /* we fill with 0 */
+#else
+ movb CHR(%esp), %al
+ movb %al, %ah
+ movl %eax, %ecx
+ shll $16, %eax
+ movw %cx, %ax
+#endif
+ cld
+
+/* If less than 36 bytes to write, skip tricky code (it wouldn't work). */
+ cmpl $36, %edx
+ movl %edx, %ecx /* needed when branch is taken! */
+ jl L(2)
+
+/* First write 0-3 bytes to make the pointer 32-bit aligned. */
+ movl %edi, %ecx /* Copy ptr to ecx... */
+ negl %ecx /* ...and negate that and... */
+ andl $3, %ecx /* ...mask to get byte count. */
+ subl %ecx, %edx /* adjust global byte count */
+ rep
+ stosb
+
+ subl $32, %edx /* offset count for unrolled loop */
+ movl (%edi), %ecx /* Fetch destination cache line */
+
+ .align 2, 0x90 /* supply 0x90 for broken assemblers */
+L(1): movl 28(%edi), %ecx /* allocate cache line for destination */
+ subl $32, %edx /* decr loop count */
+ movl %eax, 0(%edi) /* store words pairwise */
+ movl %eax, 4(%edi)
+ movl %eax, 8(%edi)
+ movl %eax, 12(%edi)
+ movl %eax, 16(%edi)
+ movl %eax, 20(%edi)
+ movl %eax, 24(%edi)
+ movl %eax, 28(%edi)
+ leal 32(%edi), %edi /* update destination pointer */
+ jge L(1)
+
+ leal 32(%edx), %ecx /* reset offset count */
+
+/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */
+L(2): shrl $2, %ecx /* convert byte count to longword count */
+ rep
+ stosl
+
+/* Finally write the last 0-3 bytes. */
+ movl %edx, %ecx
+ andl $3, %ecx
+ rep
+ stosb
+
+#ifndef USE_AS_BZERO
+ /* Load result (only if used as memset). */
+ movl DEST(%esp), %eax /* start address of destination is result */
+#endif
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (memset)
+libc_hidden_builtin_def (memset)
+
+#if defined SHARED && IS_IN (libc) && !defined __memset_chk \
+ && !defined USE_AS_BZERO
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+ .section .gnu.warning.__memset_zero_constant_len_parameter
+ .string "memset used with constant zero length parameter; this could be due to transposed parameters"
+#endif
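
The byte replication at the top (movb/movb/shll/movw) builds the 32-bit fill pattern c|c|c|c; the rest is align, store words, finish with bytes. A C sketch of the same phases (editorial name; the asm additionally unrolls the word stores into 32-byte cache-line blocks when at least 36 bytes remain):

    #include <stddef.h>
    #include <stdint.h>

    void *
    memset_i586_sketch (void *dstv, int ch, size_t len)
    {
      unsigned char *d = dstv;
      unsigned int pattern = (unsigned char) ch;
      pattern |= pattern << 8;               /* 0|0|c|c */
      pattern |= pattern << 16;              /* c|c|c|c */

      while (len && ((uintptr_t) d & 3))     /* 0-3 bytes to align */
        { *d++ = (unsigned char) ch; len--; }
      for (; len >= 4; len -= 4, d += 4)     /* whole words (rep stosl) */
        *(unsigned int *) d = pattern;
      while (len--)                          /* last 0-3 bytes */
        *d++ = (unsigned char) ch;
      return dstv;
    }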
diff --git a/REORG.TODO/sysdeps/i386/i586/memusage.h b/REORG.TODO/sysdeps/i386/i586/memusage.h
new file mode 100644
index 0000000000..c8170874d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memusage.h
@@ -0,0 +1 @@
+#include "../i686/memusage.h"
diff --git a/REORG.TODO/sysdeps/i386/i586/mul_1.S b/REORG.TODO/sysdeps/i386/i586/mul_1.S
new file mode 100644
index 0000000000..bd3a07de90
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/mul_1.S
@@ -0,0 +1,90 @@
+/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ the result in a second limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define SIZE S1+4
+#define S2LIMB SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+ .text
+ENTRY (__mpn_mul_1)
+
+ pushl %res_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %s1_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %s2_limb
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp), %res_ptr
+ cfi_rel_offset (res_ptr, 12)
+ movl S1(%esp), %s1_ptr
+ cfi_rel_offset (s1_ptr, 8)
+ movl SIZE(%esp), %size
+ movl S2LIMB(%esp), %s2_limb
+ cfi_rel_offset (s2_limb, 0)
+ leal (%res_ptr,%size,4), %res_ptr
+ leal (%s1_ptr,%size,4), %s1_ptr
+ negl %size
+ xorl %ebp, %ebp
+ cfi_rel_offset (ebp, 4)
+ ALIGN (3)
+
+L(oop): adcl $0, %ebp
+ movl (%s1_ptr,%size,4), %eax
+
+ mull %s2_limb
+
+ addl %eax, %ebp
+
+ movl %ebp, (%res_ptr,%size,4)
+ incl %size
+
+ movl %edx, %ebp
+ jnz L(oop)
+
+ adcl $0, %ebp
+ movl %ebp, %eax
+ popl %s2_limb
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s2_limb)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %s1_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s1_ptr)
+ popl %res_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (res_ptr)
+
+ ret
+#undef size
+END (__mpn_mul_1)
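
Same shape as the addmul_1 sketch earlier, minus the read of the destination (hedged C, assumed name):

    typedef unsigned int mp_limb_t;

    /* res[0..n-1] = s1[0..n-1] * limb; return the high limb.  */
    mp_limb_t
    mpn_mul_1_ref (mp_limb_t *res, const mp_limb_t *s1,
                   int n, mp_limb_t limb)
    {
      mp_limb_t carry = 0;
      for (int i = 0; i < n; i++)
        {
          unsigned long long p = (unsigned long long) s1[i] * limb + carry;
          res[i] = (mp_limb_t) p;
          carry = (mp_limb_t) (p >> 32);
        }
      return carry;
    }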
diff --git a/REORG.TODO/sysdeps/i386/i586/rshift.S b/REORG.TODO/sysdeps/i386/i586/rshift.S
new file mode 100644
index 0000000000..24c76ee0bb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/rshift.S
@@ -0,0 +1,255 @@
+/* Pentium optimized __mpn_rshift --
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S RES+4
+#define SIZE S+4
+#define CNT SIZE+4
+
+ .text
+ENTRY (__mpn_rshift)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebp, 0)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 12)
+ movl S(%esp),%esi
+ cfi_rel_offset (esi, 8)
+ movl SIZE(%esp),%ebx
+ cfi_rel_offset (ebx, 0)
+ movl CNT(%esp),%ecx
+
+/* We can use faster code for shift-by-1 under certain conditions. */
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%edi),%eax
+ cmpl %esi,%eax
+ jnc L(special) /* jump if res_ptr + 1 >= s_ptr */
+ leal (%edi,%ebx,4),%eax
+ cmpl %eax,%esi
+ jnc L(special) /* jump if s_ptr >= res_ptr + size */
+
+L(normal):
+ movl (%esi),%edx
+ addl $4,%esi
+ xorl %eax,%eax
+ shrdl %cl,%edx,%eax /* compute carry limb */
+ pushl %eax /* push carry limb onto stack */
+ cfi_adjust_cfa_offset (4)
+
+ decl %ebx
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ shrl $3,%ebx
+ jz L(end)
+
+ movl (%edi),%eax /* fetch destination cache line */
+
+ ALIGN (2)
+L(oop): movl 28(%edi),%eax /* fetch destination cache line */
+ movl %edx,%ebp
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ shrdl %cl,%eax,%ebp
+ shrdl %cl,%edx,%eax
+ movl %ebp,(%edi)
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebp
+ movl 12(%esi),%eax
+ shrdl %cl,%ebp,%edx
+ shrdl %cl,%eax,%ebp
+ movl %edx,8(%edi)
+ movl %ebp,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebp
+ shrdl %cl,%edx,%eax
+ shrdl %cl,%ebp,%edx
+ movl %eax,16(%edi)
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ shrdl %cl,%eax,%ebp
+ shrdl %cl,%edx,%eax
+ movl %ebp,24(%edi)
+ movl %eax,28(%edi)
+
+ addl $32,%esi
+ addl $32,%edi
+ decl %ebx
+ jnz L(oop)
+
+L(end): popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ andl $7,%ebx
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shrdl %cl,%eax,%edx /* compute result limb */
+ movl %edx,(%edi)
+ movl %eax,%edx
+ addl $4,%esi
+ addl $4,%edi
+ decl %ebx
+ jnz L(oop2)
+
+L(end2):
+ shrl %cl,%edx /* compute most significant limb */
+ movl %edx,(%edi) /* store it */
+
+ popl %eax /* pop carry limb */
+ cfi_adjust_cfa_offset (-4)
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+/* This shift-by-1 path loops from the most significant end of the
+   arrays, which is permissible only when source and destination do
+   not overlap the wrong way; since the function is documented to
+   work for overlapping operands, the checks above guard entry here.
+*/
+
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (edi, 12)
+ cfi_rel_offset (esi, 8)
+ cfi_rel_offset (ebp, 4)
+ cfi_rel_offset (ebx, 0)
+L(special):
+ leal -4(%edi,%ebx,4),%edi
+ leal -4(%esi,%ebx,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+
+ decl %ebx
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ shrl $3,%ebx
+
+ shrl $1,%edx
+ incl %ebx
+ decl %ebx
+ jz L(Lend)
+
+ movl (%edi),%eax /* fetch destination cache line */
+
+ ALIGN (2)
+L(Loop):
+ movl -28(%edi),%eax /* fetch destination cache line */
+ movl %edx,%ebp
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ rcrl $1,%eax
+ movl %ebp,(%edi)
+ rcrl $1,%edx
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebp
+ movl -12(%esi),%eax
+ rcrl $1,%ebp
+ movl %edx,-8(%edi)
+ rcrl $1,%eax
+ movl %ebp,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebp
+ rcrl $1,%edx
+ movl %eax,-16(%edi)
+ rcrl $1,%ebp
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ rcrl $1,%eax
+ movl %ebp,-24(%edi)
+ rcrl $1,%edx
+ movl %eax,-28(%edi)
+
+ leal -32(%esi),%esi /* use leal not to clobber carry */
+ leal -32(%edi),%edi
+ decl %ebx
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ sbbl %eax,%eax /* save carry in %eax */
+ andl $7,%ebx
+ jz L(Lend2)
+ addl %eax,%eax /* restore carry from eax */
+L(Loop2):
+ movl %edx,%ebp
+ movl (%esi),%edx
+ rcrl $1,%edx
+ movl %ebp,(%edi)
+
+ leal -4(%esi),%esi /* use leal not to clobber carry */
+ leal -4(%edi),%edi
+ decl %ebx
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax /* restore carry from eax */
+L(L1): movl %edx,(%edi) /* store last limb */
+
+ movl $0,%eax
+ rcrl $1,%eax
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_rshift)
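
The mirror image of the lshift sketch: here the L(normal) path walks up from the least significant limb (hedged C, assumed name):

    typedef unsigned int mp_limb_t;

    /* Shift the n-limb vector src right by cnt bits (0 < cnt < 32)
       and return the bits shifted out of the least significant limb,
       left-justified in the returned limb.  */
    mp_limb_t
    mpn_rshift_ref (mp_limb_t *res, const mp_limb_t *src,
                    int n, unsigned int cnt)
    {
      mp_limb_t low = src[0];
      mp_limb_t retval = low << (32 - cnt);    /* carry limb */
      for (int i = 0; i < n - 1; i++)
        {
          mp_limb_t high = src[i + 1];
          res[i] = (low >> cnt) | (high << (32 - cnt));   /* shrdl */
          low = high;
        }
      res[n - 1] = low >> cnt;
      return retval;
    }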
diff --git a/REORG.TODO/sysdeps/i386/i586/stpcpy.S b/REORG.TODO/sysdeps/i386/i586/stpcpy.S
new file mode 100644
index 0000000000..8691efd01c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/stpcpy.S
@@ -0,0 +1,8 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+
+#include <sysdeps/i386/i586/strcpy.S>
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/i586/strchr.S b/REORG.TODO/sysdeps/i386/i586/strchr.S
new file mode 100644
index 0000000000..02f66b8f72
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strchr.S
@@ -0,0 +1,348 @@
+/* Find character CH in a NUL terminated string.
+   Highly optimized version for ix86, x>=5.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* This version is especially optimized for the i586 (and following?)
+ processors. This is mainly done by using the two pipelines. The
+   version optimized for i486 is weak in this aspect because getting
+   as much parallelism requires executing some *more* instructions.
+
+ The code below is structured to reflect the pairing of the instructions
+ as *I think* it is. I have no processor data book to verify this.
+ If you find something you think is incorrect let me know. */
+
+
+/* The magic value which is used throughout in the whole code. */
+#define magic 0xfefefeff
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RTN PARMS
+#define STR RTN
+#define CHR STR+4
+
+ .text
+ENTRY (strchr)
+
+ pushl %edi /* Save callee-safe registers. */
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+
+ movl STR(%esp), %eax
+ movl CHR(%esp), %edx
+
+ movl %eax, %edi /* duplicate string pointer for later */
+ cfi_rel_offset (edi, 12)
+ xorl %ecx, %ecx /* clear %ecx */
+
+ /* At the moment %edx contains C. What we need for the
+ algorithm is C in all bytes of the dword. Avoid
+ operations on 16-bit words because these require a
+ prefix byte (and one more cycle). */
+ movb %dl, %dh /* now it is 0|0|c|c */
+ movb %dl, %cl /* we construct the lower half in %ecx */
+
+ shll $16, %edx /* now %edx is c|c|0|0 */
+ movb %cl, %ch /* now %ecx is 0|0|c|c */
+
+ orl %ecx, %edx /* and finally c|c|c|c */
+ andl $3, %edi /* mask alignment bits */
+
+ jz L(11) /* alignment is 0 => start loop */
+
+ movb %dl, %cl /* 0 is needed below */
+ jp L(0) /* exactly two bits set */
+
+ xorb (%eax), %cl /* is byte the one we are looking for? */
+ jz L(out) /* yes => return pointer */
+
+ xorb %dl, %cl /* load single byte and test for NUL */
+ je L(3) /* yes => return NULL */
+
+ movb 1(%eax), %cl /* load single byte */
+ incl %eax
+
+ cmpb %cl, %dl /* is byte == C? */
+ je L(out) /* aligned => return pointer */
+
+ cmpb $0, %cl /* is byte NUL? */
+ je L(3) /* yes => return NULL */
+
+ incl %eax
+ decl %edi
+
+ jne L(11)
+
+L(0): movb (%eax), %cl /* load single byte */
+
+ cmpb %cl, %dl /* is byte == C? */
+ je L(out) /* aligned => return pointer */
+
+ cmpb $0, %cl /* is byte NUL? */
+ je L(3) /* yes => return NULL */
+
+ incl %eax /* increment pointer */
+
+ cfi_rel_offset (esi, 8)
+ cfi_rel_offset (ebx, 4)
+ cfi_rel_offset (ebp, 0)
+
+ /* The following code is the preparation for the loop. The
+ four instructions up to `L1' will not be executed in the loop
+ because the same code is found at the end of the loop, but
+ there it is executed in parallel with other instructions. */
+L(11): movl (%eax), %ecx
+ movl $magic, %ebp
+
+ movl $magic, %edi
+ addl %ecx, %ebp
+
+ /* The main loop: it looks complex and indeed it is. I would
+ love to say `it was hard to write, so it should be hard to
+ read' but I will give some more hints. To fully understand
+ this code you should first take a look at the i486 version.
+ The basic algorithm is the same, but here the code is organized
+ in a way that permits using both pipelines all the time.
+
+ I tried to make it a bit more understandable by indenting
+ the code according to stage in the algorithm. It goes as
+ follows:
+ check for 0 in 1st word
+ check for C in 1st word
+ check for 0 in 2nd word
+ check for C in 2nd word
+ check for 0 in 3rd word
+ check for C in 3rd word
+ check for 0 in 4th word
+ check for C in 4th word
+
+ Please note that doing the test for NUL before the test for
+ C allows us to overlap the test for 0 in the next word with
+ the test for C. */
+
+L(1): xorl %ecx, %ebp /* (word^magic) */
+ addl %ecx, %edi /* add magic word */
+
+ leal 4(%eax), %eax /* increment pointer */
+ jnc L(4) /* previous addl caused overflow? */
+
+ movl %ecx, %ebx /* duplicate original word */
+ orl $magic, %ebp /* (word^magic)|magic */
+
+ addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */
+ jne L(4) /* yes => we found word with NUL */
+
+ movl $magic, %esi /* load magic value */
+ xorl %edx, %ebx /* clear words which are C */
+
+ movl (%eax), %ecx
+ addl %ebx, %esi /* (word+magic) */
+
+ movl $magic, %edi
+ jnc L(5) /* previous addl caused overflow? */
+
+ movl %edi, %ebp
+ xorl %ebx, %esi /* (word+magic)^word */
+
+ addl %ecx, %ebp
+ orl $magic, %esi /* ((word+magic)^word)|magic */
+
+ addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/
+ jne L(5) /* yes => we found word with C */
+
+ xorl %ecx, %ebp
+ addl %ecx, %edi
+
+ leal 4(%eax), %eax
+ jnc L(4)
+
+ movl %ecx, %ebx
+ orl $magic, %ebp
+
+ addl $1, %ebp
+ jne L(4)
+
+ movl $magic, %esi
+ xorl %edx, %ebx
+
+ movl (%eax), %ecx
+ addl %ebx, %esi
+
+ movl $magic, %edi
+ jnc L(5)
+
+ movl %edi, %ebp
+ xorl %ebx, %esi
+
+ addl %ecx, %ebp
+ orl $magic, %esi
+
+ addl $1, %esi
+ jne L(5)
+
+ xorl %ecx, %ebp
+ addl %ecx, %edi
+
+ leal 4(%eax), %eax
+ jnc L(4)
+
+ movl %ecx, %ebx
+ orl $magic, %ebp
+
+ addl $1, %ebp
+ jne L(4)
+
+ movl $magic, %esi
+ xorl %edx, %ebx
+
+ movl (%eax), %ecx
+ addl %ebx, %esi
+
+ movl $magic, %edi
+ jnc L(5)
+
+ movl %edi, %ebp
+ xorl %ebx, %esi
+
+ addl %ecx, %ebp
+ orl $magic, %esi
+
+ addl $1, %esi
+ jne L(5)
+
+ xorl %ecx, %ebp
+ addl %ecx, %edi
+
+ leal 4(%eax), %eax
+ jnc L(4)
+
+ movl %ecx, %ebx
+ orl $magic, %ebp
+
+ addl $1, %ebp
+ jne L(4)
+
+ movl $magic, %esi
+ xorl %edx, %ebx
+
+ movl (%eax), %ecx
+ addl %ebx, %esi
+
+ movl $magic, %edi
+ jnc L(5)
+
+ movl %edi, %ebp
+ xorl %ebx, %esi
+
+ addl %ecx, %ebp
+ orl $magic, %esi
+
+ addl $1, %esi
+
+ je L(1)
+
+ /* We know there is no NUL byte but a C byte in the word.
+ %ebx holds the word XORed with c|c|c|c, so the byte that
+ matched C is NUL in %ebx. */
+L(5): subl $4, %eax /* adjust pointer */
+ testb %bl, %bl /* first byte == C? */
+
+ jz L(out) /* yes => return pointer */
+
+ incl %eax /* increment pointer */
+ testb %bh, %bh /* second byte == C? */
+
+ jz L(out) /* yes => return pointer */
+
+ shrl $16, %ebx /* make upper bytes accessible */
+ incl %eax /* increment pointer */
+
+ cmp $0, %bl /* third byte == C */
+ je L(out) /* yes => return pointer */
+
+ incl %eax /* increment pointer */
+
+L(out): popl %ebp /* restore saved registers */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (edi, 12)
+ cfi_rel_offset (esi, 8)
+ cfi_rel_offset (ebx, 4)
+ cfi_rel_offset (ebp, 0)
+ /* We know there is a NUL byte in the word. But we have to test
+ whether there is an C byte before it in the word. */
+L(4): subl $4, %eax /* adjust pointer */
+ cmpb %dl, %cl /* first byte == C? */
+
+ je L(out) /* yes => return pointer */
+
+ cmpb $0, %cl /* first byte == NUL? */
+ je L(3) /* yes => return NULL */
+
+ incl %eax /* increment pointer */
+
+ cmpb %dl, %ch /* second byte == C? */
+ je L(out) /* yes => return pointer */
+
+ cmpb $0, %ch /* second byte == NUL? */
+ je L(3) /* yes => return NULL */
+
+ shrl $16, %ecx /* make upper bytes accessible */
+ incl %eax /* increment pointer */
+
+ cmpb %dl, %cl /* third byte == C? */
+ je L(out) /* yes => return pointer */
+
+ cmpb $0, %cl /* third byte == NUL? */
+ je L(3) /* yes => return NULL */
+
+ incl %eax /* increment pointer */
+
+ /* The test for the fourth byte is necessary! */
+ cmpb %dl, %ch /* fourth byte == C? */
+ je L(out) /* yes => return pointer */
+
+L(3): xorl %eax, %eax
+ jmp L(out)
+END (strchr)
+
+#undef index
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
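
The magic-constant games above amount to a word-at-a-time scan that tests each 4-byte word first for a NUL byte and then for the target byte. A C sketch of the idea (editorial; it uses the better-known (w - 0x01010101) & ~w & 0x80808080 zero-byte test in place of the exact 0xfefefeff carry-hole arithmetic the asm performs):

    #include <stddef.h>
    #include <stdint.h>

    static uint32_t
    has_zero_byte (uint32_t w)
    {
      return (w - 0x01010101u) & ~w & 0x80808080u;
    }

    char *
    strchr_word_sketch (const char *s, int c_in)
    {
      unsigned char c = (unsigned char) c_in;
      uint32_t cccc = c * 0x01010101u;   /* c|c|c|c, as built in %edx */

      while ((uintptr_t) s & 3)          /* unaligned head, bytewise */
        {
          if (*s == (char) c)
            return (char *) s;
          if (*s == '\0')
            return NULL;
          s++;
        }
      for (;; s += 4)
        {
          uint32_t w = *(const uint32_t *) s;
          if (has_zero_byte (w) || has_zero_byte (w ^ cccc))
            break;                       /* NUL or C in this word */
        }
      for (;; s++)                       /* pinpoint it, like L(4)/L(5) */
        {
          if (*s == (char) c)
            return (char *) s;
          if (*s == '\0')
            return NULL;
        }
    }

Testing NUL before C in each word is what lets the asm overlap the next word's NUL test with the current word's C test across the two pipes.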
diff --git a/REORG.TODO/sysdeps/i386/i586/strcpy.S b/REORG.TODO/sysdeps/i386/i586/strcpy.S
new file mode 100644
index 0000000000..a444604f4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strcpy.S
@@ -0,0 +1,169 @@
+/* strcpy/stpcpy implementation for i586.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+12 /* space for 3 saved regs */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+
+#ifndef USE_AS_STPCPY
+# define STRCPY strcpy
+#endif
+
+#define magic 0xfefefeff
+
+ .text
+ENTRY (STRCPY)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl DEST(%esp), %edi
+ cfi_rel_offset (edi, 8)
+ movl SRC(%esp), %esi
+ cfi_rel_offset (esi, 4)
+
+ xorl %eax, %eax
+ leal -1(%esi), %ecx
+
+ movl $magic, %ebx
+ cfi_rel_offset (ebx, 0)
+ andl $3, %ecx
+
+#ifdef PIC
+ call 2f
+ cfi_adjust_cfa_offset (4)
+2: popl %edx
+ cfi_adjust_cfa_offset (-4)
+ /* 0xb is the distance between 2: and 1: but we avoid writing
+ 1f-2b because the assembler generates worse code. */
+ leal 0xb(%edx,%ecx,8), %ecx
+#else
+ leal 1f(,%ecx,8), %ecx
+#endif
+
+ jmp *%ecx
+
+ .align 8
+1:
+ orb (%esi), %al
+ jz L(end)
+ stosb
+ xorl %eax, %eax
+ incl %esi
+
+ orb (%esi), %al
+ jz L(end)
+ stosb
+ xorl %eax, %eax
+ incl %esi
+
+ orb (%esi), %al
+ jz L(end)
+ stosb
+ xorl %eax, %eax
+ incl %esi
+
+L(1): movl (%esi), %ecx
+ leal 4(%esi),%esi
+
+ subl %ecx, %eax
+ addl %ebx, %ecx
+
+ decl %eax
+ jnc L(3)
+
+ movl %ecx, %edx
+ xorl %ecx, %eax
+
+ subl %ebx, %edx
+ andl $~magic, %eax
+
+ jne L(4)
+
+ movl %edx, (%edi)
+ leal 4(%edi),%edi
+
+ jmp L(1)
+
+L(3): movl %ecx, %edx
+
+ subl %ebx, %edx
+
+L(4): movb %dl, (%edi)
+ testb %dl, %dl
+
+ movl %edx, %eax
+ jz L(end2)
+
+ shrl $16, %eax
+ movb %dh, 1(%edi)
+#ifdef USE_AS_STPCPY
+ addl $1, %edi
+#endif
+
+ cmpb $0, %dh
+ jz L(end2)
+
+#ifdef USE_AS_STPCPY
+ movb %al, 1(%edi)
+ addl $1, %edi
+
+ cmpb $0, %al
+ jz L(end2)
+
+ addl $1, %edi
+#else
+ movb %al, 2(%edi)
+ testb %al, %al
+
+ leal 3(%edi), %edi
+ jz L(end2)
+#endif
+
+L(end): movb %ah, (%edi)
+
+L(end2):
+#ifdef USE_AS_STPCPY
+ movl %edi, %eax
+#else
+ movl DEST(%esp), %eax
+#endif
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (STRCPY)
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
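
The same zero-byte test drives the copy: whole words are moved until one is found to contain the NUL, which is then finished bytewise. A hedged C outline (assumed name and test constants; the asm aligns the source via the computed jump and stores words to a possibly unaligned destination, which x86 permits though strict ISO C does not):

    #include <stdint.h>

    char *
    strcpy_word_sketch (char *dst, const char *src)
    {
      char *d = dst;
      while ((uintptr_t) src & 3)          /* align the source */
        if ((*d++ = *src++) == '\0')
          return dst;
      for (;;)
        {
          uint32_t w = *(const uint32_t *) src;
          if ((w - 0x01010101u) & ~w & 0x80808080u)
            break;                         /* NUL inside this word */
          *(uint32_t *) d = w;             /* whole-word store */
          d += 4;
          src += 4;
        }
      while ((*d++ = *src++) != '\0')      /* last 1-4 bytes */
        ;
      return dst;
    }

For stpcpy (see stpcpy.S above) the only change is the return value: a pointer to the stored NUL instead of dst.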
diff --git a/REORG.TODO/sysdeps/i386/i586/strlen.S b/REORG.TODO/sysdeps/i386/i586/strlen.S
new file mode 100644
index 0000000000..cfea2e020f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strlen.S
@@ -0,0 +1,182 @@
+/* strlen -- Compute length of NUL terminated string.
+ Highly optimized version for ix86, x>=5.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* This version is especially optimized for the i586 (and following?)
+ processors. This is mainly done by using the two pipelines. The
+ version optimized for i486 is weak in this aspect because getting
+ as much parallelism requires executing some *more* instructions.
+
+ The code below is structured to reflect the pairing of the instructions
+ as *I think* it is. I have no processor data book to verify this.
+ If you find something you think is incorrect let me know. */
+
+
+/* The magic value which is used throughout in the whole code. */
+#define magic 0xfefefeff
+
+#define PARMS 4 /* no space for saved regs */
+#define STR PARMS
+
+ .text
+ENTRY (strlen)
+
+ movl STR(%esp), %eax
+ movl $3, %edx /* load mask (= 3) */
+
+ andl %eax, %edx /* separate last two bits of address */
+
+ jz L(1) /* aligned => start loop */
+ jp L(0) /* exactly two bits set */
+
+ cmpb %dh, (%eax) /* is byte NUL? */
+ je L(2) /* yes => return */
+
+ incl %eax /* increment pointer */
+ cmpb %dh, (%eax) /* is byte NUL? */
+
+ je L(2) /* yes => return */
+
+ incl %eax /* increment pointer */
+ xorl $2, %edx
+
+ jz L(1)
+
+L(0): cmpb %dh, (%eax) /* is byte NUL? */
+ je L(2) /* yes => return */
+
+ incl %eax /* increment pointer */
+ xorl %edx, %edx /* We need %edx == 0 for later */
+
+ /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+ change any of the hole bits of LONGWORD.
+
+ 1) Is this safe? Will it catch all the zero bytes?
+ Suppose there is a byte with all zeros. Any carry bits
+ propagating from its left will fall into the hole at its
+ least significant bit and stop. Since there will be no
+ carry from its most significant bit, the LSB of the
+ byte to the left will be unchanged, and the zero will be
+ detected.
+
+ 2) Is this worthwhile? Will it ignore everything except
+ zero bytes? Suppose every byte of LONGWORD has a bit set
+ somewhere. There will be a carry into bit 8. If bit 8
+ is set, this will carry into bit 16. If bit 8 is clear,
+ one of bits 9-15 must be set, so there will be a carry
+ into bit 16. Similarly, there will be a carry into bit
+ 24. If one of bits 24-31 is set, there will be a carry
+ into bit 32 (=carry flag), so all of the hole bits will
+ be changed.
+
+ Note: %edx == 0 in any case here. */
+
+L(1):
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ addl $4, %eax /* adjust pointer for *next* word */
+
+ subl %ecx, %edx /* first step to negate word */
+ addl $magic, %ecx /* add magic word */
+
+ decl %edx /* complete negation of word */
+ jnc L(3) /* previous addl caused overflow? */
+
+ xorl %ecx, %edx /* (word+magic)^word */
+
+ andl $~magic, %edx /* any of the carry flags set? */
+
+ jne L(3) /* yes => determine byte */
+
+
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ addl $4, %eax /* adjust pointer for *next* word */
+
+ subl %ecx, %edx /* first step to negate word */
+ addl $magic, %ecx /* add magic word */
+
+ decl %edx /* complete negation of word */
+ jnc L(3) /* previous addl caused overflow? */
+
+ xorl %ecx, %edx /* (word+magic)^word */
+
+ andl $~magic, %edx /* any of the carry flags set? */
+
+ jne L(3) /* yes => determine byte */
+
+
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ addl $4, %eax /* adjust pointer for *next* word */
+
+ subl %ecx, %edx /* first step to negate word */
+ addl $magic, %ecx /* add magic word */
+
+ decl %edx /* complete negation of word */
+ jnc L(3) /* previous addl caused overflow? */
+
+ xorl %ecx, %edx /* (word+magic)^word */
+
+ andl $~magic, %edx /* any of the carry flags set? */
+
+ jne L(3) /* yes => determine byte */
+
+
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ addl $4, %eax /* adjust pointer for *next* word */
+
+ subl %ecx, %edx /* first step to negate word */
+ addl $magic, %ecx /* add magic word */
+
+ decl %edx /* complete negation of word */
+ jnc L(3) /* previous addl caused overflow? */
+
+ xorl %ecx, %edx /* (word+magic)^word */
+
+ andl $~magic, %edx /* any of the carry flags set? */
+
+ je L(1) /* no => start loop again */
+
+
+L(3): subl $4, %eax /* correct too early pointer increment */
+ subl $magic, %ecx
+
+ cmpb $0, %cl /* lowest byte NUL? */
+ jz L(2) /* yes => return */
+
+ inc %eax /* increment pointer */
+ testb %ch, %ch /* second byte NUL? */
+
+ jz L(2) /* yes => return */
+
+ shrl $16, %ecx /* make upper bytes accessible */
+ incl %eax /* increment pointer */
+
+ cmpb $0, %cl /* is third byte NUL? */
+ jz L(2) /* yes => return */
+
+ incl %eax /* increment pointer */
+
+L(2): subl STR(%esp), %eax /* now compute the length as difference
+ between start and terminating NUL
+ character */
+ ret
+END (strlen)
+libc_hidden_builtin_def (strlen)
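
In C, the hole-bit argument in the long comment above boils down to this word-at-a-time scan (an editorial sketch; again using the common (w - 0x01010101) & ~w & 0x80808080 formulation in place of the asm's negation-plus-magic sequence, which detects the same zero bytes):

    #include <stddef.h>
    #include <stdint.h>

    size_t
    strlen_word_sketch (const char *str)
    {
      const char *p = str;
      while ((uintptr_t) p & 3)            /* align to a word */
        if (*p++ == '\0')
          return p - 1 - str;
      for (;;)
        {
          uint32_t w = *(const uint32_t *) p;
          if ((w - 0x01010101u) & ~w & 0x80808080u)
            break;                         /* some byte is zero */
          p += 4;
        }
      while (*p)                           /* locate it, like L(3) */
        p++;
      return p - str;
    }

Reading a whole aligned word may touch bytes past the NUL; that is safe here because an aligned 4-byte load can never cross a page boundary.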
diff --git a/REORG.TODO/sysdeps/i386/i586/sub_n.S b/REORG.TODO/sysdeps/i386/i586/sub_n.S
new file mode 100644
index 0000000000..21b5a2742c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/sub_n.S
@@ -0,0 +1,143 @@
+/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+ and store difference in a third limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define S2 S1+4
+#define SIZE S2+4
+
+ .text
+ENTRY (__mpn_sub_n)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 12)
+ movl S1(%esp),%esi
+ cfi_rel_offset (esi, 8)
+ movl S2(%esp),%ebx
+ cfi_rel_offset (ebx, 0)
+ movl SIZE(%esp),%ecx
+ movl (%ebx),%ebp
+ cfi_rel_offset (ebp, 4)
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx /* zero carry flag */
+ jz L(end)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
+
+ ALIGN (3)
+L(oop): movl 28(%edi),%eax /* fetch destination cache line */
+ leal 32(%edi),%edi
+
+L(1): movl (%esi),%eax
+ movl 4(%esi),%edx
+ sbbl %ebp,%eax
+ movl 4(%ebx),%ebp
+ sbbl %ebp,%edx
+ movl 8(%ebx),%ebp
+ movl %eax,-32(%edi)
+ movl %edx,-28(%edi)
+
+L(2): movl 8(%esi),%eax
+ movl 12(%esi),%edx
+ sbbl %ebp,%eax
+ movl 12(%ebx),%ebp
+ sbbl %ebp,%edx
+ movl 16(%ebx),%ebp
+ movl %eax,-24(%edi)
+ movl %edx,-20(%edi)
+
+L(3): movl 16(%esi),%eax
+ movl 20(%esi),%edx
+ sbbl %ebp,%eax
+ movl 20(%ebx),%ebp
+ sbbl %ebp,%edx
+ movl 24(%ebx),%ebp
+ movl %eax,-16(%edi)
+ movl %edx,-12(%edi)
+
+L(4): movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ sbbl %ebp,%eax
+ movl 28(%ebx),%ebp
+ sbbl %ebp,%edx
+ movl 32(%ebx),%ebp
+ movl %eax,-8(%edi)
+ movl %edx,-4(%edi)
+
+ leal 32(%esi),%esi
+ leal 32(%ebx),%ebx
+ decl %ecx
+ jnz L(oop)
+
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+L(end):
+ decl %edx /* test %edx w/o clobbering carry */
+ js L(end2)
+ incl %edx
+L(oop2):
+ leal 4(%edi),%edi
+ movl (%esi),%eax
+ sbbl %ebp,%eax
+ movl 4(%ebx),%ebp
+ movl %eax,-4(%edi)
+ leal 4(%esi),%esi
+ leal 4(%ebx),%ebx
+ decl %edx
+ jnz L(oop2)
+L(end2):
+ movl (%esi),%eax
+ sbbl %ebp,%eax
+ movl %eax,(%edi)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_sub_n)
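
The mirror of the __mpn_add_n sketch earlier, with sbbl's borrow chain in place of adcl's carry (hedged C, assumed name):

    typedef unsigned int mp_limb_t;

    /* res = s1 - s2 over n limbs (n > 0); return the final borrow.  */
    mp_limb_t
    mpn_sub_n_ref (mp_limb_t *res, const mp_limb_t *s1,
                   const mp_limb_t *s2, int n)
    {
      mp_limb_t borrow = 0;
      for (int i = 0; i < n; i++)
        {
          mp_limb_t a = s1[i], b = s2[i];
          res[i] = a - b - borrow;
          /* Borrow out: with borrow-in 1 we underflow iff a <= b,
             with borrow-in 0 iff a < b.  */
          borrow = borrow ? (a <= b) : (a < b);
        }
      return borrow;
    }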
diff --git a/REORG.TODO/sysdeps/i386/i586/submul_1.S b/REORG.TODO/sysdeps/i386/i586/submul_1.S
new file mode 100644
index 0000000000..5e5e121ca2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/submul_1.S
@@ -0,0 +1,94 @@
+/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+ the result from a second limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define SIZE S1+4
+#define S2LIMB SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+ .text
+ENTRY (__mpn_submul_1)
+
+ pushl %res_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %s1_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %s2_limb
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp), %res_ptr
+ cfi_rel_offset (res_ptr, 12)
+ movl S1(%esp), %s1_ptr
+ cfi_rel_offset (s1_ptr, 8)
+ movl SIZE(%esp), %size
+ movl S2LIMB(%esp), %s2_limb
+ cfi_rel_offset (s2_limb, 0)
+ leal (%res_ptr,%size,4), %res_ptr
+ leal (%s1_ptr,%size,4), %s1_ptr
+ negl %size
+ xorl %ebp, %ebp
+ cfi_rel_offset (ebp, 4)
+ ALIGN (3)
+
+L(oop): adcl $0, %ebp
+ movl (%s1_ptr,%size,4), %eax
+
+ mull %s2_limb
+
+ addl %ebp, %eax
+ movl (%res_ptr,%size,4), %ebp
+
+ adcl $0, %edx
+ subl %eax, %ebp
+
+ movl %ebp, (%res_ptr,%size,4)
+ incl %size
+
+ movl %edx, %ebp
+ jnz L(oop)
+
+ adcl $0, %ebp
+ movl %ebp, %eax
+ popl %s2_limb
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s2_limb)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %s1_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s1_ptr)
+ popl %res_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (res_ptr)
+
+ ret
+#undef size
+END (__mpn_submul_1)
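
And the subtractive counterpart of the addmul_1 sketch (hedged C, assumed name; the returned limb is the high half of the product plus the propagated borrow):

    typedef unsigned int mp_limb_t;

    /* res[0..n-1] -= s1[0..n-1] * limb; return the high limb.  */
    mp_limb_t
    mpn_submul_1_ref (mp_limb_t *res, const mp_limb_t *s1,
                      int n, mp_limb_t limb)
    {
      mp_limb_t carry = 0;
      for (int i = 0; i < n; i++)
        {
          unsigned long long p = (unsigned long long) s1[i] * limb + carry;
          mp_limb_t lo = (mp_limb_t) p;
          mp_limb_t r = res[i];
          res[i] = r - lo;
          carry = (mp_limb_t) (p >> 32) + (r < lo);
        }
      return carry;
    }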