Diffstat (limited to 'REORG.TODO/sysdeps/sparc/sparc64/multiarch')
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/Makefile  21
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n-vis3.S  67
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n.S  56
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1-vis3.S  87
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1.S  56
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c  75
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-block.c  29
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-crop.S  110
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S  347
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S  498
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara4.S  332
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S  325
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy.S  167
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara1.S  177
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara4.S  124
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset.S  124
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1-vis3.S  73
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1.S  56
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memcpy.c  1
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memset.c  1
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-block.c  32
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-crop.S  101
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-block.c  32
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-crop.S  131
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n-vis3.S  71
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n.S  56
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1-vis3.S  87
-rw-r--r--  REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1.S  56
28 files changed, 3292 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/Makefile b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/Makefile
new file mode 100644
index 0000000000..55b757f9ad
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/Makefile
@@ -0,0 +1,21 @@
+ifeq ($(subdir),crypt)
+libcrypt-sysdep_routines += md5-crop sha256-crop sha512-crop
+endif
+
+ifeq ($(subdir),locale)
+localedef-aux += md5-crop
+endif
+
+ifeq ($(subdir),string)
+sysdep_routines += memcpy-ultra3 memcpy-niagara1 memcpy-niagara2 \
+ memset-niagara1 memcpy-niagara4 memset-niagara4
+endif
+
+ifeq ($(subdir),stdlib)
+sysdep_routines += mul_1-vis3 addmul_1-vis3 submul_1-vis3 add_n-vis3 sub_n-vis3
+endif
+
+ifeq ($(subdir),math)
+gmp-sysdep_routines = mul_1-vis3 addmul_1-vis3 submul_1-vis3 add_n-vis3 \
+ sub_n-vis3
+endif
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n-vis3.S
new file mode 100644
index 0000000000..c038bcbd6e
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n-vis3.S
@@ -0,0 +1,67 @@
+! SPARC v9 64-bit VIS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+! store sum in a third limb vector.
+!
+! Copyright (C) 2013-2017 Free Software Foundation, Inc.
+! This file is part of the GNU C Library.
+! Contributed by David S. Miller <davem@davemloft.net>
+!
+! The GNU C Library is free software; you can redistribute it and/or
+! modify it under the terms of the GNU Lesser General Public
+! License as published by the Free Software Foundation; either
+! version 2.1 of the License, or (at your option) any later version.
+!
+! The GNU C Library is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+! Lesser General Public License for more details.
+!
+! You should have received a copy of the GNU Lesser General Public
+! License along with the GNU C Library; if not, see
+! <http://www.gnu.org/licenses/>.
+
+#include <sysdep.h>
+
+#define res_ptr %o0
+#define s1_ptr %o1
+#define s2_ptr %o2
+#define sz %o3
+#define tmp1 %g1
+#define tmp2 %g2
+#define tmp3 %g3
+#define tmp4 %o4
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ENTRY(__mpn_add_n_vis3)
+ subcc sz, 1, sz
+ be .Lfinal_limb
+ cmp %g0, 0
+
+.Lloop:
+ ldx [s2_ptr + 0x00], tmp1
+ add s2_ptr, 0x10, s2_ptr
+ ldx [s1_ptr + 0x00], tmp2
+ add s1_ptr, 0x10, s1_ptr
+ ldx [s2_ptr - 0x08], tmp3
+ add res_ptr, 0x10, res_ptr
+ ldx [s1_ptr - 0x08], tmp4
+ sub sz, 2, sz
+ addxccc tmp1, tmp2, tmp1
+ stx tmp1, [res_ptr - 0x10]
+ addxccc tmp3, tmp4, tmp3
+ brgz sz, .Lloop
+ stx tmp3, [res_ptr - 0x08]
+
+ brlz,pt sz, .Lfinish
+ nop
+
+.Lfinal_limb:
+ ldx [s2_ptr + 0x00], tmp1
+ ldx [s1_ptr + 0x00], tmp2
+ addxccc tmp1, tmp2, tmp1
+ stx tmp1, [res_ptr + 0x00]
+
+.Lfinish:
+ retl
+ addxc %g0, %g0, %o0
+END(__mpn_add_n_vis3)
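For reference, the operation this routine implements, GMP-style addition of
two n-limb vectors that returns the final carry, can be modeled in C as the
sketch below.  The assembly keeps the running carry in the condition codes
via addxccc (VIS3 add-extended-with-carry), where the model uses an explicit
variable; the names here are illustrative, not the glibc sources.

#include <stdint.h>
#include <stddef.h>

/* Illustrative model of __mpn_add_n semantics: rp[] = s1[] + s2[],
   returning the final carry (0 or 1).  */
static uint64_t
mpn_add_n_model (uint64_t *rp, const uint64_t *s1,
                 const uint64_t *s2, size_t n)
{
  uint64_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t ab = s1[i] + s2[i];
      uint64_t c1 = ab < s1[i];        /* overflow in s1 + s2 */
      rp[i] = ab + carry;
      carry = c1 | (rp[i] < ab);       /* overflow adding the old carry */
    }
  return carry;
}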
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n.S
new file mode 100644
index 0000000000..9ffaf7865b
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/add_n.S
@@ -0,0 +1,56 @@
+/* Multiple versions of add_n
+
+ Copyright (C) 2013-2017 Free Software Foundation, Inc.
+ Contributed by David S. Miller (davem@davemloft.net)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(__mpn_add_n)
+ .type __mpn_add_n, @gnu_indirect_function
+# ifdef SHARED
+ SETUP_PIC_REG_LEAF(o3, o5)
+# endif
+ set HWCAP_SPARC_VIS3, %o1
+ andcc %o0, %o1, %g0
+ be 1f
+ nop
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_add_n_vis3), %o1
+ xor %o1, %gdop_lox10(__mpn_add_n_vis3), %o1
+# else
+ set __mpn_add_n_vis3, %o1
+# endif
+ ba 10f
+ nop
+1:
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_add_n_generic), %o1
+ xor %o1, %gdop_lox10(__mpn_add_n_generic), %o1
+# else
+ set __mpn_add_n_generic, %o1
+# endif
+10:
+# ifdef SHARED
+ add %o3, %o1, %o1
+# endif
+ retl
+ mov %o1, %o0
+END(__mpn_add_n)
+
+#define __mpn_add_n __mpn_add_n_generic
+#include "../add_n.S"
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1-vis3.S
new file mode 100644
index 0000000000..64671f5079
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1-vis3.S
@@ -0,0 +1,87 @@
+! SPARC v9 64-bit VIS3 __mpn_addmul_1 -- Multiply a limb vector with a
+! limb and add the result to a second limb vector.
+!
+! Copyright (C) 2013-2017 Free Software Foundation, Inc.
+! This file is part of the GNU C Library.
+! Contributed by David S. Miller <davem@davemloft.net>
+!
+! The GNU C Library is free software; you can redistribute it and/or
+! modify it under the terms of the GNU Lesser General Public
+! License as published by the Free Software Foundation; either
+! version 2.1 of the License, or (at your option) any later version.
+!
+! The GNU C Library is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+! Lesser General Public License for more details.
+!
+! You should have received a copy of the GNU Lesser General Public
+! License along with the GNU C Library; if not, see
+! <http://www.gnu.org/licenses/>.
+
+#include <sysdep.h>
+
+#define res_ptr %i0
+#define s1_ptr %i1
+#define sz %i2
+#define s2_limb %i3
+#define carry %o5
+#define tmp1 %g1
+#define tmp2 %g2
+#define tmp3 %g3
+#define tmp4 %o4
+#define tmp5 %l0
+#define tmp6 %l1
+#define tmp7 %l2
+#define tmp8 %l3
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ENTRY(__mpn_addmul_1_vis3)
+ save %sp, -176, %sp
+ subcc sz, 1, sz
+ be .Lfinal_limb
+ clr carry
+
+.Lloop:
+ ldx [s1_ptr + 0x00], tmp1
+ ldx [res_ptr + 0x00], tmp3
+ ldx [s1_ptr + 0x08], tmp2
+ ldx [res_ptr + 0x08], tmp4
+ mulx tmp1, s2_limb, tmp5
+ add s1_ptr, 0x10, s1_ptr
+ umulxhi tmp1, s2_limb, tmp6
+ add res_ptr, 0x10, res_ptr
+ mulx tmp2, s2_limb, tmp7
+ sub sz, 2, sz
+ umulxhi tmp2, s2_limb, tmp8
+ addcc carry, tmp5, tmp5
+ addxc %g0, tmp6, carry
+ addcc tmp3, tmp5, tmp5
+ addxc %g0, carry, carry
+ stx tmp5, [res_ptr - 0x10]
+ addcc carry, tmp7, tmp7
+ addxc %g0, tmp8, carry
+ addcc tmp4, tmp7, tmp7
+ addxc %g0, carry, carry
+ brgz sz, .Lloop
+ stx tmp7, [res_ptr - 0x08]
+
+ brlz,pt sz, .Lfinish
+ nop
+
+.Lfinal_limb:
+ ldx [s1_ptr + 0x00], tmp1
+ ldx [res_ptr + 0x00], tmp3
+ mulx tmp1, s2_limb, tmp5
+ umulxhi tmp1, s2_limb, tmp6
+ addcc carry, tmp5, tmp5
+ addxc %g0, tmp6, carry
+ addcc tmp3, tmp5, tmp5
+ addxc %g0, carry, carry
+ stx tmp5, [res_ptr + 0x00]
+
+.Lfinish:
+ jmpl %i7 + 8, %g0
+ restore carry, 0, %o0
+END(__mpn_addmul_1_vis3)
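The mulx/umulxhi pair above yields the low and high 64 bits of a 128-bit
product, and the addcc/addxc pairs fold in the previous carry and the
destination limb.  The same step can be sketched in C with unsigned
__int128 (an illustrative model, not the glibc code):

#include <stdint.h>
#include <stddef.h>

/* Model of __mpn_addmul_1: rp[] += s1[] * s2_limb, returning the
   final carry limb.  The 128-bit accumulator cannot overflow:
   (2^64-1)^2 + 2*(2^64-1) < 2^128.  */
static uint64_t
mpn_addmul_1_model (uint64_t *rp, const uint64_t *s1, size_t n,
                    uint64_t s2_limb)
{
  uint64_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 acc = (unsigned __int128) s1[i] * s2_limb;
      acc += rp[i];                     /* add the destination limb */
      acc += carry;                     /* fold in the running carry */
      rp[i] = (uint64_t) acc;           /* the mulx half */
      carry = (uint64_t) (acc >> 64);   /* the umulxhi half plus carries */
    }
  return carry;
}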
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1.S
new file mode 100644
index 0000000000..dcb1da184c
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/addmul_1.S
@@ -0,0 +1,56 @@
+/* Multiple versions of addmul_1
+
+ Copyright (C) 2013-2017 Free Software Foundation, Inc.
+ Contributed by David S. Miller (davem@davemloft.net)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(__mpn_addmul_1)
+ .type __mpn_addmul_1, @gnu_indirect_function
+# ifdef SHARED
+ SETUP_PIC_REG_LEAF(o3, o5)
+# endif
+ set HWCAP_SPARC_VIS3, %o1
+ andcc %o0, %o1, %g0
+ be 1f
+ nop
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_addmul_1_vis3), %o1
+ xor %o1, %gdop_lox10(__mpn_addmul_1_vis3), %o1
+# else
+ set __mpn_addmul_1_vis3, %o1
+# endif
+ ba 10f
+ nop
+1:
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_addmul_1_generic), %o1
+ xor %o1, %gdop_lox10(__mpn_addmul_1_generic), %o1
+# else
+ set __mpn_addmul_1_generic, %o1
+# endif
+10:
+# ifdef SHARED
+ add %o3, %o1, %o1
+# endif
+ retl
+ mov %o1, %o0
+END(__mpn_addmul_1)
+
+#define __mpn_addmul_1 __mpn_addmul_1_generic
+#include "../addmul_1.S"
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..a97bc455a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,75 @@
+/* Enumerate available IFUNC implementations of a function. sparc version.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ldsodefs.h>
+#include <sysdep.h>
+#include <ifunc-impl-list.h>
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+ NAME and return the number of valid entries. */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ size_t max)
+{
+ size_t i = 0;
+ int hwcap;
+
+ hwcap = GLRO(dl_hwcap);
+
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_CRYPTO,
+ __memcpy_niagara4)
+ IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_N2,
+ __memcpy_niagara2)
+ IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_BLKINIT,
+ __memcpy_niagara1)
+ IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_ULTRA3,
+ __memcpy_ultra3)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ultra1));
+
+ IFUNC_IMPL (i, name, mempcpy,
+ IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_CRYPTO,
+ __mempcpy_niagara4)
+ IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_N2,
+ __mempcpy_niagara2)
+ IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_BLKINIT,
+ __mempcpy_niagara1)
+ IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_ULTRA3,
+ __mempcpy_ultra3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ultra1));
+
+ IFUNC_IMPL (i, name, bzero,
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_CRYPTO,
+ __bzero_niagara4)
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_BLKINIT,
+ __bzero_niagara1)
+ IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ultra1));
+
+ IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_CRYPTO,
+ __memset_niagara4)
+ IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_BLKINIT,
+ __memset_niagara1)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ultra1));
+
+ return i;
+}
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-block.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-block.c
new file mode 100644
index 0000000000..7c1a3a368f
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-block.c
@@ -0,0 +1,29 @@
+#include <sparc-ifunc.h>
+
+#define __md5_process_block __md5_process_block_generic
+extern void __md5_process_block_generic (const void *buffer, size_t len,
+ struct md5_ctx *ctx);
+
+#include <crypt/md5-block.c>
+
+#undef __md5_process_block
+
+extern void __md5_process_block_crop (const void *buffer, size_t len,
+ struct md5_ctx *ctx);
+static bool cpu_supports_md5(int hwcap)
+{
+ unsigned long cfr;
+
+ if (!(hwcap & HWCAP_SPARC_CRYPTO))
+ return false;
+
+ __asm__ ("rd %%asr26, %0" : "=r" (cfr));
+ if (cfr & (1 << 4))
+ return true;
+
+ return false;
+}
+
+extern void __md5_process_block (const void *buffer, size_t len,
+ struct md5_ctx *ctx);
+sparc_libc_ifunc(__md5_process_block, cpu_supports_md5(hwcap) ? __md5_process_block_crop : __md5_process_block_generic);
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-crop.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-crop.S
new file mode 100644
index 0000000000..e8810da83e
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/md5-crop.S
@@ -0,0 +1,110 @@
+/* MD5 using sparc crypto opcodes.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David S. Miller (davem@davemloft.net)
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define ASI_PL 0x88
+
+#define MD5 \
+ .word 0x81b02800;
+
+ .text
+ .align 32
+ENTRY(__md5_process_block_crop)
+ /* %o0=buffer, %o1=len, %o2=CTX */
+ ld [%o2 + 0x10], %g1
+ add %g1, %o1, %o4
+ st %o4, [%o2 + 0x10]
+ clr %o5
+ cmp %o4, %g1
+ movlu %icc, 1, %o5
+#ifdef __arch64__
+ srlx %o1, 32, %o4
+ add %o5, %o4, %o5
+#endif
+ ld [%o2 + 0x14], %o4
+ add %o4, %o5, %o4
+ st %o4, [%o2 + 0x14]
+ lda [%o2] ASI_PL, %f0
+ add %o2, 0x4, %g1
+ lda [%g1] ASI_PL, %f1
+ add %o2, 0x8, %g1
+ andcc %o0, 0x7, %g0
+ lda [%g1] ASI_PL, %f2
+ add %o2, 0xc, %g1
+ bne,pn %xcc, 10f
+ lda [%g1] ASI_PL, %f3
+
+1:
+ ldd [%o0 + 0x00], %f8
+ ldd [%o0 + 0x08], %f10
+ ldd [%o0 + 0x10], %f12
+ ldd [%o0 + 0x18], %f14
+ ldd [%o0 + 0x20], %f16
+ ldd [%o0 + 0x28], %f18
+ ldd [%o0 + 0x30], %f20
+ ldd [%o0 + 0x38], %f22
+
+ MD5
+
+ subcc %o1, 64, %o1
+ bne,pt %xcc, 1b
+ add %o0, 0x40, %o0
+
+5:
+ sta %f0, [%o2] ASI_PL
+ add %o2, 0x4, %g1
+ sta %f1, [%g1] ASI_PL
+ add %o2, 0x8, %g1
+ sta %f2, [%g1] ASI_PL
+ add %o2, 0xc, %g1
+ retl
+ sta %f3, [%g1] ASI_PL
+10:
+ alignaddr %o0, %g0, %o0
+
+ ldd [%o0 + 0x00], %f10
+1:
+ ldd [%o0 + 0x08], %f12
+ ldd [%o0 + 0x10], %f14
+ ldd [%o0 + 0x18], %f16
+ ldd [%o0 + 0x20], %f18
+ ldd [%o0 + 0x28], %f20
+ ldd [%o0 + 0x30], %f22
+ ldd [%o0 + 0x38], %f24
+ ldd [%o0 + 0x40], %f26
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ MD5
+
+ subcc %o1, 64, %o1
+ fsrc2 %f26, %f10
+ bne,pt %xcc, 1b
+ add %o0, 0x40, %o0
+
+ ba,a,pt %xcc, 5b
+END(__md5_process_block_crop)
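Before the crypto-opcode loop, the prologue above maintains the context's
64-bit processed-byte count as two 32-bit words at ctx offsets 0x10 and
0x14, detecting the carry out of the low word by hand.  A C restatement of
that update (a sketch; the field layout is inferred from the offsets the
assembly uses):

#include <stdint.h>
#include <stddef.h>

/* Sketch of the counter update in __md5_process_block_crop's prologue:
   total[0] and total[1] live at ctx + 0x10 and ctx + 0x14.  */
static void
md5_total_add (uint32_t total[2], size_t len)
{
  uint32_t lo = total[0] + (uint32_t) len;
  uint32_t carry = lo < total[0];    /* the cmp/movlu %icc pair */
#ifdef __arch64__
  carry += (uint32_t) (len >> 32);   /* srlx %o1, 32, %o4 */
#endif
  total[0] = lo;
  total[1] += carry;
}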
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S
new file mode 100644
index 0000000000..ccf42446e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S
@@ -0,0 +1,347 @@
+/* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara.
+ Copyright (C) 2006-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David S. Miller (davem@davemloft.net)
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define ASI_P 0x80
+#define ASI_PNF 0x82
+
+#define LOAD(type,addr,dest) type##a [addr] ASI_P, dest
+#define LOAD_TWIN(addr_reg,dest0,dest1) \
+ ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
+
+#define STORE(type,src,addr) type src, [addr]
+#define STORE_INIT(src,addr) stxa src, [addr] %asi
+
+#ifndef XCC
+#define USE_BPR
+#define XCC xcc
+#endif
+
+#if IS_IN (libc)
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ .register %g6,#scratch
+
+ .text
+
+ENTRY(__mempcpy_niagara1)
+ ba,pt %XCC, 101f
+ add %o0, %o2, %g5
+END(__mempcpy_niagara1)
+
+ .align 32
+ENTRY(__memcpy_niagara1)
+100: /* %o0=dst, %o1=src, %o2=len */
+ mov %o0, %g5
+101:
+# ifndef USE_BPR
+ srl %o2, 0, %o2
+# endif
+ cmp %o2, 0
+ be,pn %XCC, 85f
+218: or %o0, %o1, %o3
+ cmp %o2, 16
+ blu,a,pn %XCC, 80f
+ or %o3, %o2, %o3
+
+ /* 2 blocks (128 bytes) is the minimum we can do the block
+ * copy with. We need to ensure that we'll iterate at least
+ * once in the block copy loop. At worst we'll need to align
+ * the destination to a 64-byte boundary which can chew up
+ * to (64 - 1) bytes from the length before we perform the
+ * block copy loop.
+ */
+ cmp %o2, (2 * 64)
+ blu,pt %XCC, 70f
+ andcc %o3, 0x7, %g0
+
+ /* %o0: dst
+ * %o1: src
+ * %o2: len (known to be >= 128)
+ *
+ * The block copy loops will use %o4/%o5,%g2/%g3 as
+ * temporaries while copying the data.
+ */
+
+ LOAD(prefetch, %o1, #one_read)
+ wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+
+ /* Align destination on 64-byte boundary. */
+ andcc %o0, (64 - 1), %o4
+ be,pt %XCC, 2f
+ sub %o4, 64, %o4
+ sub %g0, %o4, %o4 ! bytes to align dst
+ sub %o2, %o4, %o2
+1: subcc %o4, 1, %o4
+ LOAD(ldub, %o1, %g1)
+ STORE(stb, %g1, %o0)
+ add %o1, 1, %o1
+ bne,pt %XCC, 1b
+ add %o0, 1, %o0
+
+ /* If the source is on a 16-byte boundary we can do
+ * the direct block copy loop. If it is 8-byte aligned
+ * we can do the 16-byte loads offset by -8 bytes and the
+ * init stores offset by one register.
+ *
+ * If the source is not even 8-byte aligned, we need to do
+ * shifting and masking (basically integer faligndata).
+ *
+ * The careful bit with init stores is that if we store
+ * to any part of the cache line we have to store the whole
+ * cacheline else we can end up with corrupt L2 cache line
+ * contents. Since the loop works on 64-bytes of 64-byte
+ * aligned store data at a time, this is easy to ensure.
+ */
+2:
+ andcc %o1, (16 - 1), %o4
+ andn %o2, (64 - 1), %g1 ! block copy loop iterator
+ sub %o2, %g1, %o2 ! final sub-block copy bytes
+ be,pt %XCC, 50f
+ cmp %o4, 8
+ be,a,pt %XCC, 10f
+ sub %o1, 0x8, %o1
+
+ /* Neither 8-byte nor 16-byte aligned, shift and mask. */
+ mov %g1, %o4
+ and %o1, 0x7, %g1
+ sll %g1, 3, %g1
+ mov 64, %o3
+ andn %o1, 0x7, %o1
+ LOAD(ldx, %o1, %g2)
+ sub %o3, %g1, %o3
+ sllx %g2, %g1, %g2
+
+#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
+ LOAD(ldx, SRC, TMP1); \
+ srlx TMP1, PRE_SHIFT, TMP2; \
+ or TMP2, PRE_VAL, TMP2; \
+ STORE_INIT(TMP2, DST); \
+ sllx TMP1, POST_SHIFT, PRE_VAL;
+
+1: add %o1, 0x8, %o1
+ SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
+ add %o1, 0x8, %o1
+ SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
+ add %o1, 0x8, %o1
+ SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
+ add %o1, 0x8, %o1
+ SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
+ add %o1, 32, %o1
+ LOAD(prefetch, %o1, #one_read)
+ sub %o1, 32 - 8, %o1
+ SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
+ add %o1, 8, %o1
+ SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
+ add %o1, 8, %o1
+ SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
+ add %o1, 8, %o1
+ SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
+ subcc %o4, 64, %o4
+ bne,pt %XCC, 1b
+ add %o0, 64, %o0
+
+#undef SWIVEL_ONE_DWORD
+
+ srl %g1, 3, %g1
+ ba,pt %XCC, 60f
+ add %o1, %g1, %o1
+
+10: /* Destination is 64-byte aligned, source was only 8-byte
+ * aligned but it has been subtracted by 8 and we perform
+ * one twin load ahead, then add 8 back into source when
+ * we finish the loop.
+ */
+ LOAD_TWIN(%o1, %o4, %o5)
+1: add %o1, 16, %o1
+ LOAD_TWIN(%o1, %g2, %g3)
+ add %o1, 16 + 32, %o1
+ LOAD(prefetch, %o1, #one_read)
+ sub %o1, 32, %o1
+ STORE_INIT(%o5, %o0 + 0x00) ! initializes cache line
+ STORE_INIT(%g2, %o0 + 0x08)
+ LOAD_TWIN(%o1, %o4, %o5)
+ add %o1, 16, %o1
+ STORE_INIT(%g3, %o0 + 0x10)
+ STORE_INIT(%o4, %o0 + 0x18)
+ LOAD_TWIN(%o1, %g2, %g3)
+ add %o1, 16, %o1
+ STORE_INIT(%o5, %o0 + 0x20)
+ STORE_INIT(%g2, %o0 + 0x28)
+ LOAD_TWIN(%o1, %o4, %o5)
+ STORE_INIT(%g3, %o0 + 0x30)
+ STORE_INIT(%o4, %o0 + 0x38)
+ subcc %g1, 64, %g1
+ bne,pt %XCC, 1b
+ add %o0, 64, %o0
+
+ ba,pt %XCC, 60f
+ add %o1, 0x8, %o1
+
+50: /* Destination is 64-byte aligned, and source is 16-byte
+ * aligned.
+ */
+1: LOAD_TWIN(%o1, %o4, %o5)
+ add %o1, 16, %o1
+ LOAD_TWIN(%o1, %g2, %g3)
+ add %o1, 16 + 32, %o1
+ LOAD(prefetch, %o1, #one_read)
+ sub %o1, 32, %o1
+ STORE_INIT(%o4, %o0 + 0x00) ! initializes cache line
+ STORE_INIT(%o5, %o0 + 0x08)
+ LOAD_TWIN(%o1, %o4, %o5)
+ add %o1, 16, %o1
+ STORE_INIT(%g2, %o0 + 0x10)
+ STORE_INIT(%g3, %o0 + 0x18)
+ LOAD_TWIN(%o1, %g2, %g3)
+ add %o1, 16, %o1
+ STORE_INIT(%o4, %o0 + 0x20)
+ STORE_INIT(%o5, %o0 + 0x28)
+ STORE_INIT(%g2, %o0 + 0x30)
+ STORE_INIT(%g3, %o0 + 0x38)
+ subcc %g1, 64, %g1
+ bne,pt %XCC, 1b
+ add %o0, 64, %o0
+ /* fall through */
+
+60:
+ /* %o2 contains any final bytes still needed to be copied
+ * over. If anything is left, we copy it one byte at a time.
+ */
+ wr %g0, ASI_PNF, %asi
+ brz,pt %o2, 85f
+ sub %o0, %o1, %o3
+ ba,a,pt %XCC, 90f
+
+ .align 64
+70: /* 16 < len <= 64 */
+ bne,pn %XCC, 75f
+ sub %o0, %o1, %o3
+
+72:
+ andn %o2, 0xf, %o4
+ and %o2, 0xf, %o2
+1: subcc %o4, 0x10, %o4
+ LOAD(ldx, %o1, %o5)
+ add %o1, 0x08, %o1
+ LOAD(ldx, %o1, %g1)
+ sub %o1, 0x08, %o1
+ STORE(stx, %o5, %o1 + %o3)
+ add %o1, 0x8, %o1
+ STORE(stx, %g1, %o1 + %o3)
+ bgu,pt %XCC, 1b
+ add %o1, 0x8, %o1
+73: andcc %o2, 0x8, %g0
+ be,pt %XCC, 1f
+ nop
+ sub %o2, 0x8, %o2
+ LOAD(ldx, %o1, %o5)
+ STORE(stx, %o5, %o1 + %o3)
+ add %o1, 0x8, %o1
+1: andcc %o2, 0x4, %g0
+ be,pt %XCC, 1f
+ nop
+ sub %o2, 0x4, %o2
+ LOAD(lduw, %o1, %o5)
+ STORE(stw, %o5, %o1 + %o3)
+ add %o1, 0x4, %o1
+1: cmp %o2, 0
+ be,pt %XCC, 85f
+ nop
+ ba,pt %XCC, 90f
+ nop
+
+75:
+ andcc %o0, 0x7, %g1
+ sub %g1, 0x8, %g1
+ be,pn %icc, 2f
+ sub %g0, %g1, %g1
+ sub %o2, %g1, %o2
+
+1: subcc %g1, 1, %g1
+ LOAD(ldub, %o1, %o5)
+ STORE(stb, %o5, %o1 + %o3)
+ bgu,pt %icc, 1b
+ add %o1, 1, %o1
+
+2: add %o1, %o3, %o0
+ andcc %o1, 0x7, %g1
+ bne,pt %icc, 8f
+ sll %g1, 3, %g1
+
+ cmp %o2, 16
+ bgeu,pt %icc, 72b
+ nop
+ ba,a,pt %XCC, 73b
+
+8: mov 64, %o3
+ andn %o1, 0x7, %o1
+ LOAD(ldx, %o1, %g2)
+ sub %o3, %g1, %o3
+ andn %o2, 0x7, %o4
+ sllx %g2, %g1, %g2
+1: add %o1, 0x8, %o1
+ LOAD(ldx, %o1, %g3)
+ subcc %o4, 0x8, %o4
+ srlx %g3, %o3, %o5
+ or %o5, %g2, %o5
+ STORE(stx, %o5, %o0)
+ add %o0, 0x8, %o0
+ bgu,pt %icc, 1b
+ sllx %g3, %g1, %g2
+
+ srl %g1, 3, %g1
+ andcc %o2, 0x7, %o2
+ be,pn %icc, 85f
+ add %o1, %g1, %o1
+ ba,pt %XCC, 90f
+ sub %o0, %o1, %o3
+
+ .align 64
+80: /* 0 < len <= 16 */
+ andcc %o3, 0x3, %g0
+ bne,pn %XCC, 90f
+ sub %o0, %o1, %o3
+
+1:
+ subcc %o2, 4, %o2
+ LOAD(lduw, %o1, %g1)
+ STORE(stw, %g1, %o1 + %o3)
+ bgu,pt %XCC, 1b
+ add %o1, 4, %o1
+
+85: retl
+ mov %g5, %o0
+
+ .align 32
+90:
+ subcc %o2, 1, %o2
+ LOAD(ldub, %o1, %g1)
+ STORE(stb, %g1, %o1 + %o3)
+ bgu,pt %XCC, 90b
+ add %o1, 1, %o1
+ retl
+ mov %g5, %o0
+
+END(__memcpy_niagara1)
+
+#endif
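The "shifting and masking (basically integer faligndata)" path above can be
modeled in C: load aligned doublewords and splice each output word from two
neighbors.  The sketch below holds under the same precondition as the
assembly, namely that the source is genuinely misaligned (so neither shift
count is zero), and relies on SPARC's big-endian byte order:

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the SWIVEL_ONE_DWORD loop: src is NOT 8-byte
   aligned, dst is.  lshift and rshift play the roles of %g1 and %o3.  */
static void
copy_shift_or (uint64_t *dst, const unsigned char *src, size_t ndwords)
{
  const uint64_t *asrc =
    (const uint64_t *) ((uintptr_t) src & ~(uintptr_t) 7);
  unsigned lshift = ((uintptr_t) src & 7) * 8;  /* 8..56, never 0 here */
  unsigned rshift = 64 - lshift;
  uint64_t pre = *asrc++ << lshift;             /* primed like %g2 */

  while (ndwords--)
    {
      uint64_t cur = *asrc++;
      *dst++ = pre | (cur >> rshift);           /* big-endian splice */
      pre = cur << lshift;
    }
}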
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S
new file mode 100644
index 0000000000..798b3c80fe
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S
@@ -0,0 +1,498 @@
+/* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara-2.
+ Copyright (C) 2007-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David S. Miller (davem@davemloft.net)
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define ASI_BLK_P 0xf0
+#define ASI_P 0x80
+#define ASI_PNF 0x82
+
+#define FPRS_FEF 0x04
+
+#define VISEntryHalf \
+ rd %fprs, %o5; \
+ wr %g0, FPRS_FEF, %fprs
+
+#define VISExitHalf \
+ and %o5, FPRS_FEF, %o5; \
+ wr %o5, 0x0, %fprs
+
+#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
+
+#define LOAD(type,addr,dest) type [addr], dest
+#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest
+#define STORE(type,src,addr) type src, [addr]
+#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P
+#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
+
+#ifndef XCC
+#define USE_BPR
+#define XCC xcc
+#endif
+
+#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
+ faligndata %x0, %x1, %f0; \
+ faligndata %x1, %x2, %f2; \
+ faligndata %x2, %x3, %f4; \
+ faligndata %x3, %x4, %f6; \
+ faligndata %x4, %x5, %f8; \
+ faligndata %x5, %x6, %f10; \
+ faligndata %x6, %x7, %f12; \
+ faligndata %x7, %x8, %f14;
+
+#define FREG_MOVE_1(x0) \
+ fsrc2 %x0, %f0;
+#define FREG_MOVE_2(x0, x1) \
+ fsrc2 %x0, %f0; \
+ fsrc2 %x1, %f2;
+#define FREG_MOVE_3(x0, x1, x2) \
+ fsrc2 %x0, %f0; \
+ fsrc2 %x1, %f2; \
+ fsrc2 %x2, %f4;
+#define FREG_MOVE_4(x0, x1, x2, x3) \
+ fsrc2 %x0, %f0; \
+ fsrc2 %x1, %f2; \
+ fsrc2 %x2, %f4; \
+ fsrc2 %x3, %f6;
+#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
+ fsrc2 %x0, %f0; \
+ fsrc2 %x1, %f2; \
+ fsrc2 %x2, %f4; \
+ fsrc2 %x3, %f6; \
+ fsrc2 %x4, %f8;
+#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
+ fsrc2 %x0, %f0; \
+ fsrc2 %x1, %f2; \
+ fsrc2 %x2, %f4; \
+ fsrc2 %x3, %f6; \
+ fsrc2 %x4, %f8; \
+ fsrc2 %x5, %f10;
+#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
+ fsrc2 %x0, %f0; \
+ fsrc2 %x1, %f2; \
+ fsrc2 %x2, %f4; \
+ fsrc2 %x3, %f6; \
+ fsrc2 %x4, %f8; \
+ fsrc2 %x5, %f10; \
+ fsrc2 %x6, %f12;
+#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
+ fsrc2 %x0, %f0; \
+ fsrc2 %x1, %f2; \
+ fsrc2 %x2, %f4; \
+ fsrc2 %x3, %f6; \
+ fsrc2 %x4, %f8; \
+ fsrc2 %x5, %f10; \
+ fsrc2 %x6, %f12; \
+ fsrc2 %x7, %f14;
+#define FREG_LOAD_1(base, x0) \
+ LOAD(ldd, base + 0x00, %x0)
+#define FREG_LOAD_2(base, x0, x1) \
+ LOAD(ldd, base + 0x00, %x0); \
+ LOAD(ldd, base + 0x08, %x1);
+#define FREG_LOAD_3(base, x0, x1, x2) \
+ LOAD(ldd, base + 0x00, %x0); \
+ LOAD(ldd, base + 0x08, %x1); \
+ LOAD(ldd, base + 0x10, %x2);
+#define FREG_LOAD_4(base, x0, x1, x2, x3) \
+ LOAD(ldd, base + 0x00, %x0); \
+ LOAD(ldd, base + 0x08, %x1); \
+ LOAD(ldd, base + 0x10, %x2); \
+ LOAD(ldd, base + 0x18, %x3);
+#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
+ LOAD(ldd, base + 0x00, %x0); \
+ LOAD(ldd, base + 0x08, %x1); \
+ LOAD(ldd, base + 0x10, %x2); \
+ LOAD(ldd, base + 0x18, %x3); \
+ LOAD(ldd, base + 0x20, %x4);
+#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
+ LOAD(ldd, base + 0x00, %x0); \
+ LOAD(ldd, base + 0x08, %x1); \
+ LOAD(ldd, base + 0x10, %x2); \
+ LOAD(ldd, base + 0x18, %x3); \
+ LOAD(ldd, base + 0x20, %x4); \
+ LOAD(ldd, base + 0x28, %x5);
+#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
+ LOAD(ldd, base + 0x00, %x0); \
+ LOAD(ldd, base + 0x08, %x1); \
+ LOAD(ldd, base + 0x10, %x2); \
+ LOAD(ldd, base + 0x18, %x3); \
+ LOAD(ldd, base + 0x20, %x4); \
+ LOAD(ldd, base + 0x28, %x5); \
+ LOAD(ldd, base + 0x30, %x6);
+
+#if IS_IN (libc)
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ .register %g6,#scratch
+
+ .text
+
+ENTRY(__mempcpy_niagara2)
+ ba,pt %XCC, 101f
+ add %o0, %o2, %g5
+END(__mempcpy_niagara2)
+
+ .align 32
+ENTRY(__memcpy_niagara2)
+100: /* %o0=dst, %o1=src, %o2=len */
+ mov %o0, %g5
+101:
+# ifndef USE_BPR
+ srl %o2, 0, %o2
+# endif
+ cmp %o2, 0
+ be,pn %XCC, 85f
+218: or %o0, %o1, %o3
+ cmp %o2, 16
+ blu,a,pn %XCC, 80f
+ or %o3, %o2, %o3
+
+ /* 2 blocks (128 bytes) is the minimum we can do the block
+ * copy with. We need to ensure that we'll iterate at least
+ * once in the block copy loop. At worst we'll need to align
+ * the destination to a 64-byte boundary which can chew up
+ * to (64 - 1) bytes from the length before we perform the
+ * block copy loop.
+ *
+ * However, the cut-off point, performance wise, is around
+ * 4 64-byte blocks.
+ */
+ cmp %o2, (4 * 64)
+ blu,pt %XCC, 75f
+ andcc %o3, 0x7, %g0
+
+ /* %o0: dst
+ * %o1: src
+ * %o2: len (known to be >= 128)
+ *
+ * The block copy loops can use %o4, %g2, %g3 as
+ * temporaries while copying the data. %o5 must
+ * be preserved between VISEntryHalf and VISExitHalf
+ */
+
+ LOAD(prefetch, %o1 + 0x000, #one_read)
+ LOAD(prefetch, %o1 + 0x040, #one_read)
+ LOAD(prefetch, %o1 + 0x080, #one_read)
+
+ /* Align destination on 64-byte boundary. */
+ andcc %o0, (64 - 1), %o4
+ be,pt %XCC, 2f
+ sub %o4, 64, %o4
+ sub %g0, %o4, %o4 ! bytes to align dst
+ sub %o2, %o4, %o2
+1: subcc %o4, 1, %o4
+ LOAD(ldub, %o1, %g1)
+ STORE(stb, %g1, %o0)
+ add %o1, 1, %o1
+ bne,pt %XCC, 1b
+ add %o0, 1, %o0
+
+2:
+ /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve
+ * o5 from here until we hit VISExitHalf.
+ */
+ VISEntryHalf
+
+ membar #Sync
+ alignaddr %o1, %g0, %g0
+
+ add %o1, (64 - 1), %o4
+ andn %o4, (64 - 1), %o4
+ andn %o2, (64 - 1), %g1
+ sub %o2, %g1, %o2
+
+ and %o1, (64 - 1), %g2
+ add %o1, %g1, %o1
+ sub %o0, %o4, %g3
+ brz,pt %g2, 190f
+ cmp %g2, 32
+ blu,a 5f
+ cmp %g2, 16
+ cmp %g2, 48
+ blu,a 4f
+ cmp %g2, 40
+ cmp %g2, 56
+ blu 170f
+ nop
+ ba,a,pt %xcc, 180f
+
+4: /* 32 <= low bits < 48 */
+ blu 150f
+ nop
+ ba,a,pt %xcc, 160f
+5: /* 0 < low bits < 32 */
+ blu,a 6f
+ cmp %g2, 8
+ cmp %g2, 24
+ blu 130f
+ nop
+ ba,a,pt %xcc, 140f
+6: /* 0 < low bits < 16 */
+ bgeu 120f
+ nop
+ /* fall through for 0 < low bits < 8 */
+110: sub %o4, 64, %g2
+ LOAD_BLK(%g2, %f0)
+1: STORE_INIT(%g0, %o4 + %g3)
+ LOAD_BLK(%o4, %f16)
+ FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
+ STORE_BLK(%f0, %o4 + %g3)
+ FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
+ subcc %g1, 64, %g1
+ add %o4, 64, %o4
+ bne,pt %XCC, 1b
+ LOAD(prefetch, %o4 + 64, #one_read)
+ ba,pt %xcc, 195f
+ nop
+
+120: sub %o4, 56, %g2
+ FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
+1: STORE_INIT(%g0, %o4 + %g3)
+ LOAD_BLK(%o4, %f16)
+ FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
+ STORE_BLK(%f0, %o4 + %g3)
+ FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
+ subcc %g1, 64, %g1
+ add %o4, 64, %o4
+ bne,pt %XCC, 1b
+ LOAD(prefetch, %o4 + 64, #one_read)
+ ba,pt %xcc, 195f
+ nop
+
+130: sub %o4, 48, %g2
+ FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
+1: STORE_INIT(%g0, %o4 + %g3)
+ LOAD_BLK(%o4, %f16)
+ FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
+ STORE_BLK(%f0, %o4 + %g3)
+ FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
+ subcc %g1, 64, %g1
+ add %o4, 64, %o4
+ bne,pt %XCC, 1b
+ LOAD(prefetch, %o4 + 64, #one_read)
+ ba,pt %xcc, 195f
+ nop
+
+140: sub %o4, 40, %g2
+ FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
+1: STORE_INIT(%g0, %o4 + %g3)
+ LOAD_BLK(%o4, %f16)
+ FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
+ STORE_BLK(%f0, %o4 + %g3)
+ FREG_MOVE_5(f22, f24, f26, f28, f30)
+ subcc %g1, 64, %g1
+ add %o4, 64, %o4
+ bne,pt %XCC, 1b
+ LOAD(prefetch, %o4 + 64, #one_read)
+ ba,pt %xcc, 195f
+ nop
+
+150: sub %o4, 32, %g2
+ FREG_LOAD_4(%g2, f0, f2, f4, f6)
+1: STORE_INIT(%g0, %o4 + %g3)
+ LOAD_BLK(%o4, %f16)
+ FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
+ STORE_BLK(%f0, %o4 + %g3)
+ FREG_MOVE_4(f24, f26, f28, f30)
+ subcc %g1, 64, %g1
+ add %o4, 64, %o4
+ bne,pt %XCC, 1b
+ LOAD(prefetch, %o4 + 64, #one_read)
+ ba,pt %xcc, 195f
+ nop
+
+160: sub %o4, 24, %g2
+ FREG_LOAD_3(%g2, f0, f2, f4)
+1: STORE_INIT(%g0, %o4 + %g3)
+ LOAD_BLK(%o4, %f16)
+ FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
+ STORE_BLK(%f0, %o4 + %g3)
+ FREG_MOVE_3(f26, f28, f30)
+ subcc %g1, 64, %g1
+ add %o4, 64, %o4
+ bne,pt %XCC, 1b
+ LOAD(prefetch, %o4 + 64, #one_read)
+ ba,pt %xcc, 195f
+ nop
+
+170: sub %o4, 16, %g2
+ FREG_LOAD_2(%g2, f0, f2)
+1: STORE_INIT(%g0, %o4 + %g3)
+ LOAD_BLK(%o4, %f16)
+ FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
+ STORE_BLK(%f0, %o4 + %g3)
+ FREG_MOVE_2(f28, f30)
+ subcc %g1, 64, %g1
+ add %o4, 64, %o4
+ bne,pt %XCC, 1b
+ LOAD(prefetch, %o4 + 64, #one_read)
+ ba,pt %xcc, 195f
+ nop
+
+180: sub %o4, 8, %g2
+ FREG_LOAD_1(%g2, f0)
+1: STORE_INIT(%g0, %o4 + %g3)
+ LOAD_BLK(%o4, %f16)
+ FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
+ STORE_BLK(%f0, %o4 + %g3)
+ FREG_MOVE_1(f30)
+ subcc %g1, 64, %g1
+ add %o4, 64, %o4
+ bne,pt %XCC, 1b
+ LOAD(prefetch, %o4 + 64, #one_read)
+ ba,pt %xcc, 195f
+ nop
+
+190:
+1: STORE_INIT(%g0, %o4 + %g3)
+ subcc %g1, 64, %g1
+ LOAD_BLK(%o4, %f0)
+ STORE_BLK(%f0, %o4 + %g3)
+ add %o4, 64, %o4
+ bne,pt %XCC, 1b
+ LOAD(prefetch, %o4 + 64, #one_read)
+
+195:
+ add %o4, %g3, %o0
+ membar #Sync
+
+ VISExitHalf
+
+ /* %o2 contains any final bytes still needed to be copied
+ * over. If anything is left, we copy it one byte at a time.
+ */
+ brz,pt %o2, 85f
+ sub %o0, %o1, %o3
+ ba,a,pt %XCC, 90f
+
+ .align 64
+75: /* 16 < len <= 64 */
+ bne,pn %XCC, 75f
+ sub %o0, %o1, %o3
+
+72:
+ andn %o2, 0xf, %o4
+ and %o2, 0xf, %o2
+1: subcc %o4, 0x10, %o4
+ LOAD(ldx, %o1, %o5)
+ add %o1, 0x08, %o1
+ LOAD(ldx, %o1, %g1)
+ sub %o1, 0x08, %o1
+ STORE(stx, %o5, %o1 + %o3)
+ add %o1, 0x8, %o1
+ STORE(stx, %g1, %o1 + %o3)
+ bgu,pt %XCC, 1b
+ add %o1, 0x8, %o1
+73: andcc %o2, 0x8, %g0
+ be,pt %XCC, 1f
+ nop
+ sub %o2, 0x8, %o2
+ LOAD(ldx, %o1, %o5)
+ STORE(stx, %o5, %o1 + %o3)
+ add %o1, 0x8, %o1
+1: andcc %o2, 0x4, %g0
+ be,pt %XCC, 1f
+ nop
+ sub %o2, 0x4, %o2
+ LOAD(lduw, %o1, %o5)
+ STORE(stw, %o5, %o1 + %o3)
+ add %o1, 0x4, %o1
+1: cmp %o2, 0
+ be,pt %XCC, 85f
+ nop
+ ba,pt %xcc, 90f
+ nop
+
+75:
+ andcc %o0, 0x7, %g1
+ sub %g1, 0x8, %g1
+ be,pn %icc, 2f
+ sub %g0, %g1, %g1
+ sub %o2, %g1, %o2
+
+1: subcc %g1, 1, %g1
+ LOAD(ldub, %o1, %o5)
+ STORE(stb, %o5, %o1 + %o3)
+ bgu,pt %icc, 1b
+ add %o1, 1, %o1
+
+2: add %o1, %o3, %o0
+ andcc %o1, 0x7, %g1
+ bne,pt %icc, 8f
+ sll %g1, 3, %g1
+
+ cmp %o2, 16
+ bgeu,pt %icc, 72b
+ nop
+ ba,a,pt %xcc, 73b
+
+8: mov 64, %o3
+ andn %o1, 0x7, %o1
+ LOAD(ldx, %o1, %g2)
+ sub %o3, %g1, %o3
+ andn %o2, 0x7, %o4
+ sllx %g2, %g1, %g2
+1: add %o1, 0x8, %o1
+ LOAD(ldx, %o1, %g3)
+ subcc %o4, 0x8, %o4
+ srlx %g3, %o3, %o5
+ or %o5, %g2, %o5
+ STORE(stx, %o5, %o0)
+ add %o0, 0x8, %o0
+ bgu,pt %icc, 1b
+ sllx %g3, %g1, %g2
+
+ srl %g1, 3, %g1
+ andcc %o2, 0x7, %o2
+ be,pn %icc, 85f
+ add %o1, %g1, %o1
+ ba,pt %xcc, 90f
+ sub %o0, %o1, %o3
+
+ .align 64
+80: /* 0 < len <= 16 */
+ andcc %o3, 0x3, %g0
+ bne,pn %XCC, 90f
+ sub %o0, %o1, %o3
+
+1:
+ subcc %o2, 4, %o2
+ LOAD(lduw, %o1, %g1)
+ STORE(stw, %g1, %o1 + %o3)
+ bgu,pt %XCC, 1b
+ add %o1, 4, %o1
+
+85: retl
+ mov %g5, %o0
+
+ .align 32
+90:
+ subcc %o2, 1, %o2
+ LOAD(ldub, %o1, %g1)
+ STORE(stb, %g1, %o1 + %o3)
+ bgu,pt %XCC, 90b
+ add %o1, 1, %o1
+ retl
+ mov %g5, %o0
+
+END(__memcpy_niagara2)
+
+#endif
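The FREG_FROB chains above lean entirely on faligndata, which views a pair
of adjacent source doublewords as 16 contiguous bytes and extracts the
8-byte window starting at the offset latched earlier by alignaddr.  A
scalar model of one such extraction (a sketch; big-endian, offset in
bytes):

#include <stdint.h>

/* Model of faligndata rs1, rs2, rd with an align offset of k: take
   8 bytes from the 16-byte concatenation rs1:rs2 starting at byte k.  */
static uint64_t
falign_model (uint64_t rs1, uint64_t rs2, unsigned k)
{
  if (k == 0)
    return rs1;                /* avoid the undefined 64-bit shift */
  unsigned s = k * 8;
  return (rs1 << s) | (rs2 >> (64 - s));
}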
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara4.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara4.S
new file mode 100644
index 0000000000..709b398364
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-niagara4.S
@@ -0,0 +1,332 @@
+/* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara-4.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David S. Miller (davem@davemloft.net)
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+
+#define FPRS_FEF 0x04
+
+/* On T4 it is very expensive to access ASRs like %fprs and
+ * %asi, avoiding a read or a write can save ~50 cycles.
+ */
+#define FPU_ENTER \
+ rd %fprs, %o5; \
+ andcc %o5, FPRS_FEF, %g0; \
+ be,a,pn %icc, 999f; \
+ wr %g0, FPRS_FEF, %fprs; \
+ 999:
+
+#define VISEntryHalf FPU_ENTER
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+
+#define GLOBAL_SPARE %g5
+
+#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
+#define EX_LD(x) x
+#define EX_ST(x) x
+#define EX_RETVAL(x) x
+#define LOAD(type,addr,dest) type [addr], dest
+#define STORE(type,src,addr) type src, [addr]
+#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
+
+#if IS_IN (libc)
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ .register %g6,#scratch
+
+ .text
+
+ENTRY(__mempcpy_niagara4)
+ ba,pt %icc, 101f
+ add %o0, %o2, %o3
+END(__mempcpy_niagara4)
+
+ .align 32
+ENTRY(__memcpy_niagara4)
+100: /* %o0=dst, %o1=src, %o2=len */
+ mov %o0, %o3
+101:
+#ifndef __arch64__
+ srl %o2, 0, %o2
+#endif
+ brz,pn %o2, .Lexit
+ cmp %o2, 3
+ ble,pn %icc, .Ltiny
+ cmp %o2, 19
+ ble,pn %icc, .Lsmall
+ or %o0, %o1, %g2
+ cmp %o2, 128
+ bl,pn %icc, .Lmedium
+ nop
+
+.Llarge:/* len >= 0x80 */
+ /* First get dest 8 byte aligned. */
+ sub %g0, %o0, %g1
+ and %g1, 0x7, %g1
+ brz,pt %g1, 51f
+ sub %o2, %g1, %o2
+
+1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+ add %o1, 1, %o1
+ subcc %g1, 1, %g1
+ add %o0, 1, %o0
+ bne,pt %icc, 1b
+ EX_ST(STORE(stb, %g2, %o0 - 0x01))
+
+51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
+ LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
+ LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
+ LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
+ LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
+ LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
+ LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
+ LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
+
+ /* Check if we can use the straight fully aligned
+ * loop, or we require the alignaddr/faligndata variant.
+ */
+ andcc %o1, 0x7, %o5
+ bne,pn %icc, .Llarge_src_unaligned
+ sub %g0, %o0, %g1
+
+ /* Legitimize the use of initializing stores by getting dest
+ * to be 64-byte aligned.
+ */
+ and %g1, 0x3f, %g1
+ brz,pt %g1, .Llarge_aligned
+ sub %o2, %g1, %o2
+
+1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
+ add %o1, 8, %o1
+ subcc %g1, 8, %g1
+ add %o0, 8, %o0
+ bne,pt %icc, 1b
+ EX_ST(STORE(stx, %g2, %o0 - 0x08))
+
+.Llarge_aligned:
+ /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
+ andn %o2, 0x3f, %o4
+ sub %o2, %o4, %o2
+
+1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+ add %o1, 0x40, %o1
+ EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
+ subcc %o4, 0x40, %o4
+ EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
+ EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
+ EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
+ EX_ST(STORE_INIT(%g1, %o0))
+ add %o0, 0x08, %o0
+ EX_ST(STORE_INIT(%g2, %o0))
+ add %o0, 0x08, %o0
+ EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
+ EX_ST(STORE_INIT(%g3, %o0))
+ add %o0, 0x08, %o0
+ EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
+ EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+ add %o0, 0x08, %o0
+ EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
+ EX_ST(STORE_INIT(%o5, %o0))
+ add %o0, 0x08, %o0
+ EX_ST(STORE_INIT(%g2, %o0))
+ add %o0, 0x08, %o0
+ EX_ST(STORE_INIT(%g3, %o0))
+ add %o0, 0x08, %o0
+ EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+ add %o0, 0x08, %o0
+ bne,pt %icc, 1b
+ LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
+
+ membar #StoreLoad | #StoreStore
+
+ brz,pn %o2, .Lexit
+ cmp %o2, 19
+ ble,pn %icc, .Lsmall_unaligned
+ nop
+ ba,a,pt %icc, .Lmedium_noprefetch
+
+.Lexit: retl
+ mov EX_RETVAL(%o3), %o0
+
+.Llarge_src_unaligned:
+ andn %o2, 0x3f, %o4
+ sub %o2, %o4, %o2
+ VISEntryHalf
+ alignaddr %o1, %g0, %g1
+ add %o1, %o4, %o1
+ EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
+1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
+ subcc %o4, 0x40, %o4
+ EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
+ EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
+ EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
+ EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
+ EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
+ EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
+ faligndata %f0, %f2, %f16
+ EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
+ faligndata %f2, %f4, %f18
+ add %g1, 0x40, %g1
+ faligndata %f4, %f6, %f20
+ faligndata %f6, %f8, %f22
+ faligndata %f8, %f10, %f24
+ faligndata %f10, %f12, %f26
+ faligndata %f12, %f14, %f28
+ faligndata %f14, %f0, %f30
+ EX_ST(STORE(std, %f16, %o0 + 0x00))
+ EX_ST(STORE(std, %f18, %o0 + 0x08))
+ EX_ST(STORE(std, %f20, %o0 + 0x10))
+ EX_ST(STORE(std, %f22, %o0 + 0x18))
+ EX_ST(STORE(std, %f24, %o0 + 0x20))
+ EX_ST(STORE(std, %f26, %o0 + 0x28))
+ EX_ST(STORE(std, %f28, %o0 + 0x30))
+ EX_ST(STORE(std, %f30, %o0 + 0x38))
+ add %o0, 0x40, %o0
+ bne,pt %icc, 1b
+ LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
+ VISExitHalf
+
+ brz,pn %o2, .Lexit
+ cmp %o2, 19
+ ble,pn %icc, .Lsmall_unaligned
+ nop
+ ba,a,pt %icc, .Lmedium_unaligned
+
+.Lmedium:
+ LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
+ andcc %g2, 0x7, %g0
+ bne,pn %icc, .Lmedium_unaligned
+ nop
+.Lmedium_noprefetch:
+ andncc %o2, 0x20 - 1, %o5
+ be,pn %icc, 2f
+ sub %o2, %o5, %o2
+1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+ EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
+ EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
+ EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
+ add %o1, 0x20, %o1
+ subcc %o5, 0x20, %o5
+ EX_ST(STORE(stx, %g1, %o0 + 0x00))
+ EX_ST(STORE(stx, %g2, %o0 + 0x08))
+ EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
+ EX_ST(STORE(stx, %o4, %o0 + 0x18))
+ bne,pt %icc, 1b
+ add %o0, 0x20, %o0
+2: andcc %o2, 0x18, %o5
+ be,pt %icc, 3f
+ sub %o2, %o5, %o2
+1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+ add %o1, 0x08, %o1
+ add %o0, 0x08, %o0
+ subcc %o5, 0x08, %o5
+ bne,pt %icc, 1b
+ EX_ST(STORE(stx, %g1, %o0 - 0x08))
+3: brz,pt %o2, .Lexit
+ cmp %o2, 0x04
+ bl,pn %icc, .Ltiny
+ nop
+ EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+ add %o1, 0x04, %o1
+ add %o0, 0x04, %o0
+ subcc %o2, 0x04, %o2
+ bne,pn %icc, .Ltiny
+ EX_ST(STORE(stw, %g1, %o0 - 0x04))
+ ba,a,pt %icc, .Lexit
+.Lmedium_unaligned:
+ /* First get dest 8 byte aligned. */
+ sub %g0, %o0, %g1
+ and %g1, 0x7, %g1
+ brz,pt %g1, 2f
+ sub %o2, %g1, %o2
+
+1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+ add %o1, 1, %o1
+ subcc %g1, 1, %g1
+ add %o0, 1, %o0
+ bne,pt %icc, 1b
+ EX_ST(STORE(stb, %g2, %o0 - 0x01))
+2:
+ and %o1, 0x7, %g1
+ brz,pn %g1, .Lmedium_noprefetch
+ sll %g1, 3, %g1
+ mov 64, %g2
+ sub %g2, %g1, %g2
+ andn %o1, 0x7, %o1
+ EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
+ sllx %o4, %g1, %o4
+ andn %o2, 0x08 - 1, %o5
+ sub %o2, %o5, %o2
+1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
+ add %o1, 0x08, %o1
+ subcc %o5, 0x08, %o5
+ srlx %g3, %g2, GLOBAL_SPARE
+ or GLOBAL_SPARE, %o4, GLOBAL_SPARE
+ EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
+ add %o0, 0x08, %o0
+ bne,pt %icc, 1b
+ sllx %g3, %g1, %o4
+ srl %g1, 3, %g1
+ add %o1, %g1, %o1
+ brz,pn %o2, .Lexit
+ nop
+ ba,pt %icc, .Lsmall_unaligned
+
+.Ltiny:
+ EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+ subcc %o2, 1, %o2
+ be,pn %icc, .Lexit
+ EX_ST(STORE(stb, %g1, %o0 + 0x00))
+ EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
+ subcc %o2, 1, %o2
+ be,pn %icc, .Lexit
+ EX_ST(STORE(stb, %g1, %o0 + 0x01))
+ EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
+ ba,pt %icc, .Lexit
+ EX_ST(STORE(stb, %g1, %o0 + 0x02))
+
+.Lsmall:
+ andcc %g2, 0x3, %g0
+ bne,pn %icc, .Lsmall_unaligned
+ andn %o2, 0x4 - 1, %o5
+ sub %o2, %o5, %o2
+1:
+ EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+ add %o1, 0x04, %o1
+ subcc %o5, 0x04, %o5
+ add %o0, 0x04, %o0
+ bne,pt %icc, 1b
+ EX_ST(STORE(stw, %g1, %o0 - 0x04))
+ brz,pt %o2, .Lexit
+ nop
+ ba,a,pt %icc, .Ltiny
+
+.Lsmall_unaligned:
+1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+ add %o1, 1, %o1
+ add %o0, 1, %o0
+ subcc %o2, 1, %o2
+ bne,pt %icc, 1b
+ EX_ST(STORE(stb, %g1, %o0 - 0x01))
+ ba,a,pt %icc, .Lexit
+END(__memcpy_niagara4)
+
+#endif
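Niagara-4's entry code buckets the length into four classes before
committing to a strategy: a byte loop, a word loop, 8/32-byte integer
loops, or the 64-byte initializing-store loop.  Restated as a C skeleton
(the copy_* helpers are hypothetical stand-ins for the labeled paths):

#include <stddef.h>

extern void copy_bytes (void *, const void *, size_t);   /* .Ltiny   */
extern void copy_words (void *, const void *, size_t);   /* .Lsmall  */
extern void copy_dwords (void *, const void *, size_t);  /* .Lmedium */
extern void copy_blocks (void *, const void *, size_t);  /* .Llarge  */

/* Sketch of the size dispatch at the top of __memcpy_niagara4.  */
static void
niagara4_dispatch (void *dst, const void *src, size_t len)
{
  if (len == 0)
    return;                           /* brz,pn %o2, .Lexit */
  else if (len <= 3)
    copy_bytes (dst, src, len);       /* cmp %o2, 3; ble    */
  else if (len <= 19)
    copy_words (dst, src, len);       /* cmp %o2, 19; ble   */
  else if (len < 128)
    copy_dwords (dst, src, len);      /* cmp %o2, 128; bl   */
  else
    copy_blocks (dst, src, len);      /* 64-byte init stores */
}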
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S
new file mode 100644
index 0000000000..b8f5c3cb8f
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S
@@ -0,0 +1,325 @@
+/* Copy SIZE bytes from SRC to DEST.
+ For UltraSPARC-III.
+ Copyright (C) 2001-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David S. Miller (davem@redhat.com)
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define ASI_BLK_P 0xf0
+#define FPRS_FEF 0x04
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+
+#ifndef XCC
+#define USE_BPR
+#define XCC xcc
+#endif
+
+#if IS_IN (libc)
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ .register %g6,#scratch
+
+ .text
+
+ENTRY(__mempcpy_ultra3)
+ ba,pt %XCC, 101f
+ add %o0, %o2, %g5
+END(__mempcpy_ultra3)
+
+ /* Special/non-trivial issues of this code:
+ *
+ * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
+ * 2) Only low 32 FPU registers are used so that only the
+ * lower half of the FPU register set is dirtied by this
+ * code. This is especially important in the kernel.
+ * 3) This code never prefetches cachelines past the end
+ * of the source buffer.
+ *
+ * The cheetah's flexible spine, oversized liver, enlarged heart,
+ * slender muscular body, and claws make it the swiftest hunter
+ * in Africa and the fastest animal on land. Can reach speeds
+ * of up to 2.4GB per second.
+ */
+ .align 32
+ENTRY(__memcpy_ultra3)
+
+100: /* %o0=dst, %o1=src, %o2=len */
+ mov %o0, %g5
+101:
+ cmp %o2, 0
+ be,pn %XCC, out
+218: or %o0, %o1, %o3
+ cmp %o2, 16
+ bleu,a,pn %XCC, small_copy
+ or %o3, %o2, %o3
+
+ cmp %o2, 256
+ blu,pt %XCC, medium_copy
+ andcc %o3, 0x7, %g0
+
+ ba,pt %xcc, enter
+ andcc %o0, 0x3f, %g2
+
+ /* Here len >= 256 and condition codes reflect execution
+ * of "andcc %o0, 0x7, %g2", done by caller.
+ */
+ .align 64
+enter:
+ /* Is 'dst' already aligned on an 64-byte boundary? */
+ be,pt %XCC, 2f
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x40, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: ldub [%o1 + 0x00], %o3
+ add %o1, 0x1, %o1
+ add %o0, 0x1, %o0
+ subcc %g2, 0x1, %g2
+
+ bg,pt %XCC, 1b
+ stb %o3, [%o0 + -1]
+
+2: VISEntryHalf
+ and %o1, 0x7, %g1
+ ba,pt %xcc, begin
+ alignaddr %o1, %g0, %o1
+
+ .align 64
+begin:
+ prefetch [%o1 + 0x000], #one_read
+ prefetch [%o1 + 0x040], #one_read
+ andn %o2, (0x40 - 1), %o4
+ prefetch [%o1 + 0x080], #one_read
+ prefetch [%o1 + 0x0c0], #one_read
+ ldd [%o1 + 0x000], %f0
+ prefetch [%o1 + 0x100], #one_read
+ ldd [%o1 + 0x008], %f2
+ prefetch [%o1 + 0x140], #one_read
+ ldd [%o1 + 0x010], %f4
+ prefetch [%o1 + 0x180], #one_read
+ faligndata %f0, %f2, %f16
+ ldd [%o1 + 0x018], %f6
+ faligndata %f2, %f4, %f18
+ ldd [%o1 + 0x020], %f8
+ faligndata %f4, %f6, %f20
+ ldd [%o1 + 0x028], %f10
+ faligndata %f6, %f8, %f22
+
+ ldd [%o1 + 0x030], %f12
+ faligndata %f8, %f10, %f24
+ ldd [%o1 + 0x038], %f14
+ faligndata %f10, %f12, %f26
+ ldd [%o1 + 0x040], %f0
+
+ sub %o4, 0x80, %o4
+ add %o1, 0x40, %o1
+ ba,pt %xcc, loop
+ srl %o4, 6, %o3
+
+ .align 64
+loop:
+ ldd [%o1 + 0x008], %f2
+ faligndata %f12, %f14, %f28
+ ldd [%o1 + 0x010], %f4
+ faligndata %f14, %f0, %f30
+ stda %f16, [%o0] ASI_BLK_P
+ ldd [%o1 + 0x018], %f6
+ faligndata %f0, %f2, %f16
+
+ ldd [%o1 + 0x020], %f8
+ faligndata %f2, %f4, %f18
+ ldd [%o1 + 0x028], %f10
+ faligndata %f4, %f6, %f20
+ ldd [%o1 + 0x030], %f12
+ faligndata %f6, %f8, %f22
+ ldd [%o1 + 0x038], %f14
+ faligndata %f8, %f10, %f24
+
+ ldd [%o1 + 0x040], %f0
+ prefetch [%o1 + 0x180], #one_read
+ faligndata %f10, %f12, %f26
+ subcc %o3, 0x01, %o3
+ add %o1, 0x40, %o1
+ bg,pt %XCC, loop
+ add %o0, 0x40, %o0
+
+ /* Finally we copy the last full 64-byte block. */
+loopfini:
+ ldd [%o1 + 0x008], %f2
+ faligndata %f12, %f14, %f28
+ ldd [%o1 + 0x010], %f4
+ faligndata %f14, %f0, %f30
+ stda %f16, [%o0] ASI_BLK_P
+ ldd [%o1 + 0x018], %f6
+ faligndata %f0, %f2, %f16
+ ldd [%o1 + 0x020], %f8
+ faligndata %f2, %f4, %f18
+ ldd [%o1 + 0x028], %f10
+ faligndata %f4, %f6, %f20
+ ldd [%o1 + 0x030], %f12
+ faligndata %f6, %f8, %f22
+ ldd [%o1 + 0x038], %f14
+ faligndata %f8, %f10, %f24
+ cmp %g1, 0
+ be,pt %XCC, 1f
+ add %o0, 0x40, %o0
+ ldd [%o1 + 0x040], %f0
+1: faligndata %f10, %f12, %f26
+ faligndata %f12, %f14, %f28
+ faligndata %f14, %f0, %f30
+ stda %f16, [%o0] ASI_BLK_P
+ add %o0, 0x40, %o0
+ add %o1, 0x40, %o1
+ membar #Sync
+
+ /* Now we copy the (len modulo 64) bytes at the end.
+ * Note how we borrow the %f0 loaded above.
+ *
+ * Also notice how this code is careful not to perform a
+ * load past the end of the src buffer.
+ */
+loopend:
+ and %o2, 0x3f, %o2
+ andcc %o2, 0x38, %g2
+ be,pn %XCC, endcruft
+ subcc %g2, 0x8, %g2
+ be,pn %XCC, endcruft
+ cmp %g1, 0
+
+ be,a,pt %XCC, 1f
+ ldd [%o1 + 0x00], %f0
+
+1: ldd [%o1 + 0x08], %f2
+ add %o1, 0x8, %o1
+ sub %o2, 0x8, %o2
+ subcc %g2, 0x8, %g2
+ faligndata %f0, %f2, %f8
+ std %f8, [%o0 + 0x00]
+ be,pn %XCC, endcruft
+ add %o0, 0x8, %o0
+ ldd [%o1 + 0x08], %f0
+ add %o1, 0x8, %o1
+ sub %o2, 0x8, %o2
+ subcc %g2, 0x8, %g2
+ faligndata %f2, %f0, %f8
+ std %f8, [%o0 + 0x00]
+ bne,pn %XCC, 1b
+ add %o0, 0x8, %o0
+
+ /* If anything is left, we copy it one byte at a time.
+ * Note that %g1 is (src & 0x3) saved above before the
+ * alignaddr was performed.
+ */
+endcruft:
+ cmp %o2, 0
+ add %o1, %g1, %o1
+ VISExitHalf
+ be,pn %XCC, out
+ sub %o0, %o1, %o3
+
+ andcc %g1, 0x7, %g0
+ bne,pn %icc, small_copy_unaligned
+ andcc %o2, 0x8, %g0
+ be,pt %icc, 1f
+ nop
+ ldx [%o1], %o5
+ stx %o5, [%o1 + %o3]
+ add %o1, 0x8, %o1
+
+1: andcc %o2, 0x4, %g0
+ be,pt %icc, 1f
+ nop
+ lduw [%o1], %o5
+ stw %o5, [%o1 + %o3]
+ add %o1, 0x4, %o1
+
+1: andcc %o2, 0x2, %g0
+ be,pt %icc, 1f
+ nop
+ lduh [%o1], %o5
+ sth %o5, [%o1 + %o3]
+ add %o1, 0x2, %o1
+
+1: andcc %o2, 0x1, %g0
+ be,pt %icc, out
+ nop
+ ldub [%o1], %o5
+ ba,pt %xcc, out
+ stb %o5, [%o1 + %o3]
+
+medium_copy: /* 16 < len <= 64 */
+ bne,pn %XCC, small_copy_unaligned
+ sub %o0, %o1, %o3
+
+medium_copy_aligned:
+ andn %o2, 0x7, %o4
+ and %o2, 0x7, %o2
+1: subcc %o4, 0x8, %o4
+ ldx [%o1], %o5
+ stx %o5, [%o1 + %o3]
+ bgu,pt %XCC, 1b
+ add %o1, 0x8, %o1
+ andcc %o2, 0x4, %g0
+ be,pt %XCC, 1f
+ nop
+ sub %o2, 0x4, %o2
+ lduw [%o1], %o5
+ stw %o5, [%o1 + %o3]
+ add %o1, 0x4, %o1
+1: cmp %o2, 0
+ be,pt %XCC, out
+ nop
+ ba,pt %xcc, small_copy_unaligned
+ nop
+
+small_copy: /* 0 < len <= 16 */
+ andcc %o3, 0x3, %g0
+ bne,pn %XCC, small_copy_unaligned
+ sub %o0, %o1, %o3
+
+small_copy_aligned:
+ subcc %o2, 4, %o2
+ lduw [%o1], %g1
+ stw %g1, [%o1 + %o3]
+ bgu,pt %XCC, small_copy_aligned
+ add %o1, 4, %o1
+
+out: retl
+ mov %g5, %o0
+
+ .align 32
+small_copy_unaligned:
+ subcc %o2, 1, %o2
+ ldub [%o1], %g1
+ stb %g1, [%o1 + %o3]
+ bgu,pt %XCC, small_copy_unaligned
+ add %o1, 1, %o1
+ retl
+ mov %g5, %o0
+
+END(__memcpy_ultra3)
+
+#endif
\ No newline at end of file
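The "abs((dst & 0x3f) - 0x40)" computation documented at 'enter' above is
just the distance to the next 64-byte boundary.  In C (a sketch):

#include <stdint.h>

/* Bytes to copy before dst becomes 64-byte aligned, as computed by
   "sub %g2, 0x40, %g2; sub %g0, %g2, %g2" in __memcpy_ultra3.  */
static unsigned
bytes_to_align64 (uintptr_t dst)
{
  unsigned low = dst & 0x3f;
  return low ? 0x40 - low : 0;   /* the aligned case branches past this */
}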
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy.S
new file mode 100644
index 0000000000..b6396eeae5
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memcpy.S
@@ -0,0 +1,167 @@
+/* Multiple versions of memcpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by David S. Miller (davem@davemloft.net)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(memcpy)
+ .type memcpy, @gnu_indirect_function
+# ifdef SHARED
+ SETUP_PIC_REG_LEAF(o3, o5)
+# endif
+ set HWCAP_SPARC_CRYPTO, %o1
+ andcc %o0, %o1, %g0
+ be 1f
+ andcc %o0, HWCAP_SPARC_N2, %g0
+# ifdef SHARED
+ sethi %gdop_hix22(__memcpy_niagara4), %o1
+ xor %o1, %gdop_lox10(__memcpy_niagara4), %o1
+# else
+ set __memcpy_niagara4, %o1
+# endif
+ ba 10f
+ nop
+1: be 1f
+ andcc %o0, HWCAP_SPARC_BLKINIT, %g0
+# ifdef SHARED
+ sethi %gdop_hix22(__memcpy_niagara2), %o1
+ xor %o1, %gdop_lox10(__memcpy_niagara2), %o1
+# else
+ set __memcpy_niagara2, %o1
+# endif
+ ba 10f
+ nop
+1: be 1f
+ andcc %o0, HWCAP_SPARC_ULTRA3, %g0
+# ifdef SHARED
+ sethi %gdop_hix22(__memcpy_niagara1), %o1
+ xor %o1, %gdop_lox10(__memcpy_niagara1), %o1
+# else
+ set __memcpy_niagara1, %o1
+# endif
+ ba 10f
+ nop
+1: be 9f
+ nop
+# ifdef SHARED
+ sethi %gdop_hix22(__memcpy_ultra3), %o1
+ xor %o1, %gdop_lox10(__memcpy_ultra3), %o1
+# else
+ set __memcpy_ultra3, %o1
+# endif
+ ba 10f
+ nop
+9:
+# ifdef SHARED
+ sethi %gdop_hix22(__memcpy_ultra1), %o1
+ xor %o1, %gdop_lox10(__memcpy_ultra1), %o1
+# else
+ set __memcpy_ultra1, %o1
+# endif
+10:
+# ifdef SHARED
+ add %o3, %o1, %o1
+# endif
+ retl
+ mov %o1, %o0
+END(memcpy)
+
+ENTRY(__mempcpy)
+ .type __mempcpy, @gnu_indirect_function
+# ifdef SHARED
+ SETUP_PIC_REG_LEAF(o3, o5)
+# endif
+ set HWCAP_SPARC_CRYPTO, %o1
+ andcc %o0, %o1, %g0
+ be 1f
+ andcc %o0, HWCAP_SPARC_N2, %g0
+# ifdef SHARED
+ sethi %gdop_hix22(__mempcpy_niagara4), %o1
+ xor %o1, %gdop_lox10(__mempcpy_niagara4), %o1
+# else
+ set __mempcpy_niagara4, %o1
+# endif
+ ba 10f
+ nop
+1: be 1f
+ andcc %o0, HWCAP_SPARC_BLKINIT, %g0
+# ifdef SHARED
+ sethi %gdop_hix22(__mempcpy_niagara2), %o1
+ xor %o1, %gdop_lox10(__mempcpy_niagara2), %o1
+# else
+ set __mempcpy_niagara2, %o1
+# endif
+ ba 10f
+ nop
+1: be 1f
+ andcc %o0, HWCAP_SPARC_ULTRA3, %g0
+# ifdef SHARED
+ sethi %gdop_hix22(__mempcpy_niagara1), %o1
+ xor %o1, %gdop_lox10(__mempcpy_niagara1), %o1
+# else
+ set __mempcpy_niagara1, %o1
+# endif
+ ba 10f
+ nop
+1: be 9f
+ nop
+# ifdef SHARED
+ sethi %gdop_hix22(__mempcpy_ultra3), %o1
+ xor %o1, %gdop_lox10(__mempcpy_ultra3), %o1
+# else
+ set __mempcpy_ultra3, %o1
+# endif
+ ba 10f
+ nop
+9:
+# ifdef SHARED
+ sethi %gdop_hix22(__mempcpy_ultra1), %o1
+ xor %o1, %gdop_lox10(__mempcpy_ultra1), %o1
+# else
+ set __mempcpy_ultra1, %o1
+# endif
+10:
+# ifdef SHARED
+ add %o3, %o1, %o1
+# endif
+ retl
+ mov %o1, %o0
+END(__mempcpy)
+
+libc_hidden_builtin_def (memcpy)
+
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+#undef weak_alias
+#define weak_alias(x, y)
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+#define memcpy __memcpy_ultra1
+#define __mempcpy __mempcpy_ultra1
+
+#endif
+
+#include "../memcpy.S"
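Read as C, the memcpy selector above implements the following priority order (a hedged sketch; the HWCAP_SPARC_* flags are assumed to come from glibc's sparc hwcap headers, and the __memcpy_* implementations from the files in this patch):

#include <stddef.h>

extern void *__memcpy_niagara4 (void *, const void *, size_t);
extern void *__memcpy_niagara2 (void *, const void *, size_t);
extern void *__memcpy_niagara1 (void *, const void *, size_t);
extern void *__memcpy_ultra3 (void *, const void *, size_t);
extern void *__memcpy_ultra1 (void *, const void *, size_t);

typedef void *(*memcpy_fn) (void *, const void *, size_t);

/* Highest-priority hwcap wins; __memcpy_ultra1 is the fallback.  */
static memcpy_fn
select_memcpy (unsigned long hwcap)
{
  if (hwcap & HWCAP_SPARC_CRYPTO)       /* Niagara-4 and later */
    return __memcpy_niagara4;
  if (hwcap & HWCAP_SPARC_N2)
    return __memcpy_niagara2;
  if (hwcap & HWCAP_SPARC_BLKINIT)
    return __memcpy_niagara1;
  if (hwcap & HWCAP_SPARC_ULTRA3)
    return __memcpy_ultra3;
  return __memcpy_ultra1;
}

The __mempcpy resolver is identical except for the symbol names.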
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara1.S
new file mode 100644
index 0000000000..45b2251691
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara1.S
@@ -0,0 +1,177 @@
+/* Set a block of memory to some byte value. For SUN4V Niagara.
+ Copyright (C) 2006-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David S. Miller (davem@davemloft.net)
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define ASI_P 0x80
+#define ASI_PNF 0x82
+
+#ifndef XCC
+#define USE_BPR
+#define XCC xcc
+#endif
+
+#if IS_IN (libc)
+
+ .register %g2,#scratch
+
+ .text
+ .align 32
+
+ENTRY(__memset_niagara1)
+ /* %o0=buf, %o1=pat, %o2=len */
+ and %o1, 0xff, %o3
+ mov %o2, %o1
+ sllx %o3, 8, %g1
+ or %g1, %o3, %o2
+ sllx %o2, 16, %g1
+ or %g1, %o2, %o2
+ sllx %o2, 32, %g1
+ ba,pt %XCC, 1f
+ or %g1, %o2, %o2
+END(__memset_niagara1)
+
+ENTRY(__bzero_niagara1)
+ clr %o2
+1:
+# ifndef USE_BPR
+ srl %o1, 0, %o1
+# endif
+ brz,pn %o1, 90f
+ mov %o0, %o3
+
+ wr %g0, ASI_P, %asi
+
+ cmp %o1, 15
+ blu,pn %XCC, 70f
+ andcc %o0, 0x7, %g1
+ be,pt %XCC, 2f
+ mov 8, %g2
+ sub %g2, %g1, %g1
+ sub %o1, %g1, %o1
+1: stba %o2, [%o0 + 0x00] %asi
+ subcc %g1, 1, %g1
+ bne,pt %XCC, 1b
+ add %o0, 1, %o0
+2: cmp %o1, 128
+ blu,pn %XCC, 60f
+ andcc %o0, (64 - 1), %g1
+ be,pt %XCC, 40f
+ mov 64, %g2
+ sub %g2, %g1, %g1
+ sub %o1, %g1, %o1
+1: stxa %o2, [%o0 + 0x00] %asi
+ subcc %g1, 8, %g1
+ bne,pt %XCC, 1b
+ add %o0, 8, %o0
+
+40:
+ wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+ andn %o1, (64 - 1), %g1
+ sub %o1, %g1, %o1
+
+ andn %g1, (256 - 1), %g2
+ brz,pt %g2, 50f
+ and %g1, (256 - 1), %g1
+
+45:
+ stxa %o2, [%o0 + 0x00] %asi
+ stxa %o2, [%o0 + 0x08] %asi
+ stxa %o2, [%o0 + 0x10] %asi
+ stxa %o2, [%o0 + 0x18] %asi
+ stxa %o2, [%o0 + 0x20] %asi
+ stxa %o2, [%o0 + 0x28] %asi
+ stxa %o2, [%o0 + 0x30] %asi
+ stxa %o2, [%o0 + 0x38] %asi
+ stxa %o2, [%o0 + 0x40] %asi
+ stxa %o2, [%o0 + 0x48] %asi
+ stxa %o2, [%o0 + 0x50] %asi
+ stxa %o2, [%o0 + 0x58] %asi
+ stxa %o2, [%o0 + 0x60] %asi
+ stxa %o2, [%o0 + 0x68] %asi
+ stxa %o2, [%o0 + 0x70] %asi
+ stxa %o2, [%o0 + 0x78] %asi
+ stxa %o2, [%o0 + 0x80] %asi
+ stxa %o2, [%o0 + 0x88] %asi
+ stxa %o2, [%o0 + 0x90] %asi
+ stxa %o2, [%o0 + 0x98] %asi
+ stxa %o2, [%o0 + 0xa0] %asi
+ stxa %o2, [%o0 + 0xa8] %asi
+ stxa %o2, [%o0 + 0xb0] %asi
+ stxa %o2, [%o0 + 0xb8] %asi
+ stxa %o2, [%o0 + 0xc0] %asi
+ stxa %o2, [%o0 + 0xc8] %asi
+ stxa %o2, [%o0 + 0xd0] %asi
+ stxa %o2, [%o0 + 0xd8] %asi
+ stxa %o2, [%o0 + 0xe0] %asi
+ stxa %o2, [%o0 + 0xe8] %asi
+ stxa %o2, [%o0 + 0xf0] %asi
+ stxa %o2, [%o0 + 0xf8] %asi
+ subcc %g2, 256, %g2
+ bne,pt %XCC, 45b
+ add %o0, 256, %o0
+
+ brz,pn %g1, 55f
+ nop
+
+50:
+ stxa %o2, [%o0 + 0x00] %asi
+ stxa %o2, [%o0 + 0x08] %asi
+ stxa %o2, [%o0 + 0x10] %asi
+ stxa %o2, [%o0 + 0x18] %asi
+ stxa %o2, [%o0 + 0x20] %asi
+ stxa %o2, [%o0 + 0x28] %asi
+ stxa %o2, [%o0 + 0x30] %asi
+ stxa %o2, [%o0 + 0x38] %asi
+ subcc %g1, 64, %g1
+ bne,pt %XCC, 50b
+ add %o0, 64, %o0
+
+55:
+ wr %g0, ASI_P, %asi
+ brz,pn %o1, 80f
+60:
+ andncc %o1, 0x7, %g1
+ be,pn %XCC, 2f
+ sub %o1, %g1, %o1
+1: stxa %o2, [%o0 + 0x00] %asi
+ subcc %g1, 8, %g1
+ bne,pt %XCC, 1b
+ add %o0, 8, %o0
+2: brz,pt %o1, 80f
+ nop
+
+70:
+1: stba %o2, [%o0 + 0x00] %asi
+ subcc %o1, 1, %o1
+ bne,pt %XCC, 1b
+ add %o0, 1, %o0
+
+ /* fallthrough */
+
+80:
+ wr %g0, ASI_PNF, %asi
+
+90:
+ retl
+ mov %o3, %o0
+END(__bzero_niagara1)
+
+#endif
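The prologue of __memset_niagara1 widens the fill byte into a 64-bit store pattern with three shift-and-or doubling steps (sllx/or) before falling into the __bzero_niagara1 body. The same computation in C (a sketch; the helper name is ours):

#include <stdint.h>

/* 8 -> 16 -> 32 -> 64 bit pattern replication.  */
static uint64_t
replicate_byte (unsigned char c)
{
  uint64_t v = c;
  v |= v << 8;
  v |= v << 16;
  v |= v << 32;
  return v;
}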
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara4.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara4.S
new file mode 100644
index 0000000000..c04a07a7f9
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset-niagara4.S
@@ -0,0 +1,124 @@
+/* Set a block of memory to some byte value. For SUN4V Niagara-4.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David S. Miller (davem@davemloft.net)
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+
+#if IS_IN (libc)
+
+ .register %g2, #scratch
+ .register %g3, #scratch
+
+ .text
+ .align 32
+
+ENTRY(__memset_niagara4)
+ andcc %o1, 0xff, %o4
+ be,pt %icc, 1f
+ mov %o2, %o1
+ sllx %o4, 8, %g1
+ or %g1, %o4, %o2
+ sllx %o2, 16, %g1
+ or %g1, %o2, %o2
+ sllx %o2, 32, %g1
+ ba,pt %icc, 1f
+ or %g1, %o2, %o4
+END(__memset_niagara4)
+
+ .align 32
+ENTRY(__bzero_niagara4)
+ clr %o4
+1: cmp %o1, 16
+ ble %icc, .Ltiny
+ mov %o0, %o3
+ sub %g0, %o0, %g1
+ and %g1, 0x7, %g1
+ brz,pt %g1, .Laligned8
+ sub %o1, %g1, %o1
+1: stb %o4, [%o0 + 0x00]
+ subcc %g1, 1, %g1
+ bne,pt %icc, 1b
+ add %o0, 1, %o0
+.Laligned8:
+ cmp %o1, 64 + (64 - 8)
+ ble .Lmedium
+ sub %g0, %o0, %g1
+ andcc %g1, (64 - 1), %g1
+ brz,pn %g1, .Laligned64
+ sub %o1, %g1, %o1
+1: stx %o4, [%o0 + 0x00]
+ subcc %g1, 8, %g1
+ bne,pt %icc, 1b
+ add %o0, 0x8, %o0
+.Laligned64:
+ andn %o1, 64 - 1, %g1
+ sub %o1, %g1, %o1
+ brnz,pn %o4, .Lnon_bzero_loop
+ mov 0x20, %g2
+1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+ subcc %g1, 0x40, %g1
+ stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+ bne,pt %icc, 1b
+ add %o0, 0x40, %o0
+.Lpostloop:
+ cmp %o1, 8
+ bl,pn %icc, .Ltiny
+ membar #StoreStore|#StoreLoad
+.Lmedium:
+ andn %o1, 0x7, %g1
+ sub %o1, %g1, %o1
+1: stx %o4, [%o0 + 0x00]
+ subcc %g1, 0x8, %g1
+ bne,pt %icc, 1b
+ add %o0, 0x08, %o0
+ andcc %o1, 0x4, %g1
+ be,pt %icc, .Ltiny
+ sub %o1, %g1, %o1
+ stw %o4, [%o0 + 0x00]
+ add %o0, 0x4, %o0
+.Ltiny:
+ cmp %o1, 0
+ be,pn %icc, .Lexit
+1: subcc %o1, 1, %o1
+ stb %o4, [%o0 + 0x00]
+ bne,pt %icc, 1b
+ add %o0, 1, %o0
+.Lexit:
+ retl
+ mov %o3, %o0
+.Lnon_bzero_loop:
+ mov 0x08, %g3
+ mov 0x28, %o5
+1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+ subcc %g1, 0x40, %g1
+ stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+ stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+ stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P
+ add %o0, 0x10, %o0
+ stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+ stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+ stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+ stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P
+ bne,pt %icc, 1b
+ add %o0, 0x30, %o0
+ ba,a,pt %icc, .Lpostloop
+END(__bzero_niagara4)
+
+#endif
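__bzero_niagara4 computes the distance to the next 8-byte (and later 64-byte) boundary with the negate-and-mask idiom "sub %g0, %o0, %g1; and %g1, 0x7, %g1". Equivalently, in C (sketch only):

#include <stdint.h>

/* Bytes needed to reach an 8-byte boundary: (-p) mod 8.  */
static unsigned int
bytes_to_align8 (const void *p)
{
  return (unsigned int) (-(uintptr_t) p & 0x7);
}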
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset.S
new file mode 100644
index 0000000000..9469d5e7ce
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/memset.S
@@ -0,0 +1,124 @@
+/* Multiple versions of memset and bzero
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by David S. Miller (davem@davemloft.net)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(memset)
+ .type memset, @gnu_indirect_function
+# ifdef SHARED
+ SETUP_PIC_REG_LEAF(o3, o5)
+# endif
+ set HWCAP_SPARC_CRYPTO, %o1
+ andcc %o0, %o1, %g0
+ be 1f
+ andcc %o0, HWCAP_SPARC_BLKINIT, %g0
+# ifdef SHARED
+ sethi %gdop_hix22(__memset_niagara4), %o1
+ xor %o1, %gdop_lox10(__memset_niagara4), %o1
+# else
+ set __memset_niagara4, %o1
+# endif
+ ba 10f
+ nop
+1: be 9f
+ nop
+# ifdef SHARED
+ sethi %gdop_hix22(__memset_niagara1), %o1
+ xor %o1, %gdop_lox10(__memset_niagara1), %o1
+# else
+ set __memset_niagara1, %o1
+# endif
+ ba 10f
+ nop
+9:
+# ifdef SHARED
+ sethi %gdop_hix22(__memset_ultra1), %o1
+ xor %o1, %gdop_lox10(__memset_ultra1), %o1
+# else
+ set __memset_ultra1, %o1
+# endif
+10:
+# ifdef SHARED
+ add %o3, %o1, %o1
+# endif
+ retl
+ mov %o1, %o0
+END(memset)
+
+ENTRY(__bzero)
+ .type __bzero, @gnu_indirect_function
+# ifdef SHARED
+ SETUP_PIC_REG_LEAF(o3, o5)
+# endif
+ set HWCAP_SPARC_CRYPTO, %o1
+ andcc %o0, %o1, %g0
+ be 1f
+ andcc %o0, HWCAP_SPARC_BLKINIT, %g0
+# ifdef SHARED
+ sethi %gdop_hix22(__bzero_niagara4), %o1
+ xor %o1, %gdop_lox10(__bzero_niagara4), %o1
+# else
+ set __bzero_niagara4, %o1
+# endif
+ ba 10f
+ nop
+1: be 9f
+ nop
+# ifdef SHARED
+ sethi %gdop_hix22(__bzero_niagara1), %o1
+ xor %o1, %gdop_lox10(__bzero_niagara1), %o1
+# else
+ set __bzero_niagara1, %o1
+# endif
+ ba 10f
+ nop
+9:
+# ifdef SHARED
+ sethi %gdop_hix22(__bzero_ultra1), %o1
+ xor %o1, %gdop_lox10(__bzero_ultra1), %o1
+# else
+ set __bzero_ultra1, %o1
+# endif
+10:
+# ifdef SHARED
+ add %o3, %o1, %o1
+# endif
+ retl
+ mov %o1, %o0
+END(__bzero)
+
+weak_alias (__bzero, bzero)
+
+# undef weak_alias
+# define weak_alias(a, b)
+
+libc_hidden_builtin_def (memset)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#define memset __memset_ultra1
+#define __bzero __bzero_ultra1
+
+#endif
+
+#include "../memset.S"
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1-vis3.S
new file mode 100644
index 0000000000..7c4fa49ce4
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1-vis3.S
@@ -0,0 +1,73 @@
+! SPARC v9 64-bit VIS3 __mpn_mul_1 -- Multiply a limb vector with a single
+! limb and store the product in a second limb vector.
+!
+! Copyright (C) 2013-2017 Free Software Foundation, Inc.
+! This file is part of the GNU C Library.
+! Contributed by David S. Miller <davem@davemloft.net>
+!
+! The GNU C Library is free software; you can redistribute it and/or
+! modify it under the terms of the GNU Lesser General Public
+! License as published by the Free Software Foundation; either
+! version 2.1 of the License, or (at your option) any later version.
+!
+! The GNU C Library is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+! Lesser General Public License for more details.
+!
+! You should have received a copy of the GNU Lesser General Public
+! License along with the GNU C Library; if not, see
+! <http://www.gnu.org/licenses/>.
+
+#include <sysdep.h>
+
+#define res_ptr %o0
+#define s1_ptr %o1
+#define sz %o2
+#define s2_limb %o3
+#define carry %o5
+#define tmp1 %g1
+#define tmp2 %g2
+#define tmp3 %g3
+#define tmp4 %o4
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ENTRY(__mpn_mul_1_vis3)
+ subcc sz, 1, sz
+ be .Lfinal_limb
+ clr carry
+
+.Lloop:
+ ldx [s1_ptr + 0x00], tmp1
+ ldx [s1_ptr + 0x08], tmp4
+ mulx tmp1, s2_limb, tmp3
+ add s1_ptr, 0x10, s1_ptr
+ umulxhi tmp1, s2_limb, tmp2
+ sub sz, 2, sz
+ mulx tmp4, s2_limb, tmp1
+ add res_ptr, 0x10, res_ptr
+ umulxhi tmp4, s2_limb, tmp4
+ addcc carry, tmp3, tmp3
+ stx tmp3, [res_ptr - 0x10]
+ addxc %g0, tmp2, carry
+ addcc carry, tmp1, tmp1
+ addxc %g0, tmp4, carry
+ brgz sz, .Lloop
+ stx tmp1, [res_ptr - 0x08]
+
+ brlz,pt sz, .Lfinish
+ nop
+
+.Lfinal_limb:
+ ldx [s1_ptr + 0x00], tmp1
+ mulx tmp1, s2_limb, tmp3
+ umulxhi tmp1, s2_limb, tmp2
+ addcc carry, tmp3, tmp3
+ addxc %g0, tmp2, carry
+ stx tmp3, [res_ptr + 0x00]
+
+.Lfinish:
+ retl
+ mov carry, %o0
+END(__mpn_mul_1_vis3)
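The VIS3 loop above pairs mulx (low 64 bits of the product) with umulxhi (high 64 bits) and threads the carry through addcc/addxc, two limbs per iteration. The reference semantics, one limb at a time, in C (a sketch using GCC's unsigned __int128; not the glibc implementation):

#include <stddef.h>
#include <stdint.h>

static uint64_t
mpn_mul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v)
{
  uint64_t carry = 0;                    /* %o5 */
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v + carry;
      rp[i] = (uint64_t) p;              /* mulx + addcc */
      carry = (uint64_t) (p >> 64);      /* umulxhi + addxc */
    }
  return carry;
}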
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1.S
new file mode 100644
index 0000000000..75fca932b7
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/mul_1.S
@@ -0,0 +1,56 @@
+/* Multiple versions of mul_1
+
+ Copyright (C) 2013-2017 Free Software Foundation, Inc.
+ Contributed by David S. Miller (davem@davemloft.net)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(__mpn_mul_1)
+ .type __mpn_mul_1, @gnu_indirect_function
+# ifdef SHARED
+ SETUP_PIC_REG_LEAF(o3, o5)
+# endif
+ set HWCAP_SPARC_VIS3, %o1
+ andcc %o0, %o1, %g0
+ be 1f
+ nop
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_mul_1_vis3), %o1
+ xor %o1, %gdop_lox10(__mpn_mul_1_vis3), %o1
+# else
+ set __mpn_mul_1_vis3, %o1
+# endif
+ ba 10f
+ nop
+1:
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_mul_1_generic), %o1
+ xor %o1, %gdop_lox10(__mpn_mul_1_generic), %o1
+# else
+ set __mpn_mul_1_generic, %o1
+# endif
+10:
+# ifdef SHARED
+ add %o3, %o1, %o1
+# endif
+ retl
+ mov %o1, %o0
+END(__mpn_mul_1)
+
+#define __mpn_mul_1 __mpn_mul_1_generic
+#include "../mul_1.S"
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memcpy.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memcpy.c
new file mode 100644
index 0000000000..2452575343
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memcpy.c
@@ -0,0 +1 @@
+#include "../rtld-memcpy.c"
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memset.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memset.c
new file mode 100644
index 0000000000..c01eb0beb9
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/rtld-memset.c
@@ -0,0 +1 @@
+#include "../rtld-memset.c"
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-block.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-block.c
new file mode 100644
index 0000000000..9d65315a5a
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-block.c
@@ -0,0 +1,32 @@
+#include <sparc-ifunc.h>
+
+#define __sha256_process_block __sha256_process_block_generic
+extern void __sha256_process_block_generic (const void *buffer, size_t len,
+ struct sha256_ctx *ctx);
+
+#include <crypt/sha256-block.c>
+
+#undef __sha256_process_block
+
+extern void __sha256_process_block_crop (const void *buffer, size_t len,
+ struct sha256_ctx *ctx);
+
+static bool cpu_supports_sha256(int hwcap)
+{
+ unsigned long cfr;
+
+ if (!(hwcap & HWCAP_SPARC_CRYPTO))
+ return false;
+
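+  /* %asr26 is the crypto feature register on SPARC T4 and later;
+     bit 6 advertises the SHA-256 opcode.  */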
+ __asm__ ("rd %%asr26, %0" : "=r" (cfr));
+ if (cfr & (1 << 6))
+ return true;
+
+ return false;
+}
+
+extern void __sha256_process_block (const void *buffer, size_t len,
+ struct sha256_ctx *ctx);
+sparc_libc_ifunc (__sha256_process_block,
+ cpu_supports_sha256(hwcap) ? __sha256_process_block_crop
+ : __sha256_process_block_generic);
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-crop.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-crop.S
new file mode 100644
index 0000000000..8f07e4245a
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha256-crop.S
@@ -0,0 +1,101 @@
+/* SHA256 using sparc crypto opcodes.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David S. Miller (davem@davemloft.net)
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define SHA256 \
+ .word 0x81b02840;
+
+ .text
+ .align 32
+ENTRY(__sha256_process_block_crop)
+ /* %o0=buffer, %o1=len, %o2=CTX */
+ ldx [%o2 + 0x20], %g1
+ add %g1, %o1, %g1
+ stx %g1, [%o2 + 0x20]
+
+ ld [%o2 + 0x00], %f0
+ ld [%o2 + 0x04], %f1
+ ld [%o2 + 0x08], %f2
+ ld [%o2 + 0x0c], %f3
+ ld [%o2 + 0x10], %f4
+ ld [%o2 + 0x14], %f5
+ andcc %o0, 0x7, %g0
+ ld [%o2 + 0x18], %f6
+ bne,pn %xcc, 10f
+ ld [%o2 + 0x1c], %f7
+
+1:
+ ldd [%o0 + 0x00], %f8
+ ldd [%o0 + 0x08], %f10
+ ldd [%o0 + 0x10], %f12
+ ldd [%o0 + 0x18], %f14
+ ldd [%o0 + 0x20], %f16
+ ldd [%o0 + 0x28], %f18
+ ldd [%o0 + 0x30], %f20
+ ldd [%o0 + 0x38], %f22
+
+ SHA256
+
+ subcc %o1, 0x40, %o1
+ bne,pt %xcc, 1b
+ add %o0, 0x40, %o0
+
+5:
+ st %f0, [%o2 + 0x00]
+ st %f1, [%o2 + 0x04]
+ st %f2, [%o2 + 0x08]
+ st %f3, [%o2 + 0x0c]
+ st %f4, [%o2 + 0x10]
+ st %f5, [%o2 + 0x14]
+ st %f6, [%o2 + 0x18]
+ retl
+ st %f7, [%o2 + 0x1c]
+10:
+ alignaddr %o0, %g0, %o0
+
+ ldd [%o0 + 0x00], %f10
+1:
+ ldd [%o0 + 0x08], %f12
+ ldd [%o0 + 0x10], %f14
+ ldd [%o0 + 0x18], %f16
+ ldd [%o0 + 0x20], %f18
+ ldd [%o0 + 0x28], %f20
+ ldd [%o0 + 0x30], %f22
+ ldd [%o0 + 0x38], %f24
+ ldd [%o0 + 0x40], %f26
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ SHA256
+
+ subcc %o1, 0x40, %o1
+ fsrc2 %f26, %f10
+ bne,pt %xcc, 1b
+ add %o0, 0x40, %o0
+
+ ba,a,pt %xcc, 5b
+END(__sha256_process_block_crop)
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-block.c b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-block.c
new file mode 100644
index 0000000000..2863e05d09
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-block.c
@@ -0,0 +1,32 @@
+#include <sparc-ifunc.h>
+
+#define __sha512_process_block __sha512_process_block_generic
+extern void __sha512_process_block_generic (const void *buffer, size_t len,
+ struct sha512_ctx *ctx);
+
+#include <crypt/sha512-block.c>
+
+#undef __sha512_process_block
+
+extern void __sha512_process_block_crop (const void *buffer, size_t len,
+ struct sha512_ctx *ctx);
+
+static bool cpu_supports_sha512(int hwcap)
+{
+ unsigned long cfr;
+
+ if (!(hwcap & HWCAP_SPARC_CRYPTO))
+ return false;
+
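+  /* %asr26 is the crypto feature register; bit 7 advertises the
+     SHA-512 opcode (bit 6 is SHA-256).  */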
+ __asm__ ("rd %%asr26, %0" : "=r" (cfr));
+ if (cfr & (1 << 7))
+ return true;
+
+ return false;
+}
+
+extern void __sha512_process_block (const void *buffer, size_t len,
+ struct sha512_ctx *ctx);
+sparc_libc_ifunc (__sha512_process_block,
+ cpu_supports_sha512(hwcap) ? __sha512_process_block_crop
+ : __sha512_process_block_generic);
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-crop.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-crop.S
new file mode 100644
index 0000000000..f78354c485
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sha512-crop.S
@@ -0,0 +1,131 @@
+/* SHA512 using sparc crypto opcodes.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David S. Miller (davem@davemloft.net)
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define SHA512 \
+ .word 0x81b02860;
+
+ .text
+ .align 32
+ENTRY(__sha512_process_block_crop)
+ /* %o0=buffer, %o1=len, %o2=CTX */
+ ldx [%o2 + 0x48], %g1
+ add %g1, %o1, %o4
+ stx %o4, [%o2 + 0x48]
+ cmp %o4, %g1
+ bgeu,pt %xcc, 1f
+ nop
+ ldx [%o2 + 0x40], %g1
+ add %g1, 1, %g1
+ stx %g1, [%o2 + 0x40]
+
+1: ldd [%o2 + 0x00], %f0
+ ldd [%o2 + 0x08], %f2
+ ldd [%o2 + 0x10], %f4
+ ldd [%o2 + 0x18], %f6
+ ldd [%o2 + 0x20], %f8
+ ldd [%o2 + 0x28], %f10
+ andcc %o0, 0x7, %g0
+ ldd [%o2 + 0x30], %f12
+ bne,pn %xcc, 10f
+ ldd [%o2 + 0x38], %f14
+
+1:
+ ldd [%o0 + 0x00], %f16
+ ldd [%o0 + 0x08], %f18
+ ldd [%o0 + 0x10], %f20
+ ldd [%o0 + 0x18], %f22
+ ldd [%o0 + 0x20], %f24
+ ldd [%o0 + 0x28], %f26
+ ldd [%o0 + 0x30], %f28
+ ldd [%o0 + 0x38], %f30
+ ldd [%o0 + 0x40], %f32
+ ldd [%o0 + 0x48], %f34
+ ldd [%o0 + 0x50], %f36
+ ldd [%o0 + 0x58], %f38
+ ldd [%o0 + 0x60], %f40
+ ldd [%o0 + 0x68], %f42
+ ldd [%o0 + 0x70], %f44
+ ldd [%o0 + 0x78], %f46
+
+ SHA512
+
+ subcc %o1, 0x80, %o1
+ bne,pt %xcc, 1b
+ add %o0, 0x80, %o0
+
+5:
+ std %f0, [%o2 + 0x00]
+ std %f2, [%o2 + 0x08]
+ std %f4, [%o2 + 0x10]
+ std %f6, [%o2 + 0x18]
+ std %f8, [%o2 + 0x20]
+ std %f10, [%o2 + 0x28]
+ std %f12, [%o2 + 0x30]
+ retl
+ std %f14, [%o2 + 0x38]
+10:
+ alignaddr %o0, %g0, %o0
+
+ ldd [%o0 + 0x00], %f18
+1:
+ ldd [%o0 + 0x08], %f20
+ ldd [%o0 + 0x10], %f22
+ ldd [%o0 + 0x18], %f24
+ ldd [%o0 + 0x20], %f26
+ ldd [%o0 + 0x28], %f28
+ ldd [%o0 + 0x30], %f30
+ ldd [%o0 + 0x38], %f32
+ ldd [%o0 + 0x40], %f34
+ ldd [%o0 + 0x48], %f36
+ ldd [%o0 + 0x50], %f38
+ ldd [%o0 + 0x58], %f40
+ ldd [%o0 + 0x60], %f42
+ ldd [%o0 + 0x68], %f44
+ ldd [%o0 + 0x70], %f46
+ ldd [%o0 + 0x78], %f48
+ ldd [%o0 + 0x80], %f50
+
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+ faligndata %f26, %f28, %f24
+ faligndata %f28, %f30, %f26
+ faligndata %f30, %f32, %f28
+ faligndata %f32, %f34, %f30
+ faligndata %f34, %f36, %f32
+ faligndata %f36, %f38, %f34
+ faligndata %f38, %f40, %f36
+ faligndata %f40, %f42, %f38
+ faligndata %f42, %f44, %f40
+ faligndata %f44, %f46, %f42
+ faligndata %f46, %f48, %f44
+ faligndata %f48, %f50, %f46
+
+ SHA512
+
+ subcc %o1, 0x80, %o1
+ fsrc2 %f50, %f18
+ bne,pt %xcc, 1b
+ add %o0, 0x80, %o0
+
+ ba,a,pt %xcc, 5b
+END(__sha512_process_block_crop)
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n-vis3.S
new file mode 100644
index 0000000000..2d2a75dff8
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n-vis3.S
@@ -0,0 +1,71 @@
+! SPARC v9 64-bit VIS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+! and store difference in a third limb vector.
+!
+! Copyright (C) 2013-2017 Free Software Foundation, Inc.
+! This file is part of the GNU C Library.
+! Contributed by David S. Miller <davem@davemloft.net>
+!
+! The GNU C Library is free software; you can redistribute it and/or
+! modify it under the terms of the GNU Lesser General Public
+! License as published by the Free Software Foundation; either
+! version 2.1 of the License, or (at your option) any later version.
+!
+! The GNU C Library is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+! Lesser General Public License for more details.
+!
+! You should have received a copy of the GNU Lesser General Public
+! License along with the GNU C Library; if not, see
+! <http://www.gnu.org/licenses/>.
+
+#include <sysdep.h>
+
+#define res_ptr %o0
+#define s1_ptr %o1
+#define s2_ptr %o2
+#define sz %o3
+#define tmp1 %g1
+#define tmp2 %g2
+#define tmp3 %g3
+#define tmp4 %o4
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ENTRY(__mpn_sub_n_vis3)
+ subcc sz, 1, sz
+ be .Lfinal_limb
+ cmp %g0, 1
+
+.Lloop:
+ ldx [s2_ptr + 0x00], tmp1
+ add s2_ptr, 0x10, s2_ptr
+ ldx [s1_ptr + 0x00], tmp2
+ add s1_ptr, 0x10, s1_ptr
+ ldx [s2_ptr - 0x08], tmp3
+ add res_ptr, 0x10, res_ptr
+ ldx [s1_ptr - 0x08], tmp4
+ sub sz, 2, sz
+ xnor tmp1, %g0, tmp1
+ addxccc tmp1, tmp2, tmp1
+ stx tmp1, [res_ptr - 0x10]
+ xnor tmp3, %g0, tmp3
+ addxccc tmp3, tmp4, tmp3
+ brgz sz, .Lloop
+ stx tmp3, [res_ptr - 0x08]
+
+ brlz,pt sz, .Lfinish
+ nop
+
+.Lfinal_limb:
+ ldx [s2_ptr + 0x00], tmp1
+ ldx [s1_ptr + 0x00], tmp2
+ xnor tmp1, %g0, tmp1
+ addxccc tmp1, tmp2, tmp1
+ stx tmp1, [res_ptr + 0x00]
+
+.Lfinish:
+ clr %o0
+ retl
+ movcc %xcc, 1, %o0
+END(__mpn_sub_n_vis3)
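The subtraction above is performed as an addition via the identity a - b = a + ~b + 1: "cmp %g0, 1" pre-sets the carry flag, each s2 limb is complemented with xnor, and addxccc propagates the chain; the final movcc returns 1 exactly when the chain ends with carry clear, i.e. when a borrow is outstanding. One limb step in C (a sketch, again via GCC's unsigned __int128):

#include <stdint.h>

/* Returns a - b - *borrow and updates *borrow (0 or 1).  Carry-out
   of (a + ~b + carry_in) means no borrow-out.  */
static uint64_t
sub_limb_step (uint64_t a, uint64_t b, unsigned *borrow)
{
  unsigned carry_in = 1U - *borrow;
  unsigned __int128 s = (unsigned __int128) a + (uint64_t) ~b + carry_in;
  *borrow = 1U - (unsigned) (s >> 64);
  return (uint64_t) s;
}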
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n.S
new file mode 100644
index 0000000000..d20a286df1
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/sub_n.S
@@ -0,0 +1,56 @@
+/* Multiple versions of sub_n
+
+ Copyright (C) 2013-2017 Free Software Foundation, Inc.
+ Contributed by David S. Miller (davem@davemloft.net)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(__mpn_sub_n)
+ .type __mpn_sub_n, @gnu_indirect_function
+# ifdef SHARED
+ SETUP_PIC_REG_LEAF(o3, o5)
+# endif
+ set HWCAP_SPARC_VIS3, %o1
+ andcc %o0, %o1, %g0
+ be 1f
+ nop
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_sub_n_vis3), %o1
+ xor %o1, %gdop_lox10(__mpn_sub_n_vis3), %o1
+# else
+ set __mpn_sub_n_vis3, %o1
+# endif
+ ba 10f
+ nop
+1:
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_sub_n_generic), %o1
+ xor %o1, %gdop_lox10(__mpn_sub_n_generic), %o1
+# else
+ set __mpn_sub_n_generic, %o1
+# endif
+10:
+# ifdef SHARED
+ add %o3, %o1, %o1
+# endif
+ retl
+ mov %o1, %o0
+END(__mpn_sub_n)
+
+#define __mpn_sub_n __mpn_sub_n_generic
+#include "../sub_n.S"
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1-vis3.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1-vis3.S
new file mode 100644
index 0000000000..99644491e7
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1-vis3.S
@@ -0,0 +1,87 @@
+! SPARC v9 64-bit VIS3 __mpn_submul_1 -- Multiply a limb vector with a
+! limb and subtract the result from a second limb vector.
+!
+! Copyright (C) 2013-2017 Free Software Foundation, Inc.
+! This file is part of the GNU C Library.
+! Contributed by David S. Miller <davem@davemloft.net>
+!
+! The GNU C Library is free software; you can redistribute it and/or
+! modify it under the terms of the GNU Lesser General Public
+! License as published by the Free Software Foundation; either
+! version 2.1 of the License, or (at your option) any later version.
+!
+! The GNU C Library is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+! Lesser General Public License for more details.
+!
+! You should have received a copy of the GNU Lesser General Public
+! License along with the GNU C Library; if not, see
+! <http://www.gnu.org/licenses/>.
+
+#include <sysdep.h>
+
+#define res_ptr %i0
+#define s1_ptr %i1
+#define sz %i2
+#define s2_limb %i3
+#define carry %o5
+#define tmp1 %g1
+#define tmp2 %g2
+#define tmp3 %g3
+#define tmp4 %o4
+#define tmp5 %l0
+#define tmp6 %l1
+#define tmp7 %l2
+#define tmp8 %l3
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ENTRY(__mpn_submul_1_vis3)
+ save %sp, -176, %sp
+ subcc sz, 1, sz
+ be .Lfinal_limb
+ clr carry
+
+.Lloop:
+ ldx [s1_ptr + 0x00], tmp1
+ ldx [res_ptr + 0x00], tmp3
+ ldx [s1_ptr + 0x08], tmp2
+ ldx [res_ptr + 0x08], tmp4
+ mulx tmp1, s2_limb, tmp5
+ add s1_ptr, 0x10, s1_ptr
+ umulxhi tmp1, s2_limb, tmp6
+ add res_ptr, 0x10, res_ptr
+ mulx tmp2, s2_limb, tmp7
+ sub sz, 2, sz
+ umulxhi tmp2, s2_limb, tmp8
+ addcc carry, tmp5, tmp5
+ addxc %g0, tmp6, carry
+ subcc tmp3, tmp5, tmp5
+ addxc %g0, carry, carry
+ stx tmp5, [res_ptr - 0x10]
+ addcc carry, tmp7, tmp7
+ addxc %g0, tmp8, carry
+ subcc tmp4, tmp7, tmp7
+ addxc %g0, carry, carry
+ brgz sz, .Lloop
+ stx tmp7, [res_ptr - 0x08]
+
+ brlz,pt sz, .Lfinish
+ nop
+
+.Lfinal_limb:
+ ldx [s1_ptr + 0x00], tmp1
+ ldx [res_ptr + 0x00], tmp3
+ mulx tmp1, s2_limb, tmp5
+ umulxhi tmp1, s2_limb, tmp6
+ addcc carry, tmp5, tmp5
+ addxc %g0, tmp6, carry
+ subcc tmp3, tmp5, tmp5
+ addxc %g0, carry, carry
+ stx tmp5, [res_ptr + 0x00]
+
+.Lfinish:
+ jmpl %i7 + 8, %g0
+ restore carry, 0, %o0
+END(__mpn_submul_1_vis3)
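For reference, __mpn_submul_1 computes rp[i] -= up[i] * v across n limbs and returns the accumulated high/borrow limb: the addcc/addxc pairs above track the product carry, and the subcc/addxc pairs fold in the borrow from each limb subtraction. A C sketch of those semantics (not the glibc implementation):

#include <stddef.h>
#include <stdint.h>

static uint64_t
mpn_submul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v)
{
  uint64_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v + carry;
      uint64_t lo = (uint64_t) p;
      carry = (uint64_t) (p >> 64);
      uint64_t r = rp[i];
      rp[i] = r - lo;
      carry += (r < lo);        /* borrow from the limb subtract */
    }
  return carry;
}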
diff --git a/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1.S b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1.S
new file mode 100644
index 0000000000..3c297d989b
--- /dev/null
+++ b/REORG.TODO/sysdeps/sparc/sparc64/multiarch/submul_1.S
@@ -0,0 +1,56 @@
+/* Multiple versions of submul_1
+
+ Copyright (C) 2013-2017 Free Software Foundation, Inc.
+ Contributed by David S. Miller (davem@davemloft.net)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(__mpn_submul_1)
+ .type __mpn_submul_1, @gnu_indirect_function
+# ifdef SHARED
+ SETUP_PIC_REG_LEAF(o3, o5)
+# endif
+ set HWCAP_SPARC_VIS3, %o1
+ andcc %o0, %o1, %g0
+ be 1f
+ nop
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_submul_1_vis3), %o1
+ xor %o1, %gdop_lox10(__mpn_submul_1_vis3), %o1
+# else
+ set __mpn_submul_1_vis3, %o1
+# endif
+ ba 10f
+ nop
+1:
+# ifdef SHARED
+ sethi %gdop_hix22(__mpn_submul_1_generic), %o1
+ xor %o1, %gdop_lox10(__mpn_submul_1_generic), %o1
+# else
+ set __mpn_submul_1_generic, %o1
+# endif
+10:
+# ifdef SHARED
+ add %o3, %o1, %o1
+# endif
+ retl
+ mov %o1, %o0
+END(__mpn_submul_1)
+
+#define __mpn_submul_1 __mpn_submul_1_generic
+#include "../submul_1.S"