author    Chris Metcalf <cmetcalf@tilera.com>  2012-11-02 12:53:57 -0400
committer Chris Metcalf <cmetcalf@tilera.com>  2012-11-06 09:24:19 -0500
commit    cd84016efe83d92ee3903fef37f79ca2bafb3985 (patch)
tree      7a92df4a9a081a6f20b8fd68f31139b6336e595c /ports/sysdeps
parent    82477c28f46c579a149a8333c07233e9f4e43408 (diff)
Optimize tile (mostly tilegx) memcpy and memmove performance.
- Override <memcopy.h> so we use full 8-byte word copies on tilegx32
  for memmove, then use op_t in memcpy instead of the previous
  locally-defined word_t just to avoid proliferating identical types.
- Fix bug in memcpy prefetch that caused us to never prefetch past
  the first cache line.
- Optimize misaligned memcpy by inlining _wordcopy_fwd_dest_aligned
  instead of just doing a dumb word-at-a-time copy.
- Make memcpy safe for forward copies by doing all the loads from a
  given cache line prior to doing a wh64 (cache line zero-fill) on
  the destination.  Remove now-redundant src == dst check.
- Copy and optimize the generic wordcopy.c routines to use the tile
  "double align" instruction instead of the MERGE macro; to avoid
  offset addressing mode (which tile doesn't have) by rewriting the
  pointer math to load and store with a zero index; and to use
  post-increment addresses in the inner loops to improve scheduling.
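[Editor's note, not part of the commit: the "double align" step mentioned
above is a tile builtin.  As a rough guide, the following is a minimal
portable sketch of the merge it performs; dblalign_model is a
hypothetical stand-in for __insn_dblalign, and little-endian byte order
with an 8-byte op_t is assumed.]

#include <stdint.h>

/* Given two consecutive aligned 8-byte words A and B, and the original
   unaligned source pointer SRC, produce the 8 source bytes starting at
   SRC (little-endian assumed).  On tile this is a single instruction;
   the shift-by-64 case is avoided for an already-aligned pointer.  */
static inline uint64_t
dblalign_model (uint64_t a, uint64_t b, const void *src)
{
  unsigned int shift = ((uintptr_t) src & 7) * 8;
  return shift ? (a >> shift) | (b << (64 - shift)) : a;
}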
Diffstat (limited to 'ports/sysdeps')
-rw-r--r--  ports/sysdeps/tile/memcopy.h       |  27
-rw-r--r--  ports/sysdeps/tile/tilegx/memcpy.c | 200
-rw-r--r--  ports/sysdeps/tile/wordcopy.c      | 449
3 files changed, 609 insertions(+), 67 deletions(-)
diff --git a/ports/sysdeps/tile/memcopy.h b/ports/sysdeps/tile/memcopy.h
new file mode 100644
index 0000000000..2bc3fce686
--- /dev/null
+++ b/ports/sysdeps/tile/memcopy.h
@@ -0,0 +1,27 @@
+/* memcopy.h -- definitions for memory copy functions. Tile version.
+ Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdeps/generic/memcopy.h>
+#include <bits/wordsize.h>
+
+/* Support more efficient copying on tilegx32, which supports
+ long long as a native 64-bit type. */
+#if defined (__tilegx__) && __WORDSIZE == 32
+# undef op_t
+# define op_t unsigned long long int
+#endif
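[Editor's note, not part of the patch: the generic string routines size
their copies by OPSIZ, which the generic <memcopy.h> derives from
sizeof (op_t), so the override above makes tilegx32 move 8 bytes per
operation.  A standalone sketch under that assumption:]

#include <stdio.h>

/* op_t as overridden above; OPSIZ as the generic header derives it.  */
typedef unsigned long long int op_t;
#define OPSIZ (sizeof (op_t))

int
main (void)
{
  /* Prints 8: word-copy loops move 8 bytes per iteration even though
     the native register width on tilegx32 is 32 bits.  */
  printf ("%zu\n", OPSIZ);
  return 0;
}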
diff --git a/ports/sysdeps/tile/tilegx/memcpy.c b/ports/sysdeps/tile/tilegx/memcpy.c
index dd6e30dd60..5b015f39d2 100644
--- a/ports/sysdeps/tile/tilegx/memcpy.c
+++ b/ports/sysdeps/tile/tilegx/memcpy.c
@@ -19,11 +19,9 @@
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
+#include <memcopy.h>
#include <arch/chip.h>
-/* Must be 8 bytes in size. */
-#define word_t uint64_t
-
/* How many cache lines ahead should we prefetch? */
#define PREFETCH_LINES_AHEAD 3
@@ -34,8 +32,8 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
const char *__restrict src1 = (const char *) srcv;
const char *__restrict src1_end;
const char *__restrict prefetch;
- word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
- word_t final; /* Final bytes to write to trailing word, if any */
+ op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
+ op_t final; /* Final bytes to write to trailing word, if any */
long i;
if (n < 16)
@@ -55,101 +53,169 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
{
__insn_prefetch (prefetch);
prefetch += CHIP_L2_LINE_SIZE ();
- prefetch = (prefetch > src1_end) ? prefetch : src1;
+ prefetch = (prefetch < src1_end) ? prefetch : src1;
}
/* Copy bytes until dst is word-aligned. */
- for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
+ for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--)
*dst1++ = *src1++;
/* 8-byte pointer to destination memory. */
- dst8 = (word_t *) dst1;
+ dst8 = (op_t *) dst1;
- if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
+ if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0))
{
- /* Misaligned copy. Copy 8 bytes at a time, but don't bother
- with other fanciness.
- TODO: Consider prefetching and using wh64 as well. */
+ /* Misaligned copy. Use glibc's _wordcopy_fwd_dest_aligned, but
+ inline it to avoid prologue/epilogue. TODO: Consider
+ prefetching and using wh64 as well. */
+ void * srci;
+ op_t a0, a1, a2, a3;
+ long int dstp = (long int) dst1;
+ long int srcp = (long int) src1;
+ long int len = n / OPSIZ;
- /* Create an aligned src8. */
- const word_t *__restrict src8 =
- (const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
- word_t b;
+ /* Save the initial source pointer so we know the number of
+ bytes to shift for merging two unaligned results. */
+ srci = (void *) srcp;
- word_t a = *src8++;
- for (; n >= sizeof (word_t); n -= sizeof (word_t))
- {
- b = *src8++;
- a = __insn_dblalign (a, b, src1);
- *dst8++ = a;
- a = b;
- }
+ /* Make SRCP aligned by rounding it down to the beginning of the
+ `op_t' it points in the middle of. */
+ srcp &= -OPSIZ;
+
+ switch (len % 4)
+ {
+ case 2:
+ a1 = ((op_t *) srcp)[0];
+ a2 = ((op_t *) srcp)[1];
+ len += 2;
+ srcp += 2 * OPSIZ;
+ goto do1;
+ case 3:
+ a0 = ((op_t *) srcp)[0];
+ a1 = ((op_t *) srcp)[1];
+ len += 1;
+ srcp += 2 * OPSIZ;
+ goto do2;
+ case 0:
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ return dstv;
+ a3 = ((op_t *) srcp)[0];
+ a0 = ((op_t *) srcp)[1];
+ len += 0;
+ srcp += 2 * OPSIZ;
+ goto do3;
+ case 1:
+ a2 = ((op_t *) srcp)[0];
+ a3 = ((op_t *) srcp)[1];
+ srcp += 2 * OPSIZ;
+ len -= 1;
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ goto do0;
+ goto do4; /* No-op. */
+ }
+ do
+ {
+ do4:
+ a0 = ((op_t *) srcp)[0];
+ a2 = __insn_dblalign (a2, a3, srci);
+ ((op_t *) dstp)[0] = a2;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do3:
+ a1 = ((op_t *) srcp)[0];
+ a3 = __insn_dblalign (a3, a0, srci);
+ ((op_t *) dstp)[0] = a3;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do2:
+ a2 = ((op_t *) srcp)[0];
+ a0 = __insn_dblalign (a0, a1, srci);
+ ((op_t *) dstp)[0] = a0;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do1:
+ a3 = ((op_t *) srcp)[0];
+ a1 = __insn_dblalign (a1, a2, srci);
+ ((op_t *) dstp)[0] = a1;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ len -= 4;
+ }
+ while (len != 0);
+
+ /* This is the right position for do0. Please don't move
+ it into the loop. */
+ do0:
+ ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci);
+
+ n = n % OPSIZ;
if (n == 0)
- return dstv;
+ return dstv;
- b = ((const char *) src8 <= src1_end) ? *src8 : 0;
+ a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0;
- /* Final source bytes to write to trailing partial word, if any. */
- final = __insn_dblalign (a, b, src1);
+ final = __insn_dblalign (a3, a0, srci);
+ dst8 = (op_t *)(dstp + OPSIZ);
}
else
{
/* Aligned copy. */
- const word_t *__restrict src8 = (const word_t *) src1;
+ const op_t *__restrict src8 = (const op_t *) src1;
/* src8 and dst8 are both word-aligned. */
if (n >= CHIP_L2_LINE_SIZE ())
{
/* Copy until 'dst' is cache-line-aligned. */
for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
- n -= sizeof (word_t))
+ n -= sizeof (op_t))
*dst8++ = *src8++;
- /* If copying to self, return. The test is cheap enough
- that we do it despite the fact that the memcpy() contract
- doesn't require us to support overlapping dst and src.
- This is the most common case of overlap, and any close
- overlap will cause corruption due to the wh64 below.
- This case is particularly important since the compiler
- will emit memcpy() calls for aggregate copies even if it
- can't prove that src != dst. */
- if (__builtin_expect (dst8 == src8, 0))
- return dstv;
-
for (; n >= CHIP_L2_LINE_SIZE ();)
- {
- __insn_wh64 (dst8);
-
- /* Prefetch and advance to next line to prefetch, but
- don't go past the end. */
- __insn_prefetch (prefetch);
- prefetch += CHIP_L2_LINE_SIZE ();
- prefetch = (prefetch > src1_end) ? prefetch :
- (const char *) src8;
-
- /* Copy an entire cache line. Manually unrolled to
- avoid idiosyncracies of compiler unrolling. */
-#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
- COPY_WORD (0);
- COPY_WORD (1);
- COPY_WORD (2);
- COPY_WORD (3);
- COPY_WORD (4);
- COPY_WORD (5);
- COPY_WORD (6);
- COPY_WORD (7);
+ {
+ op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ /* Prefetch and advance to next line to prefetch, but
+ don't go past the end. */
+ __insn_prefetch (prefetch);
+ prefetch += CHIP_L2_LINE_SIZE ();
+ prefetch = (prefetch < src1_end) ? prefetch :
+ (const char *) src8;
+
+ /* Do all the loads before wh64. This is necessary if
+ [src8, src8+7] and [dst8, dst8+7] share the same
+ cache line and dst8 <= src8, as can be the case when
+ called from memmove, or with code tested on x86 whose
+ memcpy always works with forward copies. */
+ tmp0 = *src8++;
+ tmp1 = *src8++;
+ tmp2 = *src8++;
+ tmp3 = *src8++;
+ tmp4 = *src8++;
+ tmp5 = *src8++;
+ tmp6 = *src8++;
+ tmp7 = *src8++;
+
+ __insn_wh64 (dst8);
+
+ *dst8++ = tmp0;
+ *dst8++ = tmp1;
+ *dst8++ = tmp2;
+ *dst8++ = tmp3;
+ *dst8++ = tmp4;
+ *dst8++ = tmp5;
+ *dst8++ = tmp6;
+ *dst8++ = tmp7;
+
+ n -= 64;
+ }
#if CHIP_L2_LINE_SIZE() != 64
# error "Fix code that assumes particular L2 cache line size."
#endif
-
- dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
- src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
- }
}
- for (; n >= sizeof (word_t); n -= sizeof (word_t))
+ for (; n >= sizeof (op_t); n -= sizeof (op_t))
*dst8++ = *src8++;
if (__builtin_expect (n == 0, 1))
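[Editor's note, not part of the patch: the heart of the forward-copy fix
above is that every load from a source cache line happens before the
destination line is zero-filled with wh64.  A minimal sketch of that
ordering, with the tile-specific prefetch and wh64 builtins reduced to a
comment; copy_line is a hypothetical helper, and a 64-byte line of eight
8-byte words is assumed.]

#include <stdint.h>

/* Copy one 64-byte line.  All loads happen before the destination line
   is touched, so zero-filling the destination (wh64 on tile) cannot
   discard source bytes when the source and destination lines overlap
   in the forward direction, as they can when called from memmove.  */
static void
copy_line (uint64_t *dst8, const uint64_t *src8)
{
  uint64_t t0, t1, t2, t3, t4, t5, t6, t7;

  t0 = src8[0]; t1 = src8[1]; t2 = src8[2]; t3 = src8[3];
  t4 = src8[4]; t5 = src8[5]; t6 = src8[6]; t7 = src8[7];

  /* __insn_prefetch (...) and __insn_wh64 (dst8) would go here.  */

  dst8[0] = t0; dst8[1] = t1; dst8[2] = t2; dst8[3] = t3;
  dst8[4] = t4; dst8[5] = t5; dst8[6] = t6; dst8[7] = t7;
}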
diff --git a/ports/sysdeps/tile/wordcopy.c b/ports/sysdeps/tile/wordcopy.c
new file mode 100644
index 0000000000..f978d8fdcb
--- /dev/null
+++ b/ports/sysdeps/tile/wordcopy.c
@@ -0,0 +1,449 @@
+/* wordcopy.c -- subroutines for memory copy functions. Tile version.
+ Copyright (C) 1991-2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* To optimize for tile, we make the following changes from the
+ default glibc version:
+ - Use the double align instruction instead of the MERGE macro.
+ - Since we don't have offset addressing mode, make sure the loads /
+ stores in the inner loop always have indices of 0.
+ - Use post-increment addresses in the inner loops, which yields
+ better scheduling. */
+
+/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...! */
+
+#include <stddef.h>
+#include <memcopy.h>
+
+/* Provide the appropriate dblalign builtin to shift two registers
+ based on the alignment of a pointer held in a third register. */
+#ifdef __tilegx__
+#define DBLALIGN __insn_dblalign
+#else
+#define DBLALIGN __insn_dword_align
+#endif
+
+/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
+ block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+ Both SRCP and DSTP should be aligned for memory operations on `op_t's. */
+
+void
+_wordcopy_fwd_aligned (dstp, srcp, len)
+ long int dstp;
+ long int srcp;
+ size_t len;
+{
+ op_t a0, a1;
+
+ switch (len % 8)
+ {
+ case 2:
+ a0 = ((op_t *) srcp)[0];
+ srcp += OPSIZ;
+ len += 6;
+ goto do1;
+ case 3:
+ a1 = ((op_t *) srcp)[0];
+ srcp += OPSIZ;
+ len += 5;
+ goto do2;
+ case 4:
+ a0 = ((op_t *) srcp)[0];
+ srcp += OPSIZ;
+ len += 4;
+ goto do3;
+ case 5:
+ a1 = ((op_t *) srcp)[0];
+ srcp += OPSIZ;
+ len += 3;
+ goto do4;
+ case 6:
+ a0 = ((op_t *) srcp)[0];
+ srcp += OPSIZ;
+ len += 2;
+ goto do5;
+ case 7:
+ a1 = ((op_t *) srcp)[0];
+ srcp += OPSIZ;
+ len += 1;
+ goto do6;
+
+ case 0:
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ return;
+ a0 = ((op_t *) srcp)[0];
+ srcp += OPSIZ;
+ goto do7;
+ case 1:
+ a1 = ((op_t *) srcp)[0];
+ srcp += OPSIZ;
+ len -= 1;
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ goto do0;
+ goto do8; /* No-op. */
+ }
+
+ do
+ {
+ do8:
+ a0 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a1;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do7:
+ a1 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a0;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do6:
+ a0 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a1;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do5:
+ a1 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a0;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do4:
+ a0 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a1;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do3:
+ a1 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a0;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do2:
+ a0 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a1;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do1:
+ a1 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a0;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+
+ len -= 8;
+ }
+ while (len != 0);
+
+ /* This is the right position for do0. Please don't move
+ it into the loop. */
+ do0:
+ ((op_t *) dstp)[0] = a1;
+}
+
+/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
+ block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+ DSTP should be aligned for memory operations on `op_t's, but SRCP must
+ *not* be aligned. */
+
+void
+_wordcopy_fwd_dest_aligned (dstp, srcp, len)
+ long int dstp;
+ long int srcp;
+ size_t len;
+{
+ void * srci;
+ op_t a0, a1, a2, a3;
+
+ /* Save the initial source pointer so we know the number of bytes to
+ shift for merging two unaligned results. */
+ srci = (void *) srcp;
+
+ /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
+ it points in the middle of. */
+ srcp &= -OPSIZ;
+
+ switch (len % 4)
+ {
+ case 2:
+ a1 = ((op_t *) srcp)[0];
+ a2 = ((op_t *) srcp)[1];
+ len += 2;
+ srcp += 2 * OPSIZ;
+ goto do1;
+ case 3:
+ a0 = ((op_t *) srcp)[0];
+ a1 = ((op_t *) srcp)[1];
+ len += 1;
+ srcp += 2 * OPSIZ;
+ goto do2;
+ case 0:
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ return;
+ a3 = ((op_t *) srcp)[0];
+ a0 = ((op_t *) srcp)[1];
+ len += 0;
+ srcp += 2 * OPSIZ;
+ goto do3;
+ case 1:
+ a2 = ((op_t *) srcp)[0];
+ a3 = ((op_t *) srcp)[1];
+ srcp += 2 * OPSIZ;
+ len -= 1;
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ goto do0;
+ goto do4; /* No-op. */
+ }
+
+ do
+ {
+ do4:
+ a0 = ((op_t *) srcp)[0];
+ a2 = DBLALIGN (a2, a3, srci);
+ ((op_t *) dstp)[0] = a2;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do3:
+ a1 = ((op_t *) srcp)[0];
+ a3 = DBLALIGN (a3, a0, srci);
+ ((op_t *) dstp)[0] = a3;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do2:
+ a2 = ((op_t *) srcp)[0];
+ a0 = DBLALIGN (a0, a1, srci);
+ ((op_t *) dstp)[0] = a0;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ do1:
+ a3 = ((op_t *) srcp)[0];
+ a1 = DBLALIGN (a1, a2, srci);
+ ((op_t *) dstp)[0] = a1;
+ srcp += OPSIZ;
+ dstp += OPSIZ;
+ len -= 4;
+ }
+ while (len != 0);
+
+ /* This is the right position for do0. Please don't move
+ it into the loop. */
+ do0:
+ ((op_t *) dstp)[0] = DBLALIGN (a2, a3, srci);
+}
+
+/* _wordcopy_bwd_aligned -- Copy block finishing right before
+ SRCP to block finishing right before DSTP with LEN `op_t' words
+ (not LEN bytes!). Both SRCP and DSTP should be aligned for memory
+ operations on `op_t's. */
+
+void
+_wordcopy_bwd_aligned (dstp, srcp, len)
+ long int dstp;
+ long int srcp;
+ size_t len;
+{
+ op_t a0, a1;
+ long int srcp1;
+
+ srcp1 = srcp - 1 * OPSIZ;
+ srcp -= 2 * OPSIZ;
+ dstp -= 1 * OPSIZ;
+
+ switch (len % 8)
+ {
+ case 2:
+ a0 = ((op_t *) srcp1)[0];
+ len += 6;
+ goto do1;
+ case 3:
+ a1 = ((op_t *) srcp1)[0];
+ len += 5;
+ goto do2;
+ case 4:
+ a0 = ((op_t *) srcp1)[0];
+ len += 4;
+ goto do3;
+ case 5:
+ a1 = ((op_t *) srcp1)[0];
+ len += 3;
+ goto do4;
+ case 6:
+ a0 = ((op_t *) srcp1)[0];
+ len += 2;
+ goto do5;
+ case 7:
+ a1 = ((op_t *) srcp1)[0];
+ len += 1;
+ goto do6;
+
+ case 0:
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ return;
+ a0 = ((op_t *) srcp1)[0];
+ goto do7;
+ case 1:
+ a1 = ((op_t *) srcp1)[0];
+ len -= 1;
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ goto do0;
+ goto do8; /* No-op. */
+ }
+
+ do
+ {
+ do8:
+ a0 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a1;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do7:
+ a1 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a0;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do6:
+ a0 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a1;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do5:
+ a1 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a0;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do4:
+ a0 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a1;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do3:
+ a1 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a0;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do2:
+ a0 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a1;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do1:
+ a1 = ((op_t *) srcp)[0];
+ ((op_t *) dstp)[0] = a0;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+
+ len -= 8;
+ }
+ while (len != 0);
+
+ /* This is the right position for do0. Please don't move
+ it into the loop. */
+ do0:
+ ((op_t *) dstp)[0] = a1;
+}
+
+/* _wordcopy_bwd_dest_aligned -- Copy block finishing right
+ before SRCP to block finishing right before DSTP with LEN `op_t'
+ words (not LEN bytes!). DSTP should be aligned for memory
+ operations on `op_t', but SRCP must *not* be aligned. */
+
+void
+_wordcopy_bwd_dest_aligned (dstp, srcp, len)
+ long int dstp;
+ long int srcp;
+ size_t len;
+{
+ void * srci;
+ op_t a0, a1, a2, a3;
+ op_t b0, b1, b2, b3;
+
+ /* Save the initial source pointer so we know the number of bytes to
+ shift for merging two unaligned results. */
+ srci = (void *) srcp;
+
+ /* Make SRCP aligned by rounding it down to the beginning of the op_t
+ it points in the middle of. */
+ srcp &= -OPSIZ;
+ srcp += OPSIZ;
+
+ switch (len % 4)
+ {
+ case 2:
+ srcp -= 3 * OPSIZ;
+ dstp -= 1 * OPSIZ;
+ b2 = ((op_t *) srcp)[2];
+ b1 = a1 = ((op_t *) srcp)[1];
+ len += 2;
+ goto do1;
+ case 3:
+ srcp -= 3 * OPSIZ;
+ dstp -= 1 * OPSIZ;
+ b3 = ((op_t *) srcp)[2];
+ b2 = a2 = ((op_t *) srcp)[1];
+ len += 1;
+ goto do2;
+ case 0:
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ return;
+ srcp -= 3 * OPSIZ;
+ dstp -= 1 * OPSIZ;
+ b0 = ((op_t *) srcp)[2];
+ b3 = a3 = ((op_t *) srcp)[1];
+ goto do3;
+ case 1:
+ srcp -= 3 * OPSIZ;
+ dstp -= 1 * OPSIZ;
+ b1 = ((op_t *) srcp)[2];
+ b0 = a0 = ((op_t *) srcp)[1];
+ len -= 1;
+ if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+ goto do0;
+ goto do4; /* No-op. */
+ }
+
+ do
+ {
+ do4:
+ b3 = a3 = ((op_t *) srcp)[0];
+ a0 = DBLALIGN (a0, b1, srci);
+ ((op_t *) dstp)[0] = a0;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do3:
+ b2 = a2 = ((op_t *) srcp)[0];
+ a3 = DBLALIGN (a3, b0, srci);
+ ((op_t *) dstp)[0] = a3;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do2:
+ b1 = a1 = ((op_t *) srcp)[0];
+ a2 = DBLALIGN (a2, b3, srci);
+ ((op_t *) dstp)[0] = a2;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+ do1:
+ b0 = a0 = ((op_t *) srcp)[0];
+ a1 = DBLALIGN (a1, b2, srci);
+ ((op_t *) dstp)[0] = a1;
+ srcp -= OPSIZ;
+ dstp -= OPSIZ;
+
+ len -= 4;
+ }
+ while (len != 0);
+
+ /* This is the right position for do0. Please don't move
+ it into the loop. */
+ do0:
+ a0 = DBLALIGN (a0, b1, srci);
+ ((op_t *) dstp)[0] = a0;
+}
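[Editor's note, not part of the patch: to make the addressing-mode
comment at the top of this file concrete, tile has no register+offset
addressing, so every load and store above uses a zero index and the
pointer values themselves advance, which the compiler can turn into
post-increment accesses.  A stripped-down sketch of that inner-loop
shape, with a hypothetical copy_words helper and op_t assumed to be the
machine word:]

#include <stddef.h>

typedef unsigned long int op_t;     /* assumed machine word */
#define OPSIZ (sizeof (op_t))

/* Zero-index / post-increment style: each access is ((op_t *) p)[0],
   and p itself is bumped by OPSIZ.  */
static void
copy_words (long int dstp, long int srcp, size_t len)
{
  while (len-- != 0)
    {
      op_t a = ((op_t *) srcp)[0];  /* load with zero index */
      ((op_t *) dstp)[0] = a;       /* store with zero index */
      srcp += OPSIZ;
      dstp += OPSIZ;
    }
}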