aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog6
-rw-r--r--sysdeps/powerpc/powerpc32/power7/memcpy.S88
-rw-r--r--sysdeps/powerpc/powerpc64/power7/memcpy.S86
3 files changed, 151 insertions, 29 deletions
diff --git a/ChangeLog b/ChangeLog
index c90f2c7754..429767dc6e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2011-07-28 Will Schmidt <will_schmidt@vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc32/power7/memcpy.S: Optimize the
+ aligned copy for power7 with vector-scalar instructions.
+ * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+
2011-07-24 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Simplify
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index f0c332f1ba..ec7055745c 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -1,5 +1,5 @@
/* Optimized memcpy implementation for PowerPC32/POWER7.
- Copyright (C) 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -116,24 +116,82 @@ L(copy_GE_32_aligned_cont):
stfd 6,0(3)
addi 10,3,8
+L(aligned_copy):
+ /* Main aligned copy loop. Copies up to 128-bytes at a time. */
.align 4
-4: /* Main aligned copy loop. Copies 32-bytes at a time. */
- lfd 6,0(11)
- lfd 7,8(11)
- lfd 8,16(11)
- lfd 0,24(11)
- addi 11,11,32
+4:
+ /* check for any 32-byte or 64-byte lumps that are outside of a
+ nice 128-byte range. R8 contains the number of 32-byte
+ lumps, so drop this into the CR, and use the SO/EQ bits to help
+ handle the 32- or 64- byte lumps. Then handle the rest with an
+ unrolled 128-bytes-at-a-time copy loop. */
+ mtocrf 1,8
+ li 6,16 # 16() index
+ li 7,32 # 32() index
+ li 8,48 # 48() index
+
+L(aligned_32byte):
+ /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+ bns cr7,L(aligned_64byte)
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ addi 11,11,32
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ addi 10,10,32
+
+L(aligned_64byte):
+ /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+ bne cr7,L(aligned_128setup)
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ addi 11,11,64
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ addi 10,10,64
+
+L(aligned_128setup):
+ /* Set up for the 128-byte at a time copy loop. */
+ srwi 8,31,7
+ cmpwi 8,0 # Any 4x lumps left?
+ beq 3f # if not, move along.
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ mtctr 8 # otherwise, load the ctr and begin.
+ li 8,48 # 48() index
+ b L(aligned_128loop)
+
+L(aligned_128head):
+ /* for the 2nd + iteration of this loop. */
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+L(aligned_128loop):
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ stxvd2x 6,0,10
+ addi 11,11,64
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ addi 10,10,64
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ addi 11,11,64
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ addi 10,10,64
+ bdnz L(aligned_128head)
- stfd 6,0(10)
- stfd 7,8(10)
- stfd 8,16(10)
- stfd 0,24(10)
- addi 10,10,32
- bdnz 4b
3:
-
/* Check for tail bytes. */
-
clrrwi 0,31,3
mtcrf 0x01,31
beq cr6,0f
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 2e5beed15e..8aaef97440 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -1,5 +1,5 @@
/* Optimized memcpy implementation for PowerPC64/POWER7.
- Copyright (C) 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -115,23 +115,81 @@ L(copy_GE_32_aligned_cont):
std 6,0(3)
addi 10,3,8
- /* Main aligned copy loop. Copies 32-bytes at a time. */
+L(aligned_copy):
+ /* Main aligned copy loop. Copies up to 128-bytes at a time. */
.align 4
4:
- ld 6,0(11)
- ld 7,8(11)
- ld 8,16(11)
- ld 0,24(11)
- addi 11,11,32
+ /* check for any 32-byte or 64-byte lumps that are outside of a
+ nice 128-byte range. R8 contains the number of 32-byte
+ lumps, so drop this into the CR, and use the SO/EQ bits to help
+ handle the 32- or 64- byte lumps. Then handle the rest with an
+ unrolled 128-bytes-at-a-time copy loop. */
+ mtocrf 1,8
+ li 6,16 # 16() index
+ li 7,32 # 32() index
+ li 8,48 # 48() index
+
+L(aligned_32byte):
+ /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+ bns cr7,L(aligned_64byte)
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ addi 11,11,32
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ addi 10,10,32
+
+L(aligned_64byte):
+ /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+ bne cr7,L(aligned_128setup)
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ addi 11,11,64
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ addi 10,10,64
+
+L(aligned_128setup):
+ /* Set up for the 128-byte at a time copy loop. */
+ srdi 8,31,7
+ cmpdi 8,0 # Any 4x lumps left?
+ beq 3f # if not, move along.
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ mtctr 8 # otherwise, load the ctr and begin.
+ li 8,48 # 48() index
+ b L(aligned_128loop)
+
+L(aligned_128head):
+ /* for the 2nd + iteration of this loop. */
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+L(aligned_128loop):
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ stxvd2x 6,0,10
+ addi 11,11,64
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ lxvd2x 6,0,11
+ lxvd2x 7,11,6
+ addi 10,10,64
+ lxvd2x 8,11,7
+ lxvd2x 9,11,8
+ addi 11,11,64
+ stxvd2x 6,0,10
+ stxvd2x 7,10,6
+ stxvd2x 8,10,7
+ stxvd2x 9,10,8
+ addi 10,10,64
+ bdnz L(aligned_128head)
- std 6,0(10)
- std 7,8(10)
- std 8,16(10)
- std 0,24(10)
- addi 10,10,32
- bdnz 4b
3:
-
/* Check for tail bytes. */
rldicr 0,31,0,60
mtcrf 0x01,31