aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRyan Arnold <ryanarn@etna.rchland.ibm.com>2010-05-20 17:05:06 -0500
committerRyan Arnold <ryanarn@etna.rchland.ibm.com>2010-05-20 17:05:06 -0500
commit5fe0b279e4cb7b05c870a8903694a710b650acd7 (patch)
tree9aa1304d21116a3b513652513c3d8e0d505708e0
parent0a89b6a6fac08b42533075d90d8693ec825bdac1 (diff)
downloadglibc-5fe0b279e4cb7b05c870a8903694a710b650acd7.tar
glibc-5fe0b279e4cb7b05c870a8903694a710b650acd7.tar.gz
glibc-5fe0b279e4cb7b05c870a8903694a710b650acd7.tar.bz2
glibc-5fe0b279e4cb7b05c870a8903694a710b650acd7.zip
Revert "Power7 memset powerpc32 and powerpc64 .S optimizations."
This reverts commit 0a89b6a6fac08b42533075d90d8693ec825bdac1. A corrected version has been pushed upstream and that will be cherry-picked.
-rw-r--r--ChangeLog6
-rw-r--r--sysdeps/powerpc/powerpc32/power7/memset.S434
-rw-r--r--sysdeps/powerpc/powerpc64/power7/memset.S398
3 files changed, 0 insertions, 838 deletions
diff --git a/ChangeLog b/ChangeLog
index 4fb7a2979f..a16a1fba85 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,9 +1,3 @@
-2010-05-15 Luis Machado <luisgpm@br.ibm.com>
- * sysdeps/powerpc/powerpc64/power7/memset.S: New POWER7-optimized
- 64-bit memset.
- * sysdeps/powerpc/powerpc32/power7/memset.S: New POWER7-optimized
- 32-bit memset.
-
2010-05-01 Alan Modra <amodra@gmail.com>
* sysdeps/powerpc/powerpc32/power4/memcmp.S: Correct cfi for r24.
* sysdeps/powerpc/powerpc64/bsd-_setjmp.S: Move contents..
diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S
deleted file mode 100644
index 990faa1452..0000000000
--- a/sysdeps/powerpc/powerpc32/power7/memset.S
+++ /dev/null
@@ -1,434 +0,0 @@
-/* Optimized memset implementation for PowerPC32/POWER7.
- Copyright (C) 2010 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
- 02110-1301 USA. */
-
-#include <sysdep.h>
-#include <bp-sym.h>
-#include <bp-asm.h>
-
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
- Returns 's'. No instruction below writes r3; r10 and r12 walk the buffer. */
-
- .machine power7
-EALIGN (BP_SYM (memset), 5, 0)
- CALL_MCOUNT
-
- .align 4
-L(_memset):
- cmplwi cr7,5,31 /* cr7: n compared with 31 (medium-path test). */
- cmplwi cr6,5,8 /* cr6: n compared with 8 (small-path test). */
- mr 10,3 /* r10 = working DST pointer; r3 is kept as the return value. */
- mr 7,1 /* Save original r1 for later. */
- cfi_offset(31,-8) /* NOTE(review): r31 is not stored anywhere in this function - confirm this CFI is intended. */
-
- /* Replicate byte to word. */
- rlwimi 4,4,8,16,23 /* Low byte -> bits 16-23. */
- rlwimi 4,4,16,0,15 /* Low halfword -> bits 0-15. */
-
- ble cr6,L(small) /* If length <= 8, use short copy code. */
-
- neg 0,3 /* r0 = -DST; its low bits are the distance to the next boundary. */
- ble cr7,L(medium) /* If length < 32, use medium copy code. */
-
- /* Save our word twice to create a doubleword that we will later
- copy to a FPR. */
- stwu 1,-32(1) /* Open a 32-byte frame; r1 is restored from r7 before returning. */
- andi. 11,10,7 /* Check alignment of DST. */
- mr 12,5 /* r12 = original length; L(huge) uses it to recompute the tail. */
- stw 4,24(1)
- stw 4,28(1)
- beq L(big_aligned)
-
- clrlwi 0,0,29 /* r0 = bytes (1-7) needed to reach 8-byte alignment. */
- mtocrf 0x01,0 /* cr7 = low 4 bits of r0, tested bit by bit below. */
- subf 5,0,5 /* r5 = length left once DST is aligned. */
-
- /* Get DST aligned to 8 bytes. */
-1: bf 31,2f
-
- stb 4,0(10)
- addi 10,10,1
-2: bf 30,4f
-
- sth 4,0(10)
- addi 10,10,2
-4: bf 29,L(big_aligned)
-
- stw 4,0(10)
- addi 10,10,4
-
- .align 4
-L(big_aligned):
- cmplwi cr5,5,255 /* cr5[gt] set when more than 255 bytes remain. */
- li 0,32 /* r0 = 32 from here on, so r0 must not be used as a zero index. */
- cmplwi cr1,5,160 /* No branch reads this: cr1 is recomputed below and L(huge) does not test it. */
- dcbtst 0,10 /* Cache hint: DST is about to be stored to. */
- cmplwi cr6,4,0 /* cr6[eq] set when the fill word is zero. */
- srwi 9,5,3 /* Number of full doublewords remaining. */
- crand 27,26,21 /* CR27 = cr6[eq] AND cr5[gt]: zero fill and length > 255. */
- mtocrf 0x01,9 /* cr7 = low 4 bits of the doubleword count. */
- bt 27,L(huge)
-
- /* From this point on, we'll copy 32+ bytes and the value
- isn't 0 (so we can't use dcbz). */
-
- srwi 8,5,5 /* r8 = number of 32-byte blocks. */
- clrlwi 11,5,29 /* r11 = length & 7, the tail byte count. */
- cmplwi cr6,11,0 /* cr6[eq]: no tail bytes; read by L(tail_bytes). */
- cmplwi cr1,9,4 /* cr1[lt]: fewer than 4 doublewords, so no 32-byte block. */
- mtctr 8
-
- /* Copy 1~3 doublewords so the main loop starts
- at a multiple of 32 bytes. */
-
- bf 30,1f /* Skip unless (doubleword count & 2) is set. */
-
- stw 4,0(10)
- stw 4,4(10)
- stw 4,8(10)
- stw 4,12(10)
- addi 10,10,16
- bf 31,L(big_loop) /* No odd doubleword: go to the main loop. */
-
- stw 4,0(10)
- stw 4,4(10)
- addi 10,10,8
- mr 12,10 /* L(tail_bytes) stores through r12. */
- blt cr1,L(tail_bytes) /* Only 3 doublewords existed: CTR is 0, skip the loop. */
-
- b L(big_loop)
-
- .align 4
-1: /* Copy 1 doubleword. */
- bf 31,L(big_loop)
-
- stw 4,0(10)
- stw 4,4(10)
- addi 10,10,8
-
- /* First use a 32-bytes loop with stw's to try and avoid the LHS due
- to the lfd we will do next. Also, ping-pong through r10 and r12
- to avoid AGEN delays. */
- .align 4
-L(big_loop):
- addi 12,10,32 /* r12 = start of the next 32-byte block. */
- stw 4,0(10)
- stw 4,4(10)
- stw 4,8(10)
- stw 4,12(10)
- stw 4,16(10)
- stw 4,20(10)
- stw 4,24(10)
- stw 4,28(10)
- bdz L(tail_bytes) /* Last block done; r12 already points past it. */
-
- addi 10,10,64 /* r10 = block after the one r12 is about to fill. */
- stw 4,0(12)
- stw 4,4(12)
- stw 4,8(12)
- stw 4,12(12)
- stw 4,16(12)
- stw 4,20(12)
- stw 4,24(12)
- stw 4,28(12)
- bdnz L(big_loop_fast_setup) /* Blocks remain: switch to the VSX loop. */
-
- mr 12,10 /* L(tail_bytes) stores through r12. */
- b L(tail_bytes)
-
- /* Now that we're probably past the LHS window, use the VSX to
- speed up the loop. */
-L(big_loop_fast_setup):
- li 11,24 /* Offset of the doubleword saved at 24(r1). */
- li 6,16 /* Index of the second 16-byte store in each block. */
- lxvdsx 4,1,11 /* Load that doubleword and splat it across VSR4. */
-
- .align 4
-L(big_loop_fast):
- addi 12,10,32 /* r12 = start of the next 32-byte block. */
- stxvd2x 4,0,10 /* EA = r10. RA field 0 reads as zero; r0 in the RB field would add its value (32). */
- stxvd2x 4,10,6 /* EA = r10 + 16. */
- bdz L(tail_bytes)
-
- addi 10,10,64
- stxvd2x 4,0,12 /* EA = r12, for the same reason as above. */
- stxvd2x 4,12,6 /* EA = r12 + 16. */
- bdnz L(big_loop_fast)
-
- mr 12,10 /* L(tail_bytes) stores through r12. */
-
- .align 4
-L(tail_bytes):
-
- /* Check for tail bytes. */
- mr 1,7 /* Restore r1. */
- beqlr cr6 /* Return when (length & 7) == 0. */
-
- clrlwi 0,5,29 /* r0 = length & 7. */
- mtocrf 0x01,0 /* cr7 = tail byte count for the bf tests. */
-
- /* At this point we have a tail of 0-7 bytes and we know that the
- destination is doubleword-aligned. */
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- stw 4,0(12)
- addi 12,12,4
-2: /* Copy 2 bytes. */
- bf 30,1f
-
- sth 4,0(12)
- addi 12,12,2
-1: /* Copy 1 byte. */
- bflr 31
-
- stb 4,0(12)
- blr
-
-
- /* Special case when value is 0 and we have a long length to deal
- with. Use dcbz to zero out 128-bytes at a time. Before using
- dcbz though, we need to get the destination 128-bytes aligned. */
- .align 4
-L(huge):
- lfd 4,24(1) /* f4 = the doubleword saved on the stack (all zero on this path). */
- andi. 11,10,127 /* Check 128-byte alignment of DST. */
- neg 0,10
- beq L(huge_aligned)
-
- clrlwi 0,0,25 /* r0 = bytes needed to reach 128-byte alignment. */
- subf 5,0,5 /* r5 = length left after that alignment. */
- srwi 0,0,3 /* Convert to doublewords (DST is already 8-byte aligned). */
- mtocrf 0x01,0 /* cr7 = doubleword count (0-15) for the bf tests. */
-
- /* Get DST aligned to 128 bytes. */
-8: bf 28,4f
-
- stfd 4,0(10)
- stfd 4,8(10)
- stfd 4,16(10)
- stfd 4,24(10)
- stfd 4,32(10)
- stfd 4,40(10)
- stfd 4,48(10)
- stfd 4,56(10)
- addi 10,10,64
- .align 4
-4: bf 29,2f
-
- stfd 4,0(10)
- stfd 4,8(10)
- stfd 4,16(10)
- stfd 4,24(10)
- addi 10,10,32
- .align 4
-2: bf 30,1f
-
- stfd 4,0(10)
- stfd 4,8(10)
- addi 10,10,16
- .align 4
-1: bf 31,L(huge_aligned)
-
- stfd 4,0(10)
- addi 10,10,8
-
-L(huge_aligned):
- srwi 8,5,7 /* r8 = number of 128-byte blocks. */
- clrlwi 11,5,25 /* r11 = length & 127. */
- cmplwi cr6,11,0 /* cr6[eq]: nothing left after the dcbz loop. */
- mtctr 8
-
- /* Copies 128-bytes at a time. */
- .align 4
-L(huge_loop):
- dcbz 0,10
- addi 10,10,128
- bdnz L(huge_loop)
-
- /* We have a tail of 0~127 bytes to handle. */
- mr 1,7 /* Restore r1. */
- beqlr cr6
-
- subf 9,3,10 /* r9 = bytes written so far (current DST - s). */
- subf 5,9,12 /* r5 = original length - bytes written. */
- srwi 8,5,3 /* r8 = full doublewords in the tail. */
- cmplwi cr6,8,0
- mtocrf 0x01,8 /* cr7 = doubleword count for the bf tests. */
-
- /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
- speed. We'll handle the resulting tail bytes later. */
- beq cr6,L(tail)
-
-8: bf 28,4f
-
- stfd 4,0(10)
- stfd 4,8(10)
- stfd 4,16(10)
- stfd 4,24(10)
- stfd 4,32(10)
- stfd 4,40(10)
- stfd 4,48(10)
- stfd 4,56(10)
- addi 10,10,64
- .align 4
-4: bf 29,2f
-
- stfd 4,0(10)
- stfd 4,8(10)
- stfd 4,16(10)
- stfd 4,24(10)
- addi 10,10,32
- .align 4
-2: bf 30,1f
-
- stfd 4,0(10)
- stfd 4,8(10)
- addi 10,10,16
- .align 4
-1: bf 31,L(tail)
-
- stfd 4,0(10)
- addi 10,10,8
-
- /* Handle the rest of the tail bytes here. */
-L(tail):
- mtocrf 0x01,5 /* cr7 = low 4 bits of the remaining length; only bits 29-31 are tested. */
-
- .align 4
-4: bf 29,2f
-
- stw 4,0(10)
- addi 10,10,4
- .align 4
-2: bf 30,1f
-
- sth 4,0(10)
- addi 10,10,2
- .align 4
-1: bflr 31
-
- stb 4,0(10)
- blr
-
-
- /* Expanded tree to copy tail bytes without increments. */
- .align 4
-L(copy_tail):
- bf 29,L(FXX) /* Entered from L(small) with cr7 = low 4 bits of n. */
-
- stw 4,0(10)
- bf 30,L(TFX)
-
- sth 4,4(10)
- bflr 31
-
- stb 4,6(10)
- blr
-
- .align 4
-L(FXX): bf 30,L(FFX)
-
- sth 4,0(10)
- bflr 31
-
- stb 4,2(10)
- blr
-
- .align 4
-L(TFX): bflr 31
-
- stb 4,4(10)
- blr
-
- .align 4
-L(FFX): bflr 31
-
- stb 4,0(10)
- blr
-
- /* Handle copies of 9~31 bytes. */
- .align 4
-L(medium):
- /* At least 9 bytes to go. */
- andi. 11,10,3 /* Check 4-byte alignment of DST. */
- clrlwi 0,0,30 /* r0 = bytes (0-3) needed to reach 4-byte alignment. */
- beq L(medium_aligned)
-
- /* Force 4-bytes alignment for DST. */
- mtocrf 0x01,0
- subf 5,0,5
-1: /* Copy 1 byte. */
- bf 31,2f
-
- stb 4,0(10)
- addi 10,10,1
-2: /* Copy 2 bytes. */
- bf 30,L(medium_aligned)
-
- sth 4,0(10)
- addi 10,10,2
-
- .align 4
-L(medium_aligned):
- /* At least 6 bytes to go, and DST is word-aligned. */
- cmplwi cr1,5,16
- mtocrf 0x01,5 /* cr7 = low 4 bits of the remaining length. */
- blt cr1,8f
-
- /* Copy 16 bytes. */
- stw 4,0(10)
- stw 4,4(10)
- stw 4,8(10)
- stw 4,12(10)
- addi 10,10,16
-8: /* Copy 8 bytes. */
- bf 28,4f
-
- stw 4,0(10)
- stw 4,4(10)
- addi 10,10,8
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- stw 4,0(10)
- addi 10,10,4
-2: /* Copy 2-3 bytes. */
- bf 30,1f
-
- sth 4,0(10)
- addi 10,10,2
-1: /* Copy 1 byte. */
- bflr 31
-
- stb 4,0(10)
- blr
-
- /* Handles copies of 0~8 bytes. */
- .align 4
-L(small):
- mtocrf 0x01,5 /* cr7 = low 4 bits of n for L(copy_tail). */
- bne cr6,L(copy_tail) /* n != 8: use the tail tree; otherwise store 8 bytes. */
-
- stw 4,0(10)
- stw 4,4(10)
- blr
-
-END (BP_SYM (memset))
-libc_hidden_builtin_def (memset)
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
deleted file mode 100644
index 0f726d4f37..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/memset.S
+++ /dev/null
@@ -1,398 +0,0 @@
-/* Optimized memset implementation for PowerPC64/POWER7.
- Copyright (C) 2010 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-#include <sysdep.h>
-#include <bp-sym.h>
-#include <bp-asm.h>
-
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
- Returns 's'. No instruction below writes r3; r10 and r12 walk the buffer. */
-
- .machine power7
-EALIGN (BP_SYM (memset), 5, 0)
- CALL_MCOUNT 3
-
-L(_memset):
- cmpldi cr7,5,31 /* cr7: n compared with 31 (medium-path test). */
- cmpldi cr6,5,8 /* cr6: n compared with 8 (small-path test). */
- mr 10,3 /* r10 = working DST pointer; r3 is kept as the return value. */
-
- /* Replicate byte to word. */
- rlwimi 4,4,8,16,23
- rlwimi 4,4,16,0,15
- ble cr6, L(small) /* If length <= 8, use short copy code. */
-
- neg 0,3 /* r0 = -DST; its low bits are the distance to the next boundary. */
- ble cr7, L(medium) /* If length < 32, use medium copy code. */
-
- andi. 11,10,7 /* Check alignment of DST. */
- insrdi 4,4,32,0 /* Replicate word to double word. */
-
- mr 12,5 /* r12 = original length; L(huge) uses it to recompute the tail. */
- beq L(big_aligned)
-
- clrldi 0,0,61 /* r0 = bytes (1-7) needed to reach 8-byte alignment. */
- mtocrf 0x01,0 /* cr7 = low 4 bits of r0, tested bit by bit below. */
- subf 5,0,5 /* r5 = length left once DST is aligned. */
-
- /* Get DST aligned to 8 bytes. */
-1: bf 31,2f
-
- stb 4,0(10)
- addi 10,10,1
-2: bf 30,4f
-
- sth 4,0(10)
- addi 10,10,2
-4: bf 29,L(big_aligned)
-
- stw 4,0(10)
- addi 10,10,4
-
- .align 4
-L(big_aligned):
-
- cmpldi cr5,5,255 /* cr5[gt] set when more than 255 bytes remain. */
- li 0,32
- dcbtst 0,10 /* Cache hint: DST is about to be stored to. */
- cmpldi cr6,4,0 /* cr6[eq] set when the fill doubleword is zero. */
- srdi 9,5,3 /* Number of full doublewords remaining. */
- crand 27,26,21 /* CR27 = cr6[eq] AND cr5[gt]: zero fill and length > 255. */
- mtocrf 0x01,9 /* cr7 = low 4 bits of the doubleword count. */
- bt 27,L(huge)
-
- /* From this point on, we'll copy 32+ bytes and the value
- isn't 0 (so we can't use dcbz). */
-
- srdi 8,5,5 /* r8 = number of 32-byte blocks. */
- clrldi 11,5,61 /* r11 = length & 7, the tail byte count. */
- cmpldi cr6,11,0 /* cr6[eq]: no tail bytes; read by L(tail_bytes). */
- cmpldi cr1,9,4 /* cr1[lt]: fewer than 4 doublewords, so no 32-byte block. */
- mtctr 8
-
- /* Copy 1~3 doublewords so the main loop starts
- at a multiple of 32 bytes. */
-
- bf 30,1f /* Skip unless (doubleword count & 2) is set. */
-
- std 4,0(10)
- std 4,8(10)
- addi 10,10,16
- bf 31,L(big_loop) /* No odd doubleword: go to the main loop. */
-
- std 4,0(10)
- addi 10,10,8
- mr 12,10 /* L(tail_bytes) stores through r12. */
- blt cr1,L(tail_bytes) /* Only 3 doublewords existed: CTR is 0, skip the loop. */
- b L(big_loop)
-
- .align 4
-1: /* Copy 1 doubleword. */
- bf 31,L(big_loop)
-
- std 4,0(10)
- addi 10,10,8
-
- /* Main aligned copy loop. Copies 32-bytes at a time and
- ping-pong through r10 and r12 to avoid AGEN delays. */
- .align 4
-L(big_loop):
- addi 12,10,32 /* r12 = start of the next 32-byte block. */
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- bdz L(tail_bytes) /* Last block done; r12 already points past it. */
-
- addi 10,10,64 /* r10 = block after the one r12 is about to fill. */
- std 4,0(12)
- std 4,8(12)
- std 4,16(12)
- std 4,24(12)
- bdnz L(big_loop)
-
- mr 12,10 /* L(tail_bytes) stores through r12. */
- b L(tail_bytes) /* Target is the label that follows directly below. */
-
- .align 4
-L(tail_bytes):
-
- /* Check for tail bytes. */
- beqlr cr6 /* Return when (length & 7) == 0. */
-
- clrldi 0,5,61 /* r0 = length & 7. */
- mtocrf 0x01,0 /* cr7 = tail byte count for the bf tests. */
-
- /* At this point we have a tail of 0-7 bytes and we know that the
- destination is doubleword-aligned. */
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- stw 4,0(12)
- addi 12,12,4
-2: /* Copy 2 bytes. */
- bf 30,1f
-
- sth 4,0(12)
- addi 12,12,2
-1: /* Copy 1 byte. */
- bflr 31
-
- stb 4,0(12)
- blr
-
- /* Special case when value is 0 and we have a long length to deal
- with. Use dcbz to zero out 128-bytes at a time. Before using
- dcbz though, we need to get the destination 128-bytes aligned. */
- .align 4
-L(huge):
- andi. 11,10,127 /* Check 128-byte alignment of DST. */
- neg 0,10
- beq L(huge_aligned)
-
- clrldi 0,0,57 /* r0 = bytes needed to reach 128-byte alignment. */
- subf 5,0,5 /* r5 = length left after that alignment. */
- srdi 0,0,3 /* Convert to doublewords (DST is already 8-byte aligned). */
- mtocrf 0x01,0 /* cr7 = doubleword count (0-15) for the bf tests. */
-
- /* Get DST aligned to 128 bytes. */
-8: bf 28,4f
-
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- std 4,32(10)
- std 4,40(10)
- std 4,48(10)
- std 4,56(10)
- addi 10,10,64
- .align 4
-4: bf 29,2f
-
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- addi 10,10,32
- .align 4
-2: bf 30,1f
-
- std 4,0(10)
- std 4,8(10)
- addi 10,10,16
- .align 4
-1: bf 31,L(huge_aligned)
-
- std 4,0(10)
- addi 10,10,8
-
-
-L(huge_aligned):
- srdi 8,5,7 /* r8 = number of 128-byte blocks. */
- clrldi 11,5,57 /* r11 = length & 127. */
- cmpldi cr6,11,0 /* cr6[eq]: nothing left after the dcbz loop. */
- mtctr 8
-
- .align 4
-L(huge_loop):
- dcbz 0,10
- addi 10,10,128
- bdnz L(huge_loop)
-
- /* Check how many bytes are still left. */
- beqlr cr6
-
- subf 9,3,10 /* r9 = bytes written so far (current DST - s). */
- subf 5,9,12 /* r5 = original length - bytes written. */
- srdi 8,5,3 /* r8 = full doublewords in the tail. */
- cmpldi cr6,8,0
- mtocrf 0x01,8 /* cr7 = doubleword count for the bf tests. */
-
- /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
- speed. We'll handle the resulting tail bytes later. */
- beq cr6,L(tail)
-
-8: bf 28,4f
-
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- std 4,32(10)
- std 4,40(10)
- std 4,48(10)
- std 4,56(10)
- addi 10,10,64
- .align 4
-4: bf 29,2f
-
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- addi 10,10,32
- .align 4
-2: bf 30,1f
-
- std 4,0(10)
- std 4,8(10)
- addi 10,10,16
- .align 4
-1: bf 31,L(tail)
-
- std 4,0(10)
- addi 10,10,8
-
- /* Handle the rest of the tail bytes here. */
-L(tail):
- mtocrf 0x01,5 /* cr7 = low 4 bits of the remaining length; only bits 29-31 are tested. */
-
- .align 4
-4: bf 29,2f
-
- stw 4,0(10)
- addi 10,10,4
- .align 4
-2: bf 30,1f
-
- sth 4,0(10)
- addi 10,10,2
- .align 4
-1: bflr 31
-
- stb 4,0(10)
- blr
-
- /* Expanded tree to copy tail bytes without increments. */
- .align 4
-L(copy_tail):
- bf 29,L(FXX) /* Entered from L(small) with cr7 = low 4 bits of n. */
-
- stw 4,0(10)
- bf 30,L(TFX)
-
- sth 4,4(10)
- bflr 31
-
- stb 4,6(10)
- blr
-
- .align 4
-L(FXX): bf 30,L(FFX)
-
- sth 4,0(10)
- bflr 31
-
- stb 4,2(10)
- blr
-
- .align 4
-L(TFX): bflr 31
-
- stb 4,4(10)
- blr
-
- .align 4
-L(FFX): bflr 31
-
- stb 4,0(10)
- blr
-
- /* Handle copies of 9~31 bytes. */
- .align 4
-L(medium):
- /* At least 9 bytes to go. */
- andi. 11,10,3 /* Check 4-byte alignment of DST. */
- clrldi 0,0,62 /* r0 = bytes (0-3) needed to reach 4-byte alignment. */
- beq L(medium_aligned)
-
- /* Force 4-bytes alignment for DST. */
- mtocrf 0x01,0
- subf 5,0,5
-1: /* Copy 1 byte. */
- bf 31,2f
-
- stb 4,0(10)
- addi 10,10,1
-2: /* Copy 2 bytes. */
- bf 30,L(medium_aligned)
-
- sth 4,0(10)
- addi 10,10,2
-
- .align 4
-L(medium_aligned):
- /* At least 6 bytes to go, and DST is word-aligned. */
- cmpldi cr1,5,16
- mtocrf 0x01,5 /* cr7 = low 4 bits of the remaining length. */
- blt cr1,8f
-
- /* Copy 16 bytes. */
- stw 4,0(10)
- stw 4,4(10)
- stw 4,8(10)
- stw 4,12(10)
- addi 10,10,16
-8: /* Copy 8 bytes. */
- bf 28,4f
-
- stw 4,0(10)
- stw 4,4(10)
- addi 10,10,8
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- stw 4,0(10)
- addi 10,10,4
-2: /* Copy 2-3 bytes. */
- bf 30,1f
-
- sth 4,0(10)
- addi 10,10,2
-1: /* Copy 1 byte. */
- bflr 31
-
- stb 4,0(10)
- blr
-
- /* Handles copies of 0~8 bytes. */
- .align 4
-L(small):
- mtocrf 0x01,5 /* cr7 = low 4 bits of n for L(copy_tail). */
- bne cr6,L(copy_tail) /* n != 8: use the tail tree; otherwise store 8 bytes. */
-
- stw 4,0(10)
- stw 4,4(10)
- blr
-
-END_GEN_TB (BP_SYM (memset),TB_TOCLESS)
-libc_hidden_builtin_def (memset)
-
-/* Copied from bzero.S to prevent the linker from inserting a stub
- between bzero and memset. Rearranges the registers, then branches into the memset body above. */
-ENTRY (BP_SYM (__bzero))
- CALL_MCOUNT 3
- mr r5,r4 /* r5 = r4: move the length into the register memset reads as n. */
- li r4,0 /* Fill value is zero. */
- b L(_memset) /* Plain branch, not a call: memset's blr returns to our caller; r3 is left as passed in. */
-END_GEN_TB (BP_SYM (__bzero),TB_TOCLESS)
-
-weak_alias (BP_SYM (__bzero), BP_SYM (bzero))