diff options
Diffstat (limited to 'sysdeps/powerpc/powerpc32/cell/memcpy.S')
-rw-r--r-- | sysdeps/powerpc/powerpc32/cell/memcpy.S | 242 |
1 files changed, 0 insertions, 242 deletions
diff --git a/sysdeps/powerpc/powerpc32/cell/memcpy.S b/sysdeps/powerpc/powerpc32/cell/memcpy.S deleted file mode 100644 index a7f761408a..0000000000 --- a/sysdeps/powerpc/powerpc32/cell/memcpy.S +++ /dev/null @@ -1,242 +0,0 @@ -/* Optimized memcpy implementation for CELL BE PowerPC. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#define PREFETCH_AHEAD 6 /* number of SRC cache lines to prefetch ahead */ -#define ZERO_AHEAD 4 /* number of DST cache lines to zero (dcbz) ahead */ - -/* memcpy routine optimized for CELL-BE-PPC v2.0 - * - * The CELL PPC core has 1 integer unit and 1 load/store unit - * CELL: - * 1st level data cache = 32K - * 2nd level data cache = 512K - * 3rd level data cache = 0K - * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks, - * latency to memory is >400 clocks - * To improve copy performance we need to prefetch source data - * far ahead to hide this latency - * For best performance instruction forms ending in "." like "andi." - * should be avoided as they are implemented in microcode on CELL. - * The below code is loop unrolled for the CELL cache line of 128 bytes - */ - -.align 7 - -EALIGN (memcpy, 5, 0) - CALL_MCOUNT - - dcbt 0,r4 /* Prefetch ONE SRC cacheline (r4 = SRC) */ - cmplwi cr1,r5,16 /* is size < 16 ? 
*/ - mr r6,r3 /* r6 = working copy of DST (r3) */ - blt+ cr1,.Lshortcopy - -.Lbigcopy: - neg r8,r3 /* low 4 bits (kept by the mask below) = # bytes to 16-byte dest bdry */ - clrlwi r8,r8,32-4 /* keep low 4 bits: bytes needed to reach 16-byte boundary */ - sub r7,r4,r3 /* r7 = SRC - DST, index for the ...x loads below */ - cmplwi cr0,r8,0 - beq+ .Ldst_aligned - -.Ldst_unaligned: - mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */ - subf r5,r8,r5 - - bf cr7*4+3,1f - lbzx r0,r7,r6 /* copy 1 byte */ - stb r0,0(r6) - addi r6,r6,1 -1: bf cr7*4+2,2f - lhzx r0,r7,r6 /* copy 2 byte */ - sth r0,0(r6) - addi r6,r6,2 -2: bf cr7*4+1,4f - lwzx r0,r7,r6 /* copy 4 byte */ - stw r0,0(r6) - addi r6,r6,4 -4: bf cr7*4+0,8f - lfdx fp9,r7,r6 /* copy 8 byte */ - stfd fp9,0(r6) - addi r6,r6,8 -8: - add r4,r7,r6 /* recompute SRC pointer from the advanced DST pointer */ - -.Ldst_aligned: - - cmpwi cr5,r5,128-1 - - neg r7,r6 - addi r6,r6,-8 /* prepare for stfdu */ - addi r4,r4,-8 /* prepare for lfdu */ - - clrlwi r7,r7,32-7 /* align to cacheline boundary */ - ble+ cr5,.Llessthancacheline - - cmplwi cr6,r7,0 - subf r5,r7,r5 - srwi r7,r7,4 /* bytes to cacheline boundary / 16 = number of 16-byte steps */ - srwi r10,r5,7 /* number of cache lines to copy */ - - cmplwi r10,0 - li r11,0 /* number of cachelines to copy with prefetch */ - beq .Lnocacheprefetch - - cmplwi r10,PREFETCH_AHEAD - li r12,128+8 /* prefetch distance */ - ble .Llessthanmaxprefetch - - subi r11,r10,PREFETCH_AHEAD - li r10,PREFETCH_AHEAD - -.Llessthanmaxprefetch: - mtctr r10 - -.LprefetchSRC: - dcbt r12,r4 - addi r12,r12,128 - bdnz .LprefetchSRC - -.Lnocacheprefetch: - mtctr r7 - cmplwi cr1,r5,128 - clrlwi r5,r5,32-7 - beq cr6,.Lcachelinealigned - -.Laligntocacheline: - lfd fp9,0x08(r4) - lfdu fp10,0x10(r4) - stfd fp9,0x08(r6) - stfdu fp10,0x10(r6) - bdnz .Laligntocacheline - - -.Lcachelinealigned: /* copy whole cache lines */ - - blt- cr1,.Llessthancacheline /* size <128 */ - -.Louterloop: - cmpwi r11,0 - mtctr r11 - beq- .Lendloop - - li r11,128*ZERO_AHEAD +8 /* DCBZ dist */ - -.align 4 - /* Copy whole cachelines, optimized by prefetching SRC cacheline */ -.Lloop: /* Copy aligned body */ - dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */ - lfd fp9, 0x08(r4) - 
dcbz r11,r6 - lfd fp10, 0x10(r4) /* 4 register stride copy is optimal */ - lfd fp11, 0x18(r4) /* to hide 1st level cache latency. */ - lfd fp12, 0x20(r4) - stfd fp9, 0x08(r6) - stfd fp10, 0x10(r6) - stfd fp11, 0x18(r6) - stfd fp12, 0x20(r6) - lfd fp9, 0x28(r4) - lfd fp10, 0x30(r4) - lfd fp11, 0x38(r4) - lfd fp12, 0x40(r4) - stfd fp9, 0x28(r6) - stfd fp10, 0x30(r6) - stfd fp11, 0x38(r6) - stfd fp12, 0x40(r6) - lfd fp9, 0x48(r4) - lfd fp10, 0x50(r4) - lfd fp11, 0x58(r4) - lfd fp12, 0x60(r4) - stfd fp9, 0x48(r6) - stfd fp10, 0x50(r6) - stfd fp11, 0x58(r6) - stfd fp12, 0x60(r6) - lfd fp9, 0x68(r4) - lfd fp10, 0x70(r4) - lfd fp11, 0x78(r4) - lfdu fp12, 0x80(r4) - stfd fp9, 0x68(r6) - stfd fp10, 0x70(r6) - stfd fp11, 0x78(r6) - stfdu fp12, 0x80(r6) - - bdnz .Lloop - -.Lendloop: - cmpwi r10,0 - slwi r10,r10,2 /* adjust from 128 to 32 byte stride */ - beq- .Lendloop2 - mtctr r10 - -.Lloop2: /* Copy aligned body */ - lfd fp9, 0x08(r4) - lfd fp10, 0x10(r4) - lfd fp11, 0x18(r4) - lfdu fp12, 0x20(r4) - stfd fp9, 0x08(r6) - stfd fp10, 0x10(r6) - stfd fp11, 0x18(r6) - stfdu fp12, 0x20(r6) - - bdnz .Lloop2 -.Lendloop2: - -.Llessthancacheline: /* less than a cache line left to do ? */ - cmplwi cr0,r5,16 - srwi r7,r5,4 /* divide size by 16 */ - blt- .Ldo_lt16 - mtctr r7 - -.Lcopy_remaining: - lfd fp9,0x08(r4) - lfdu fp10,0x10(r4) - stfd fp9,0x08(r6) - stfdu fp10,0x10(r6) - bdnz .Lcopy_remaining - -.Ldo_lt16: /* less than 16 ? 
*/ - cmplwi cr0,r5,0 /* copy remaining bytes (0-15) */ - beqlr+ /* no rest to copy */ - addi r4,r4,8 /* undo the -8 bias applied for lfdu */ - addi r6,r6,8 /* undo the -8 bias applied for stfdu */ - -.Lshortcopy: /* SIMPLE COPY to handle size <= 15 bytes */ - mtcrf 0x01,r5 - sub r7,r4,r6 /* r7 = SRC - DST, index for the ...x loads below */ - bf- cr7*4+0,8f - lfdx fp9,r7,r6 /* copy 8 byte */ - stfd fp9,0(r6) - addi r6,r6,8 -8: - bf cr7*4+1,4f - lwzx r0,r7,r6 /* copy 4 byte */ - stw r0,0(r6) - addi r6,r6,4 -4: - bf cr7*4+2,2f - lhzx r0,r7,r6 /* copy 2 byte */ - sth r0,0(r6) - addi r6,r6,2 -2: - bf cr7*4+3,1f - lbzx r0,r7,r6 /* copy 1 byte */ - stb r0,0(r6) -1: blr - -END (memcpy) -libc_hidden_builtin_def (memcpy) |