diff options
-rw-r--r-- | ChangeLog | 7 | ||||
-rw-r--r-- | benchtests/bench-strcpy.c | 16 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power7/strcpy.S | 327 |
3 files changed, 268 insertions, 82 deletions
@@ -1,3 +1,10 @@ +2014-12-31 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> + Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/power7/strcpy.S (strcpy): Optimize unaligned + path. + * benchtests/bench-strcpy.c (test_main): Add more unaligned inputs. + 2014-12-31 Joseph Myers <joseph@codesourcery.com> * sysdeps/powerpc/bits/fenvinline.h (fegetround): Rename macro to diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c index c3ab4cfcf7..e9445f290f 100644 --- a/benchtests/bench-strcpy.c +++ b/benchtests/bench-strcpy.c @@ -171,6 +171,22 @@ test_main (void) do_test (i, i, 8 << i, BIG_CHAR); } + for (i = 16; i <= 512; i+=4) + { + do_test (0, 4, i, SMALL_CHAR); + do_test (4, 0, i, BIG_CHAR); + do_test (4, 4, i, SMALL_CHAR); + do_test (2, 2, i, BIG_CHAR); + do_test (2, 6, i, SMALL_CHAR); + do_test (6, 2, i, BIG_CHAR); + do_test (1, 7, i, SMALL_CHAR); + do_test (7, 1, i, BIG_CHAR); + do_test (3, 4, i, SMALL_CHAR); + do_test (4, 3, i, BIG_CHAR); + do_test (5, 7, i, SMALL_CHAR); + do_test (7, 5, i, SMALL_CHAR); + } + return ret; } diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S index ce71982eaf..115f98a304 100644 --- a/sysdeps/powerpc/powerpc64/power7/strcpy.S +++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S @@ -31,8 +31,6 @@ if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0)) goto aligned_doubleword_copy; - if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0)) - goto aligned_word_copy; if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL)) goto same_alignment; goto unaligned; @@ -70,9 +68,18 @@ EALIGN (FUNC_NAME, 4, 0) #endif or rTMP, rSRC, rRTN clrldi. 
rTMP, rTMP, 61 - bne L(check_word_alignment) + bne L(check_alignment) b L(aligned_doubleword_copy) + .align 4 +L(check_alignment): + rldicl rRTNAL, rRTN, 0, 61 + rldicl rSRCAL, rSRC, 0, 61 + cmpld cr7, rSRCAL, rRTNAL + beq cr7, L(same_alignment) + b L(unaligned) + + .align 4 L(same_alignment): /* Src and dst with same alignment: align both to doubleword. */ mr rALCNT, rRTN @@ -180,93 +187,249 @@ L(g1): #endif blr -L(check_word_alignment): - clrldi. rTMP, rTMP, 62 - beq L(aligned_word_copy) - rldicl rRTNAL, rRTN, 0, 61 - rldicl rSRCAL, rSRC, 0, 61 - cmpld cr7, rSRCAL, rRTNAL - beq cr7, L(same_alignment) - b L(unaligned) - -/* For word aligned memory, operate using word load and stores. */ .align 4 -L(aligned_word_copy): - li rMASK, 0 - addi rRTN, rRTN, -4 - lwz rWORD, 0(rSRC) - b L(g5) +L(unaligned): + cmpdi rSRCAL, 0 /* Check src alignment */ + beq L(srcaligndstunalign) + /* src is unaligned */ + rlwinm r10, rSRC, 3,26,28 /* Calculate padding. */ + clrrdi rSRC, rSRC, 3 /* Align the addr to dw boundary */ + ld rWORD, 0(rSRC) /* Load doubleword from memory. */ + li rTMP, 0 + /* Discard bits not part of the string */ +#ifdef __LITTLE_ENDIAN__ + srd rALT, rWORD, r10 +#else + sld rALT, rWORD, r10 +#endif + cmpb rTMP, rALT, rTMP /* Compare each byte against null */ + /* Discard bits not part of the string */ +#ifdef __LITTLE_ENDIAN__ + sld rTMP, rTMP, r10 +#else + srd rTMP, rTMP, r10 +#endif + cmpdi rTMP, 0 + bne L(bytebybyte) /* if it has null, copy byte by byte */ + subfic r8, r9, 8 + rlwinm r5, rRTN, 3,26,28 /* Calculate padding in bits. */ + rldicl r9, rRTN, 0, 61 /* Calculate padding in bytes. */ + addi rRTN, rRTN, -1 - .align 4 -L(g3): lwzu rALT, 4(rSRC) - stwu rWORD, 4(rRTN) - cmpb rTMP, rALT, rMASK - cmpwi rTMP, 0 - bne L(g4) - lwzu rWORD, 4(rSRC) - stwu rALT, 4(rRTN) -L(g5): cmpb rTMP, rWORD, rMASK - cmpwi rTMP, 0 /* If rTMP is 0, no null in word. */ - beq L(g3) - - mr rALT, rWORD -/* We've hit the end of the string. Do the rest byte-by-byte. 
*/ -L(g4): + cmpdi r5, 0 /* check dest alignment */ + beq L(srcunaligndstalign) + + /* both src and dst unaligned */ #ifdef __LITTLE_ENDIAN__ - rlwinm. rTMP, rALT, 0, 24, 31 - stbu rALT, 4(rRTN) - beqlr- - rlwinm. rTMP, rALT, 24, 24, 31 - stbu rTMP, 1(rRTN) - beqlr- - rlwinm. rTMP, rALT, 16, 24, 31 - stbu rTMP, 1(rRTN) - beqlr- - rlwinm rTMP, rALT, 8, 24, 31 - stbu rTMP, 1(rRTN) + sld rWORD, rALT, r10 + mr r11, r10 + addi r11, r11, -8 /* Adjust byte pointer on loaded dw */ #else - rlwinm. rTMP, rALT, 8, 24, 31 - stbu rTMP, 4(rRTN) - beqlr - rlwinm. rTMP, rALT, 16, 24, 31 - stbu rTMP, 1(rRTN) - beqlr - rlwinm. rTMP, rALT, 24, 24, 31 - stbu rTMP, 1(rRTN) - beqlr - stbu rALT, 1(rRTN) + srd rWORD, rALT, r10 + subfic r11, r10, 64 #endif - blr + /* dst alignment is greater than src alignment? */ + cmpd cr7, r5, r10 + blt cr7, L(dst_align_small) + /* src alignment is less than dst */ -/* Oh well. In this case, we just do a byte-by-byte copy. */ - .align 4 -L(unaligned): - lbz rWORD, 0(rSRC) - addi rRTN, rRTN, -1 - cmpdi rWORD, 0 - beq L(u2) - - .align 5 -L(u0): lbzu rALT, 1(rSRC) - stbu rWORD, 1(rRTN) - cmpdi rALT, 0 - beq L(u1) - lbzu rWORD, 1(rSRC) + /* Calculate the dst alignment difference */ + subfic rALT, r9, 8 + mtctr rALT + + /* Write till dst is aligned */ + cmpdi rTMP, rALT, 4 + blt L(storebyte1) /* less than 4, store byte by byte */ + beq L(equal1) /* if it's 4, store word */ + addi rTMP, rALT, -4 /* greater than 4, so stb and stw */ + mtctr rTMP +L(storebyte1): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ +#else + addi r11, r11, -8 +#endif + srd rALT, rWORD, r11 + stbu rALT, 1(rRTN) + bdnz L(storebyte1) + + subfic rALT, r9, 8 /* Check the remaining bytes */ + cmpdi rTMP, rALT, 4 + blt L(proceed) + + .align 4 +L(equal1): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ + srd rALT, rWORD, r11 +#else + subfic r11, r11, 64 + sld rALT, rWORD, r11 + srdi rALT, rALT, 32 +#endif + stw rALT, 
1(rRTN) + addi rRTN, rRTN, 4 + +L(proceed): + mr rALT, rWORD + /* calculate the Left over bytes to be written */ + subfic r11, r10, 64 + subfic r5, r5, 64 + subf r5, r5, r11 /* remaining bytes on second dw */ + subfic r10, r5, 64 /* remaining bytes on first dw */ + subfic r9, r9, 8 + subf r8, r9, r8 /* recalculate padding */ +L(srcunaligndstalign): + addi rRTN, rRTN, 1 + subfic r5, r10, 64 /* remaining bytes on second dw */ + addi rSRC, rSRC, 8 + li rTMP,0 + b L(storedouble) + + .align 4 +L(dst_align_small): + mtctr r8 + /* Write till src is aligned */ +L(storebyte2): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on dw */ +#else + addi r11, r11, -8 +#endif + srd rALT, rWORD, r11 stbu rALT, 1(rRTN) - cmpdi rWORD, 0 - beq L(u2) - lbzu rALT, 1(rSRC) - stbu rWORD, 1(rRTN) - cmpdi rALT, 0 - beq L(u1) - lbzu rWORD, 1(rSRC) + bdnz L(storebyte2) + + addi rSRC, rSRC, 8 /* Increment src pointer */ + addi rRTN, rRTN, 1 /* Increment dst pointer */ + rldicl r8, rRTN, 0, 61 /* Recalculate padding */ + + /* src is aligned */ +L(srcaligndstunalign): + ld rWORD, 0(rSRC) + mr rALT, rWORD + li rTMP, 0 /* Check null */ + cmpb rTMP, rWORD, rTMP + cmpdi rTMP, 0 + bne L(bytebybyte) /* Do byte by byte if there is NULL */ + rlwinm r5, rRTN, 3,26,28 /* Calculate padding */ + addi rRTN, rRTN, -1 + subfic r10, r8, 8 + /* write byte by byte till aligned */ +#ifdef __LITTLE_ENDIAN__ + li r11, -8 +#else + li r11, 64 +#endif + mtctr r10 + cmpdi rTMP, r10, 4 + blt L(storebyte) + beq L(equal) + addi rTMP, r10, -4 + mtctr rTMP +L(storebyte): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on dw */ +#else + addi r11, r11, -8 +#endif + srd rALT, rWORD, r11 stbu rALT, 1(rRTN) - cmpdi rWORD, 0 - bne L(u0) -L(u2): stbu rWORD, 1(rRTN) - blr -L(u1): stbu rALT, 1(rRTN) - blr + bdnz L(storebyte) + + cmpdi rTMP, r10, 4 + blt L(align) + + .align 4 +L(equal): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 + srd rALT, rWORD, r11 +#else + subfic r11, r11, 64 + sld rALT, 
rWORD, r11 + srdi rALT, rALT, 32 +#endif + stw rALT, 1(rRTN) + addi rRTN, rRTN, 4 +L(align): + addi rRTN, rRTN, 1 + addi rSRC, rSRC, 8 /* Increment src pointer */ + subfic r10, r5, 64 + li rTMP, 0 + /* dst addr aligned to 8 */ +L(storedouble): + ld rALT, 0(rSRC) /* load next dw */ + cmpb rTMP, rALT, rTMP + cmpdi rTMP, 0 /* check for null on each new dw */ + bne L(null) +#ifdef __LITTLE_ENDIAN__ + srd r9, rWORD, r10 /* bytes from first dw */ + sld r11, rALT, r5 /* bytes from second dw */ +#else + sld r9, rWORD, r10 + srd r11, rALT, r5 +#endif + or r11, r9, r11 /* make as a single dw */ + std r11, 0(rRTN) /* store as std on aligned addr */ + mr rWORD, rALT /* still few bytes left to be written */ + addi rRTN, rRTN, 8 /* increment dst addr */ + addi rSRC, rSRC, 8 /* increment src addr */ + b L(storedouble) /* Loop till NULL */ + + .align 4 + +/* We've hit the end of the string. Do the rest byte-by-byte. */ +L(null): + addi rRTN, rRTN, -1 + mr r10, r5 + mtctr r8 +#ifdef __LITTLE_ENDIAN__ + subfic r10, r10, 64 + addi r10, r10, -8 +#endif + cmpdi rTMP, r8, 4 + blt L(loop) + + /* we can still use stw if leftover >= 4*/ +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, 8 + srd r11, rWORD, r10 +#else + subfic r10, r10, 64 + sld r11, rWORD, r10 + srdi r11, r11, 32 +#endif + stw r11, 1(rRTN) + addi rRTN, rRTN, 4 + + beq L(bytebybyte1) + addi r10, r10, 32 +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, -8 +#else + subfic r10, r10, 64 +#endif + addi rTMP, r8, -4 + mtctr rTMP + /* remaining byte by byte part of first dw */ +L(loop): +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, 8 +#else + addi r10, r10, -8 +#endif + srd rTMP, rWORD, r10 + stbu rTMP, 1(rRTN) + bdnz L(loop) + +L(bytebybyte1): + addi rRTN, rRTN, 1 + /* remaining byte by byte part of second dw */ +L(bytebybyte): + addi rRTN, rRTN, -8 + b L(g1) + END (FUNC_NAME) #ifndef USE_AS_STPCPY |