Diffstat (limited to 'sysdeps/arm/armv7/multiarch/memcpy_impl.S')
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memcpy_impl.S | 917
1 file changed, 917 insertions, 0 deletions
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
new file mode 100644
index 0000000000..1562416cf6
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
@@ -0,0 +1,917 @@
+/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
+   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.
+
+   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+   of VFP or NEON when built with the appropriate flags.
+
+   Assumptions:
+
+    ARMv6 (ARMv7-a if using Neon)
+    ARM state
+    Unaligned accesses
+
+ */
+
+/* Thumb cannot encode negative immediate offsets in memory operations.  */
+#ifndef NO_THUMB
+#define NO_THUMB
+#endif
+#include <sysdep.h>
+#include <arm-features.h>
+
+        .syntax unified
+        /* This implementation requires ARM state.  */
+        .arm
+
+#ifdef MEMCPY_NEON
+
+        .fpu    neon
+        .arch   armv7-a
+# define FRAME_SIZE     4
+# define USE_VFP
+# define USE_NEON
+
+#elif defined (MEMCPY_VFP)
+
+        .arch   armv6
+        .fpu    vfpv2
+# define FRAME_SIZE     32
+# define USE_VFP
+
+#else
+        .arch   armv6
+# define FRAME_SIZE     32
+
+#endif
+
+#define ALIGN(addr, align) addr:align
+
+#define INSN_SIZE       4
+
+/* Call parameters.  */
+#define dstin   r0
+#define src     r1
+#define count   r2
+
+/* Locals.  */
+#define tmp1    r3
+#define dst     ip
+#define tmp2    r8
+
+/* These two macros both work by repeated invocation of the macro
+   dispatch_step (not defined here).  That macro performs one "step",
+   doing one load instruction and one store instruction to copy one
+   "unit".  On entry, TMP1 contains the number of bytes to be copied,
+   a multiple of the unit size.  The macro clobbers TMP1 in the
+   process of doing a computed jump to the tail containing the
+   appropriate number of steps.
+
+   In dispatch_7_dword, dispatch_step is invoked seven times, with an
+   argument that is 7 for the first and 1 for the last.  Units are
+   double-words (8 bytes).  TMP1 is at most 56.
+
+   In dispatch_15_word, dispatch_step is invoked fifteen times,
+   with an argument that is 15 for the first and 1 for the last.
+   Units are words (4 bytes).  TMP1 is at most 60.  */
+
+#ifndef ARM_ALWAYS_BX
+# if ARM_BX_ALIGN_LOG2 != 2
+# error case not handled
+# endif
+        .macro dispatch_7_dword
+        rsb     tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
+        add     pc, pc, tmp1
+        dispatch_step 7
+        dispatch_step 6
+        dispatch_step 5
+        dispatch_step 4
+        dispatch_step 3
+        dispatch_step 2
+        dispatch_step 1
+        .purgem dispatch_step
+        .endm
+
+        .macro dispatch_15_word
+        rsb     tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
+        add     pc, pc, tmp1, lsl #1
+        dispatch_step 15
+        dispatch_step 14
+        dispatch_step 13
+        dispatch_step 12
+        dispatch_step 11
+        dispatch_step 10
+        dispatch_step 9
+        dispatch_step 8
+        dispatch_step 7
+        dispatch_step 6
+        dispatch_step 5
+        dispatch_step 4
+        dispatch_step 3
+        dispatch_step 2
+        dispatch_step 1
+        .purgem dispatch_step
+        .endm
+#else
+# if ARM_BX_ALIGN_LOG2 < 3
+# error case not handled
+# endif
+        .macro dispatch_helper steps, log2_bytes_per_step
+        /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
+           (STEPS << LOG2_BYTES_PER_STEP).
+           So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
+           Then it needs further adjustment to compensate for the
+           distance between the PC value taken below (0f + PC_OFS)
+           and the first step's instructions (1f).  */
+        rsb     tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
+                              + ((1f - PC_OFS - 0f) \
+                                 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
+        /* Shifting down LOG2_BYTES_PER_STEP gives us the number of
+           steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
+           the (byte) distance to add to the PC.  */
+0:      add     tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
+        bx      tmp1
+        .p2align ARM_BX_ALIGN_LOG2
+1:
+        .endm
+
+        .macro dispatch_7_dword
+        dispatch_helper 7, 3
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 7
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 6
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 5
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 4
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 3
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 2
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 1
+        .p2align ARM_BX_ALIGN_LOG2
+        .purgem dispatch_step
+        .endm
+
+        .macro dispatch_15_word
+        dispatch_helper 15, 2
+        dispatch_step 15
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 14
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 13
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 12
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 11
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 10
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 9
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 8
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 7
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 6
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 5
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 4
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 3
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 2
+        .p2align ARM_BX_ALIGN_LOG2
+        dispatch_step 1
+        .p2align ARM_BX_ALIGN_LOG2
+        .purgem dispatch_step
+        .endm
+
+#endif
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers.  */
+#define A_l     r2      /* Call-clobbered.  */
+#define A_h     r3      /* Call-clobbered.  */
+#define B_l     r4
+#define B_h     r5
+#define C_l     r6
+#define C_h     r7
+/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
+#define D_l     r10
+#define D_h     r11
+#endif
+
+/* Number of lines ahead to pre-fetch data.  If you change this the code
+   below will need adjustment to compensate.  */
+
+#define prefetch_lines  5
+
+#ifdef USE_VFP
+        .macro cpy_line_vfp vreg, base
+        sfi_breg dst, \
+        vstr    \vreg, [\B, #\base]
+        sfi_breg src, \
+        vldr    \vreg, [\B, #\base]
+        sfi_breg dst, \
+        vstr    d0, [\B, #\base + 8]
+        sfi_breg src, \
+        vldr    d0, [\B, #\base + 8]
+        sfi_breg dst, \
+        vstr    d1, [\B, #\base + 16]
+        sfi_breg src, \
+        vldr    d1, [\B, #\base + 16]
+        sfi_breg dst, \
+        vstr    d2, [\B, #\base + 24]
+        sfi_breg src, \
+        vldr    d2, [\B, #\base + 24]
+        sfi_breg dst, \
+        vstr    \vreg, [\B, #\base + 32]
+        sfi_breg src, \
+        vldr    \vreg, [\B, #\base + prefetch_lines * 64 - 32]
+        sfi_breg dst, \
+        vstr    d0, [\B, #\base + 40]
+        sfi_breg src, \
+        vldr    d0, [\B, #\base + 40]
+        sfi_breg dst, \
+        vstr    d1, [\B, #\base + 48]
+        sfi_breg src, \
+        vldr    d1, [\B, #\base + 48]
+        sfi_breg dst, \
+        vstr    d2, [\B, #\base + 56]
+        sfi_breg src, \
+        vldr    d2, [\B, #\base + 56]
+        .endm
+
+        .macro cpy_tail_vfp vreg, base
+        sfi_breg dst, \
+        vstr    \vreg, [\B, #\base]
+        sfi_breg src, \
+        vldr    \vreg, [\B, #\base]
+        sfi_breg dst, \
+        vstr    d0, [\B, #\base + 8]
+        sfi_breg src, \
+        vldr    d0, [\B, #\base + 8]
+        sfi_breg dst, \
+        vstr    d1, [\B, #\base + 16]
+        sfi_breg src, \
+        vldr    d1, [\B, #\base + 16]
+        sfi_breg dst, \
+        vstr    d2, [\B, #\base + 24]
+        sfi_breg src, \
+        vldr    d2, [\B, #\base + 24]
+        sfi_breg dst, \
+        vstr    \vreg, [\B, #\base + 32]
+        sfi_breg dst, \
+        vstr    d0, [\B, #\base + 40]
+        sfi_breg src, \
+        vldr    d0, [\B, #\base + 40]
+        sfi_breg dst, \
+        vstr    d1, [\B, #\base + 48]
+        sfi_breg src, \
+        vldr    d1, [\B, #\base + 48]
+        sfi_breg dst, \
+        vstr    d2, [\B, #\base + 56]
+        sfi_breg src, \
+        vldr    d2, [\B, #\base + 56]
+        .endm
+#endif
+
+        .p2align 6
+ENTRY(memcpy)
+
+        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
+        cmp     count, #64
+        bge     .Lcpy_not_short
+        /* Deal with small copies quickly by dropping straight into the
+           exit block.  */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+        /* These need an extra layer of macro just to work around a
+           bug in the assembler's parser when an operand starts with
+           a {...}.  http://sourceware.org/bugzilla/show_bug.cgi?id=15647
+           tracks that bug; it was not fixed as of binutils-2.23.2.  */
+        .macro neon_load_d0 reg
+        vld1.8  {d0}, [\reg]!
+        .endm
+        .macro neon_store_d0 reg
+        vst1.8  {d0}, [\reg]!
+        .endm
+
+        /* These are used by the NaCl sfi_breg macro.  */
+        .macro _sfi_breg_dmask_neon_load_d0 reg
+        _sfi_dmask \reg
+        .endm
+        .macro _sfi_breg_dmask_neon_store_d0 reg
+        _sfi_dmask \reg
+        .endm
+
+        and     tmp1, count, #0x38
+        .macro dispatch_step i
+        sfi_breg src, neon_load_d0 \B
+        sfi_breg dst, neon_store_d0 \B
+        .endm
+        dispatch_7_dword
+
+        tst     count, #4
+        sfi_breg src, \
+        ldrne   tmp1, [\B], #4
+        sfi_breg dst, \
+        strne   tmp1, [\B], #4
+#else
+        /* Copy up to 15 full words of data.  May not be aligned.  */
+        /* Cannot use VFP for unaligned data.  */
+        and     tmp1, count, #0x3c
+        add     dst, dst, tmp1
+        add     src, src, tmp1
+        /* Jump directly into the sequence below at the correct offset.  */
+        .macro dispatch_step i
+        sfi_breg src, \
+        ldr     tmp1, [\B, #-(\i * 4)]
+        sfi_breg dst, \
+        str     tmp1, [\B, #-(\i * 4)]
+        .endm
+        dispatch_15_word
+#endif
+
+        lsls    count, count, #31
+        sfi_breg src, \
+        ldrhcs  tmp1, [\B], #2
+        sfi_breg src, \
+        ldrbne  src, [\B]       /* Src is dead, use as a scratch.  */
+        sfi_breg dst, \
+        strhcs  tmp1, [\B], #2
+        sfi_breg dst, \
+        strbne  src, [\B]
+        bx      lr
+
+.Lcpy_not_short:
+        /* At least 64 bytes to copy, but don't know the alignment yet.  */
+        str     tmp2, [sp, #-FRAME_SIZE]!
+        cfi_adjust_cfa_offset (FRAME_SIZE)
+        cfi_rel_offset (tmp2, 0)
+        cfi_remember_state
+        and     tmp2, src, #7
+        and     tmp1, dst, #7
+        cmp     tmp1, tmp2
+        bne     .Lcpy_notaligned
+
+#ifdef USE_VFP
+        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
+           that the FP pipeline is much better at streaming loads and
+           stores.  This is outside the critical loop.  */
+        vmov.f32        s0, s0
+#endif
+
+        /* SRC and DST have the same mutual 64-bit alignment, but we may
+           still need to pre-copy some bytes to get to natural alignment.
+           We bring SRC and DST into full 64-bit alignment.  */
+        lsls    tmp2, dst, #29
+        beq     1f
+        rsbs    tmp2, tmp2, #0
+        sub     count, count, tmp2, lsr #29
+        sfi_breg src, \
+        ldrmi   tmp1, [\B], #4
+        sfi_breg dst, \
+        strmi   tmp1, [\B], #4
+        lsls    tmp2, tmp2, #2
+        sfi_breg src, \
+        ldrhcs  tmp1, [\B], #2
+        sfi_breg src, \
+        ldrbne  tmp2, [\B], #1
+        sfi_breg dst, \
+        strhcs  tmp1, [\B], #2
+        sfi_breg dst, \
+        strbne  tmp2, [\B], #1
+
+1:
+        subs    tmp2, count, #64        /* Use tmp2 for count.  */
+        blt     .Ltail63aligned
+
+        cmp     tmp2, #512
+        bge     .Lcpy_body_long
+
+.Lcpy_body_medium:                      /* Count in tmp2.  */
+#ifdef USE_VFP
+1:
+        sfi_breg src, \
+        vldr    d0, [\B, #0]
+        subs    tmp2, tmp2, #64
+        sfi_breg src, \
+        vldr    d1, [\B, #8]
+        sfi_breg dst, \
+        vstr    d0, [\B, #0]
+        sfi_breg src, \
+        vldr    d0, [\B, #16]
+        sfi_breg dst, \
+        vstr    d1, [\B, #8]
+        sfi_breg src, \
+        vldr    d1, [\B, #24]
+        sfi_breg dst, \
+        vstr    d0, [\B, #16]
+        sfi_breg src, \
+        vldr    d0, [\B, #32]
+        sfi_breg dst, \
+        vstr    d1, [\B, #24]
+        sfi_breg src, \
+        vldr    d1, [\B, #40]
+        sfi_breg dst, \
+        vstr    d0, [\B, #32]
+        sfi_breg src, \
+        vldr    d0, [\B, #48]
+        sfi_breg dst, \
+        vstr    d1, [\B, #40]
+        sfi_breg src, \
+        vldr    d1, [\B, #56]
+        sfi_breg dst, \
+        vstr    d0, [\B, #48]
+        add     src, src, #64
+        sfi_breg dst, \
+        vstr    d1, [\B, #56]
+        add     dst, dst, #64
+        bge     1b
+        tst     tmp2, #0x3f
+        beq     .Ldone
+
+.Ltail63aligned:                        /* Count in tmp2.  */
+        and     tmp1, tmp2, #0x38
+        add     dst, dst, tmp1
+        add     src, src, tmp1
+        .macro dispatch_step i
+        sfi_breg src, \
+        vldr    d0, [\B, #-(\i * 8)]
+        sfi_breg dst, \
+        vstr    d0, [\B, #-(\i * 8)]
+        .endm
+        dispatch_7_dword
+#else
+        sub     src, src, #8
+        sub     dst, dst, #8
+1:
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #8]
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #8]
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #16]
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #16]
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #24]
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #24]
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #32]
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #32]
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #40]
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #40]
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #48]
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #48]
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #56]
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #56]
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #64]!
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #64]!
+        subs    tmp2, tmp2, #64
+        bge     1b
+        tst     tmp2, #0x3f
+        bne     1f
+        ldr     tmp2, [sp], #FRAME_SIZE
+        cfi_adjust_cfa_offset (-FRAME_SIZE)
+        cfi_restore (tmp2)
+        bx      lr
+
+        cfi_restore_state
+        cfi_remember_state
+1:
+        add     src, src, #8
+        add     dst, dst, #8
+
+.Ltail63aligned:                        /* Count in tmp2.  */
+        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
+           we know that the src and dest are 64-bit aligned so we can use
+           LDRD/STRD to improve efficiency.  */
+        /* TMP2 is now negative, but we don't care about that.  The bottom
+           six bits still tell us how many bytes are left to copy.  */
+
+        and     tmp1, tmp2, #0x38
+        add     dst, dst, tmp1
+        add     src, src, tmp1
+        .macro dispatch_step i
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #-(\i * 8)]
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #-(\i * 8)]
+        .endm
+        dispatch_7_dword
+#endif
+
+        tst     tmp2, #4
+        sfi_breg src, \
+        ldrne   tmp1, [\B], #4
+        sfi_breg dst, \
+        strne   tmp1, [\B], #4
+        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
+        sfi_breg src, \
+        ldrhcs  tmp1, [\B], #2
+        sfi_breg src, \
+        ldrbne  tmp2, [\B]
+        sfi_breg dst, \
+        strhcs  tmp1, [\B], #2
+        sfi_breg dst, \
+        strbne  tmp2, [\B]
+
+.Ldone:
+        ldr     tmp2, [sp], #FRAME_SIZE
+        cfi_adjust_cfa_offset (-FRAME_SIZE)
+        cfi_restore (tmp2)
+        bx      lr
+
+        cfi_restore_state
+        cfi_remember_state
+
+.Lcpy_body_long:                        /* Count in tmp2.  */
+
+        /* Long copy.  We know that there's at least (prefetch_lines * 64)
+           bytes to go.  */
+#ifdef USE_VFP
+        /* Don't use PLD.  Instead, read some data in advance of the current
+           copy position into a register.  This should act like a PLD
+           operation but we won't have to repeat the transfer.  */
+
+        sfi_breg src, \
+        vldr    d3, [\B, #0]
+        sfi_breg src, \
+        vldr    d4, [\B, #64]
+        sfi_breg src, \
+        vldr    d5, [\B, #128]
+        sfi_breg src, \
+        vldr    d6, [\B, #192]
+        sfi_breg src, \
+        vldr    d7, [\B, #256]
+
+        sfi_breg src, \
+        vldr    d0, [\B, #8]
+        sfi_breg src, \
+        vldr    d1, [\B, #16]
+        sfi_breg src, \
+        vldr    d2, [\B, #24]
+        add     src, src, #32
+
+        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
+        blt     2f
+1:
+        cpy_line_vfp    d3, 0
+        cpy_line_vfp    d4, 64
+        cpy_line_vfp    d5, 128
+        add     dst, dst, #3 * 64
+        add     src, src, #3 * 64
+        cpy_line_vfp    d6, 0
+        cpy_line_vfp    d7, 64
+        add     dst, dst, #2 * 64
+        add     src, src, #2 * 64
+        subs    tmp2, tmp2, #prefetch_lines * 64
+        bge     1b
+
+2:
+        cpy_tail_vfp    d3, 0
+        cpy_tail_vfp    d4, 64
+        cpy_tail_vfp    d5, 128
+        add     src, src, #3 * 64
+        add     dst, dst, #3 * 64
+        cpy_tail_vfp    d6, 0
+        sfi_breg dst, \
+        vstr    d7, [\B, #64]
+        sfi_breg src, \
+        vldr    d7, [\B, #64]
+        sfi_breg dst, \
+        vstr    d0, [\B, #64 + 8]
+        sfi_breg src, \
+        vldr    d0, [\B, #64 + 8]
+        sfi_breg dst, \
+        vstr    d1, [\B, #64 + 16]
+        sfi_breg src, \
+        vldr    d1, [\B, #64 + 16]
+        sfi_breg dst, \
+        vstr    d2, [\B, #64 + 24]
+        sfi_breg src, \
+        vldr    d2, [\B, #64 + 24]
+        sfi_breg dst, \
+        vstr    d7, [\B, #64 + 32]
+        add     src, src, #96
+        sfi_breg dst, \
+        vstr    d0, [\B, #64 + 40]
+        sfi_breg dst, \
+        vstr    d1, [\B, #64 + 48]
+        sfi_breg dst, \
+        vstr    d2, [\B, #64 + 56]
+        add     dst, dst, #128
+        add     tmp2, tmp2, #prefetch_lines * 64
+        b       .Lcpy_body_medium
+#else
+        /* Long copy.  Use an SMS style loop to maximize the I/O
+           bandwidth of the core.  We don't have enough spare registers
+           to synthesise prefetching, so use PLD operations.  */
+        /* Pre-bias src and dst.  */
+        sub     src, src, #8
+        sub     dst, dst, #8
+        sfi_pld src, #8
+        sfi_pld src, #72
+        subs    tmp2, tmp2, #64
+        sfi_pld src, #136
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #8]
+        strd    B_l, B_h, [sp, #8]
+        cfi_rel_offset (B_l, 8)
+        cfi_rel_offset (B_h, 12)
+        sfi_breg src, \
+        ldrd    B_l, B_h, [\B, #16]
+        strd    C_l, C_h, [sp, #16]
+        cfi_rel_offset (C_l, 16)
+        cfi_rel_offset (C_h, 20)
+        sfi_breg src, \
+        ldrd    C_l, C_h, [\B, #24]
+        strd    D_l, D_h, [sp, #24]
+        cfi_rel_offset (D_l, 24)
+        cfi_rel_offset (D_h, 28)
+        sfi_pld src, #200
+        sfi_breg src, \
+        ldrd    D_l, D_h, [\B, #32]!
+        b       1f
+        .p2align 6
+2:
+        sfi_pld src, #232
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #40]
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #40]
+        sfi_breg dst, \
+        strd    B_l, B_h, [\B, #48]
+        sfi_breg src, \
+        ldrd    B_l, B_h, [\B, #48]
+        sfi_breg dst, \
+        strd    C_l, C_h, [\B, #56]
+        sfi_breg src, \
+        ldrd    C_l, C_h, [\B, #56]
+        sfi_breg dst, \
+        strd    D_l, D_h, [\B, #64]!
+        sfi_breg src, \
+        ldrd    D_l, D_h, [\B, #64]!
+        subs    tmp2, tmp2, #64
+1:
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #8]
+        sfi_breg src, \
+        ldrd    A_l, A_h, [\B, #8]
+        sfi_breg dst, \
+        strd    B_l, B_h, [\B, #16]
+        sfi_breg src, \
+        ldrd    B_l, B_h, [\B, #16]
+        sfi_breg dst, \
+        strd    C_l, C_h, [\B, #24]
+        sfi_breg src, \
+        ldrd    C_l, C_h, [\B, #24]
+        sfi_breg dst, \
+        strd    D_l, D_h, [\B, #32]
+        sfi_breg src, \
+        ldrd    D_l, D_h, [\B, #32]
+        bcs     2b
+        /* Save the remaining bytes and restore the callee-saved regs.  */
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #40]
+        add     src, src, #40
+        sfi_breg dst, \
+        strd    B_l, B_h, [\B, #48]
+        ldrd    B_l, B_h, [sp, #8]
+        cfi_restore (B_l)
+        cfi_restore (B_h)
+        sfi_breg dst, \
+        strd    C_l, C_h, [\B, #56]
+        ldrd    C_l, C_h, [sp, #16]
+        cfi_restore (C_l)
+        cfi_restore (C_h)
+        sfi_breg dst, \
+        strd    D_l, D_h, [\B, #64]
+        ldrd    D_l, D_h, [sp, #24]
+        cfi_restore (D_l)
+        cfi_restore (D_h)
+        add     dst, dst, #72
+        tst     tmp2, #0x3f
+        bne     .Ltail63aligned
+        ldr     tmp2, [sp], #FRAME_SIZE
+        cfi_adjust_cfa_offset (-FRAME_SIZE)
+        cfi_restore (tmp2)
+        bx      lr
+#endif
+
+        cfi_restore_state
+        cfi_remember_state
+
+.Lcpy_notaligned:
+        sfi_pld src
+        sfi_pld src, #64
+        /* There's at least 64 bytes to copy, but there is no mutual
+           alignment.  */
+        /* Bring DST to 64-bit alignment.  */
+        lsls    tmp2, dst, #29
+        sfi_pld src, #(2 * 64)
+        beq     1f
+        rsbs    tmp2, tmp2, #0
+        sub     count, count, tmp2, lsr #29
+        sfi_breg src, \
+        ldrmi   tmp1, [\B], #4
+        sfi_breg dst, \
+        strmi   tmp1, [\B], #4
+        lsls    tmp2, tmp2, #2
+        sfi_breg src, \
+        ldrbne  tmp1, [\B], #1
+        sfi_breg src, \
+        ldrhcs  tmp2, [\B], #2
+        sfi_breg dst, \
+        strbne  tmp1, [\B], #1
+        sfi_breg dst, \
+        strhcs  tmp2, [\B], #2
+1:
+        sfi_pld src, #(3 * 64)
+        subs    count, count, #64
+        ldrmi   tmp2, [sp], #FRAME_SIZE
+        bmi     .Ltail63unaligned
+        sfi_pld src, #(4 * 64)
+
+#ifdef USE_NEON
+        /* These need an extra layer of macro just to work around a
+           bug in the assembler's parser when an operand starts with
+           a {...}.  */
+        .macro neon_load_multi reglist, basereg
+        vld1.8  {\reglist}, [\basereg]!
+        .endm
+        .macro neon_store_multi reglist, basereg
+        vst1.8  {\reglist}, [ALIGN (\basereg, 64)]!
+        .endm
+
+        /* These are used by the NaCl sfi_breg macro.  */
+        .macro _sfi_breg_dmask_neon_load_multi reg
+        _sfi_dmask \reg
+        .endm
+        .macro _sfi_breg_dmask_neon_store_multi reg
+        _sfi_dmask \reg
+        .endm
+
+        sfi_breg src, neon_load_multi d0-d3, \B
+        sfi_breg src, neon_load_multi d4-d7, \B
+        subs    count, count, #64
+        bmi     2f
+1:
+        sfi_pld src, #(4 * 64)
+        sfi_breg dst, neon_store_multi d0-d3, \B
+        sfi_breg src, neon_load_multi d0-d3, \B
+        sfi_breg dst, neon_store_multi d4-d7, \B
+        sfi_breg src, neon_load_multi d4-d7, \B
+        subs    count, count, #64
+        bpl     1b
+2:
+        sfi_breg dst, neon_store_multi d0-d3, \B
+        sfi_breg dst, neon_store_multi d4-d7, \B
+        ands    count, count, #0x3f
+#else
+        /* Use an SMS style loop to maximize the I/O bandwidth.  */
+        sub     src, src, #4
+        sub     dst, dst, #8
+        subs    tmp2, count, #64        /* Use tmp2 for count.  */
+        sfi_breg src, \
+        ldr     A_l, [\B, #4]
+        sfi_breg src, \
+        ldr     A_h, [\B, #8]
+        strd    B_l, B_h, [sp, #8]
+        cfi_rel_offset (B_l, 8)
+        cfi_rel_offset (B_h, 12)
+        sfi_breg src, \
+        ldr     B_l, [\B, #12]
+        sfi_breg src, \
+        ldr     B_h, [\B, #16]
+        strd    C_l, C_h, [sp, #16]
+        cfi_rel_offset (C_l, 16)
+        cfi_rel_offset (C_h, 20)
+        sfi_breg src, \
+        ldr     C_l, [\B, #20]
+        sfi_breg src, \
+        ldr     C_h, [\B, #24]
+        strd    D_l, D_h, [sp, #24]
+        cfi_rel_offset (D_l, 24)
+        cfi_rel_offset (D_h, 28)
+        sfi_breg src, \
+        ldr     D_l, [\B, #28]
+        sfi_breg src, \
+        ldr     D_h, [\B, #32]!
+        b       1f
+        .p2align 6
+2:
+        sfi_pld src, #(5 * 64) - (32 - 4)
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #40]
+        sfi_breg src, \
+        ldr     A_l, [\B, #36]
+        sfi_breg src, \
+        ldr     A_h, [\B, #40]
+        sfi_breg dst, \
+        strd    B_l, B_h, [\B, #48]
+        sfi_breg src, \
+        ldr     B_l, [\B, #44]
+        sfi_breg src, \
+        ldr     B_h, [\B, #48]
+        sfi_breg dst, \
+        strd    C_l, C_h, [\B, #56]
+        sfi_breg src, \
+        ldr     C_l, [\B, #52]
+        sfi_breg src, \
+        ldr     C_h, [\B, #56]
+        sfi_breg dst, \
+        strd    D_l, D_h, [\B, #64]!
+        sfi_breg src, \
+        ldr     D_l, [\B, #60]
+        sfi_breg src, \
+        ldr     D_h, [\B, #64]!
+        subs    tmp2, tmp2, #64
+1:
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #8]
+        sfi_breg src, \
+        ldr     A_l, [\B, #4]
+        sfi_breg src, \
+        ldr     A_h, [\B, #8]
+        sfi_breg dst, \
+        strd    B_l, B_h, [\B, #16]
+        sfi_breg src, \
+        ldr     B_l, [\B, #12]
+        sfi_breg src, \
+        ldr     B_h, [\B, #16]
+        sfi_breg dst, \
+        strd    C_l, C_h, [\B, #24]
+        sfi_breg src, \
+        ldr     C_l, [\B, #20]
+        sfi_breg src, \
+        ldr     C_h, [\B, #24]
+        sfi_breg dst, \
+        strd    D_l, D_h, [\B, #32]
+        sfi_breg src, \
+        ldr     D_l, [\B, #28]
+        sfi_breg src, \
+        ldr     D_h, [\B, #32]
+        bcs     2b
+
+        /* Save the remaining bytes and restore the callee-saved regs.  */
+        sfi_breg dst, \
+        strd    A_l, A_h, [\B, #40]
+        add     src, src, #36
+        sfi_breg dst, \
+        strd    B_l, B_h, [\B, #48]
+        ldrd    B_l, B_h, [sp, #8]
+        cfi_restore (B_l)
+        cfi_restore (B_h)
+        sfi_breg dst, \
+        strd    C_l, C_h, [\B, #56]
+        ldrd    C_l, C_h, [sp, #16]
+        cfi_restore (C_l)
+        cfi_restore (C_h)
+        sfi_breg dst, \
+        strd    D_l, D_h, [\B, #64]
+        ldrd    D_l, D_h, [sp, #24]
+        cfi_restore (D_l)
+        cfi_restore (D_h)
+        add     dst, dst, #72
+        ands    count, tmp2, #0x3f
+#endif
+        ldr     tmp2, [sp], #FRAME_SIZE
+        cfi_adjust_cfa_offset (-FRAME_SIZE)
+        cfi_restore (tmp2)
+        bne     .Ltail63unaligned
+        bx      lr
+
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
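
The dispatch_7_dword/dispatch_15_word macros above jump into the middle of an unrolled copy sequence so that exactly the right number of load/store steps execute. As a rough illustration (not part of the commit; the helper and test below are hypothetical), the same technique in C is a Duff's-device-style switch with deliberate fall-through:

#include <stdint.h>
#include <stdio.h>

/* Copy `bytes` bytes (a multiple of 4, at most 60) by jumping into
   the middle of an unrolled word-copy sequence -- the C analogue of
   "add pc, pc, tmp1" followed by fifteen dispatch_step expansions.  */
static void copy_tail_words (uint32_t *dst, const uint32_t *src,
                             unsigned bytes)
{
  unsigned words = bytes / 4;
  /* Pre-bias the pointers, as the assembly does with
     "add dst, dst, tmp1" / "add src, src, tmp1".  */
  dst += words;
  src += words;
  switch (words)        /* Fall-through is deliberate on every case.  */
    {
    case 15: dst[-15] = src[-15];
    case 14: dst[-14] = src[-14];
    case 13: dst[-13] = src[-13];
    case 12: dst[-12] = src[-12];
    case 11: dst[-11] = src[-11];
    case 10: dst[-10] = src[-10];
    case 9:  dst[-9]  = src[-9];
    case 8:  dst[-8]  = src[-8];
    case 7:  dst[-7]  = src[-7];
    case 6:  dst[-6]  = src[-6];
    case 5:  dst[-5]  = src[-5];
    case 4:  dst[-4]  = src[-4];
    case 3:  dst[-3]  = src[-3];
    case 2:  dst[-2]  = src[-2];
    case 1:  dst[-1]  = src[-1];
    case 0:  break;
    }
}

int main (void)
{
  uint32_t in[15] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  uint32_t out[15] = { 0 };
  copy_tail_words (out, in, 36);        /* Copies the first 9 words.  */
  printf ("%u %u\n", out[0], out[8]);   /* Prints "1 9".  */
  return 0;
}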
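
The .Lcpy_not_short path only takes the doubleword loops when src and dst agree in their low three address bits ("and tmp2, src, #7" / "and tmp1, dst, #7" / "cmp"), and first copies up to 7 leading bytes to reach a 64-bit boundary. A C sketch of that logic, with hypothetical helper names, might look like:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Mirror of the alignment test in .Lcpy_not_short: the doubleword
   loops are only usable when src and dst are mutually 64-bit
   aligned, i.e. their addresses agree in the low three bits.  */
static int mutually_aligned (const void *dst, const void *src)
{
  return ((uintptr_t) dst & 7) == ((uintptr_t) src & 7);
}

/* Copy the 0-7 leading bytes that bring dst (and, when mutually
   aligned, src) up to an 8-byte boundary; returns the remaining
   count.  The assembly does this with conditional word/half/byte
   loads driven by the shifted-out alignment bits.  */
static size_t precopy_to_alignment (unsigned char **dst,
                                    const unsigned char **src,
                                    size_t count)
{
  size_t pre = (8 - ((uintptr_t) *dst & 7)) & 7;
  memcpy (*dst, *src, pre);     /* count >= 64 on this path, so pre < count.  */
  *dst += pre;
  *src += pre;
  return count - pre;
}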
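
In the VFP long-copy loop, cpy_line_vfp replaces PLD with a real load: while copying each 64-byte line it also loads one doubleword from prefetch_lines (5) lines ahead, so the cache line is warmed and the fetched data is later reused rather than transferred twice. A loose C model of that read-ahead idea (names and bounds handling are illustrative, not the commit's code):

#include <stdint.h>
#include <stddef.h>

/* Register-based prefetch, as in cpy_line_vfp: alongside copying the
   current 64-byte line, touch one doubleword PREFETCH_LINES lines
   ahead.  The early load acts like a PLD, but the assembly keeps the
   value in a rotating register and uses it when the loop gets there.  */
static void copy_with_read_ahead (uint64_t *dst, const uint64_t *src,
                                  size_t lines)  /* number of 64-byte lines */
{
  enum { PREFETCH_LINES = 5, WORDS = 8 };        /* 8 uint64_t per line */
  size_t i, j;
  for (i = 0; i < lines; i++)
    {
      /* Bounds-checked here; the assembly instead stops the main loop
         prefetch_lines * 64 bytes short of the end and drains the tail.  */
      if (i + PREFETCH_LINES < lines)
        {
          volatile uint64_t prefetched = src[(i + PREFETCH_LINES) * WORDS];
          (void) prefetched;
        }
      for (j = 0; j < WORDS; j++)
        dst[i * WORDS + j] = src[i * WORDS + j];
    }
}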
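
The non-VFP long copy uses what the comments call an "SMS style" loop: a software-pipelined schedule in which each iteration stores the block loaded on the previous iteration while loading the next one, keeping loads and stores in flight together. A simplified C rendering of that schedule (function name illustrative; the real loop also issues PLDs and pre-biases its pointers):

#include <stdint.h>
#include <stddef.h>

/* Software-pipelined copy of `blocks` 32-byte blocks (blocks >= 1).
   The loads for block i+1 are issued right after each store of
   block i, matching the strd/ldrd interleaving in the 1:/2: loop.  */
static void copy_pipelined (uint64_t *dst, const uint64_t *src,
                            size_t blocks)
{
  /* Prologue: load the first block (cf. the ldrd run before "b 1f").  */
  uint64_t a = src[0], b = src[1], c = src[2], d = src[3];
  while (--blocks)
    {
      src += 4;
      dst[0] = a; a = src[0];   /* Store block i, load block i+1.  */
      dst[1] = b; b = src[1];
      dst[2] = c; c = src[2];
      dst[3] = d; d = src[3];
      dst += 4;
    }
  /* Epilogue: drain the last loaded block.  */
  dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d;
}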