Diffstat (limited to 'sysdeps/arm/armv7')
-rw-r--r--   sysdeps/arm/armv7/Implies                      |   2
-rw-r--r--   sysdeps/arm/armv7/multiarch/Makefile           |   3
-rw-r--r--   sysdeps/arm/armv7/multiarch/aeabi_memcpy.c     |   2
-rw-r--r--   sysdeps/arm/armv7/multiarch/ifunc-impl-list.c  |  56
-rw-r--r--   sysdeps/arm/armv7/multiarch/memcpy.S           |  76
-rw-r--r--   sysdeps/arm/armv7/multiarch/memcpy_impl.S      | 728
-rw-r--r--   sysdeps/arm/armv7/multiarch/memcpy_neon.S      |   9
-rw-r--r--   sysdeps/arm/armv7/multiarch/memcpy_vfp.S       |   7
-rw-r--r--   sysdeps/arm/armv7/strcmp.S                     | 519
9 files changed, 0 insertions, 1402 deletions
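
The files removed below carried glibc's IFUNC plumbing for the ARMv7 string routines: ifunc-impl-list.c enumerates the available memcpy implementations for tools that list them, and memcpy.S contains an assembly gnu_indirect_function resolver that picks __memcpy_neon, __memcpy_vfp or __memcpy_arm at load time from the HWCAP bits passed to it, with __memcpy_arm also backing the __aeabi_memcpy aliases because those entry points must not clobber VFP/NEON registers. As a rough illustration of that selection order only, not the deleted code itself (pick_memcpy and the getauxval-based main are demo scaffolding), a standalone C sketch might read:

/* Model of the resolver's priority order: NEON first, then VFP, then the
   plain ARM copy.  The HWCAP_ARM_* values mirror the Linux ARM hwcap bits.  */
#include <stdio.h>
#include <sys/auxv.h>                  /* getauxval, AT_HWCAP */

#define HWCAP_ARM_VFP   (1UL << 6)
#define HWCAP_ARM_NEON  (1UL << 12)

static const char *
pick_memcpy (unsigned long hwcap)
{
  if (hwcap & HWCAP_ARM_NEON)
    return "__memcpy_neon";
  if (hwcap & HWCAP_ARM_VFP)
    return "__memcpy_vfp";
  /* __memcpy_arm is also what __aeabi_memcpy{,4,8} alias, since those
     entry points must not touch VFP/NEON registers.  */
  return "__memcpy_arm";
}

int
main (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);   /* only meaningful on ARM Linux */
  printf ("memcpy would resolve to %s\n", pick_memcpy (hwcap));
  return 0;
}

When __SOFTFP__ is not defined, the real resolver skips the VFP test and treats __memcpy_vfp as the baseline, so the sketch corresponds to the soft-float configuration.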
diff --git a/sysdeps/arm/armv7/Implies b/sysdeps/arm/armv7/Implies deleted file mode 100644 index c6cd0eb877..0000000000 --- a/sysdeps/arm/armv7/Implies +++ /dev/null @@ -1,2 +0,0 @@ -# We can do everything that 6T2 can -arm/armv6t2 diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile deleted file mode 100644 index e834cc937f..0000000000 --- a/sysdeps/arm/armv7/multiarch/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -ifeq ($(subdir),string) -sysdep_routines += memcpy_neon memcpy_vfp -endif diff --git a/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c b/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c deleted file mode 100644 index c6a2a98a55..0000000000 --- a/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c +++ /dev/null @@ -1,2 +0,0 @@ -/* Empty file to override sysdeps/arm version. See memcpy.S for definitions - of these functions. */ diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c deleted file mode 100644 index b8094fd393..0000000000 --- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c +++ /dev/null @@ -1,56 +0,0 @@ -/* Enumerate available IFUNC implementations of a function. ARM version. - Copyright (C) 2013-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <stdbool.h> -#include <string.h> -#include <ldsodefs.h> -#include <sysdep.h> -#include <ifunc-impl-list.h> - -/* Fill ARRAY of MAX elements with IFUNC implementations for function - NAME and return the number of valid entries. */ - -size_t -__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, - size_t max) -{ - size_t i = 0; - - bool use_neon = true; -#ifdef __ARM_NEON__ -# define __memcpy_neon memcpy -#else - use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0; -#endif - -#ifndef __ARM_NEON__ - bool use_vfp = true; -# ifdef __SOFTFP__ - use_vfp = (GLRO(dl_hwcap) & HWCAP_ARM_VFP) != 0; -# endif -#endif - - IFUNC_IMPL (i, name, memcpy, - IFUNC_IMPL_ADD (array, i, memcpy, use_neon, __memcpy_neon) -#ifndef __ARM_NEON__ - IFUNC_IMPL_ADD (array, i, memcpy, use_vfp, __memcpy_vfp) -#endif - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm)); - - return i; -} diff --git a/sysdeps/arm/armv7/multiarch/memcpy.S b/sysdeps/arm/armv7/multiarch/memcpy.S deleted file mode 100644 index 8a53bdaf91..0000000000 --- a/sysdeps/arm/armv7/multiarch/memcpy.S +++ /dev/null @@ -1,76 +0,0 @@ -/* Multiple versions of memcpy - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2013-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -/* Thumb requires excess IT instructions here. */ -#define NO_THUMB -#include <sysdep.h> -#include <rtld-global-offsets.h> - -#if IS_IN (libc) -/* Under __ARM_NEON__, memcpy_neon.S defines the name memcpy. */ -# ifndef __ARM_NEON__ - .text -ENTRY(memcpy) - .type memcpy, %gnu_indirect_function -# ifdef __SOFTFP__ - ldr r1, .Lmemcpy_arm - tst r0, #HWCAP_ARM_VFP - ldrne r1, .Lmemcpy_vfp -# else - ldr r1, .Lmemcpy_vfp -# endif - tst r0, #HWCAP_ARM_NEON - ldrne r1, .Lmemcpy_neon -1: - add r0, r1, pc - DO_RET(lr) - -# ifdef __SOFTFP__ -.Lmemcpy_arm: - .long C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS -# endif -.Lmemcpy_neon: - .long C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS -.Lmemcpy_vfp: - .long C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS - -END(memcpy) - -libc_hidden_builtin_def (memcpy) -#endif /* Not __ARM_NEON__. */ - -/* These versions of memcpy are defined not to clobber any VFP or NEON - registers so they must always call the ARM variant of the memcpy code. */ -strong_alias (__memcpy_arm, __aeabi_memcpy) -strong_alias (__memcpy_arm, __aeabi_memcpy4) -strong_alias (__memcpy_arm, __aeabi_memcpy8) -libc_hidden_def (__memcpy_arm) - -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) -#undef weak_alias -#define weak_alias(x, y) -#undef libc_hidden_def -#define libc_hidden_def(name) - -#define memcpy __memcpy_arm - -#endif - -#include "memcpy_impl.S" diff --git a/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/sysdeps/arm/armv7/multiarch/memcpy_impl.S deleted file mode 100644 index c1b9fb0ab5..0000000000 --- a/sysdeps/arm/armv7/multiarch/memcpy_impl.S +++ /dev/null @@ -1,728 +0,0 @@ -/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15. - Copyright (C) 2013-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. - - This memcpy routine is optimised for Cortex-A15 cores and takes advantage - of VFP or NEON when built with the appropriate flags. - - Assumptions: - - ARMv6 (ARMv7-a if using Neon) - ARM state - Unaligned accesses - - */ - -/* Thumb cannot encode negative immediate offsets in memory operations. */ -#ifndef NO_THUMB -#define NO_THUMB -#endif -#include <sysdep.h> -#include <arm-features.h> - - .syntax unified - /* This implementation requires ARM state. 
*/ - .arm - -#ifdef MEMCPY_NEON - - .fpu neon - .arch armv7-a -# define FRAME_SIZE 4 -# define USE_VFP -# define USE_NEON - -#elif defined (MEMCPY_VFP) - - .arch armv6 - .fpu vfpv2 -# define FRAME_SIZE 32 -# define USE_VFP - -#else - .arch armv6 -# define FRAME_SIZE 32 - -#endif - -#define ALIGN(addr, align) addr:align - -#define INSN_SIZE 4 - -/* Call parameters. */ -#define dstin r0 -#define src r1 -#define count r2 - -/* Locals. */ -#define tmp1 r3 -#define dst ip -#define tmp2 r8 - -/* These two macros both work by repeated invocation of the macro - dispatch_step (not defined here). That macro performs one "step", - doing one load instruction and one store instruction to copy one - "unit". On entry, TMP1 contains the number of bytes to be copied, - a multiple of the unit size. The macro clobbers TMP1 in the - process of doing a computed jump to the tail containing the - appropriate number of steps. - - In dispatch_7_dword, dispatch_step is invoked seven times, with an - argument that is 7 for the first and 1 for the last. Units are - double-words (8 bytes). TMP1 is at most 56. - - In dispatch_15_word, dispatch_step is invoked fifteen times, - with an argument that is 15 for the first and 1 for the last. - Units are words (4 bytes). TMP1 is at most 60. */ - -#ifndef ARM_ALWAYS_BX -# if ARM_BX_ALIGN_LOG2 != 2 -# error case not handled -# endif - .macro dispatch_7_dword - rsb tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE) - add pc, pc, tmp1 - dispatch_step 7 - dispatch_step 6 - dispatch_step 5 - dispatch_step 4 - dispatch_step 3 - dispatch_step 2 - dispatch_step 1 - .purgem dispatch_step - .endm - - .macro dispatch_15_word - rsb tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2) - add pc, pc, tmp1, lsl #1 - dispatch_step 15 - dispatch_step 14 - dispatch_step 13 - dispatch_step 12 - dispatch_step 11 - dispatch_step 10 - dispatch_step 9 - dispatch_step 8 - dispatch_step 7 - dispatch_step 6 - dispatch_step 5 - dispatch_step 4 - dispatch_step 3 - dispatch_step 2 - dispatch_step 1 - .purgem dispatch_step - .endm -#else -# if ARM_BX_ALIGN_LOG2 < 3 -# error case not handled -# endif - .macro dispatch_helper steps, log2_bytes_per_step - /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is - (STEPS << LOG2_BYTES_PER_STEP). - So this is (steps_to_skip << LOG2_BYTES_PER_STEP). - Then it needs further adjustment to compensate for the - distance between the PC value taken below (0f + PC_OFS) - and the first step's instructions (1f). */ - rsb tmp1, tmp1, #((\steps << \log2_bytes_per_step) \ - + ((1f - PC_OFS - 0f) \ - >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step))) - /* Shifting down LOG2_BYTES_PER_STEP gives us the number of - steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us - the (byte) distance to add to the PC. 
*/ -0: add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step) - bx tmp1 - .p2align ARM_BX_ALIGN_LOG2 -1: - .endm - - .macro dispatch_7_dword - dispatch_helper 7, 3 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 7 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 6 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 5 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 4 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 3 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 2 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 1 - .p2align ARM_BX_ALIGN_LOG2 - .purgem dispatch_step - .endm - - .macro dispatch_15_word - dispatch_helper 15, 2 - dispatch_step 15 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 14 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 13 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 12 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 11 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 10 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 9 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 8 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 7 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 6 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 5 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 4 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 3 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 2 - .p2align ARM_BX_ALIGN_LOG2 - dispatch_step 1 - .p2align ARM_BX_ALIGN_LOG2 - .purgem dispatch_step - .endm - -#endif - -#ifndef USE_NEON -/* For bulk copies using GP registers. */ -#define A_l r2 /* Call-clobbered. */ -#define A_h r3 /* Call-clobbered. */ -#define B_l r4 -#define B_h r5 -#define C_l r6 -#define C_h r7 -/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved. */ -#define D_l r10 -#define D_h r11 -#endif - -/* Number of lines ahead to pre-fetch data. If you change this the code - below will need adjustment to compensate. */ - -#define prefetch_lines 5 - -#ifdef USE_VFP - .macro cpy_line_vfp vreg, base - vstr \vreg, [dst, #\base] - vldr \vreg, [src, #\base] - vstr d0, [dst, #\base + 8] - vldr d0, [src, #\base + 8] - vstr d1, [dst, #\base + 16] - vldr d1, [src, #\base + 16] - vstr d2, [dst, #\base + 24] - vldr d2, [src, #\base + 24] - vstr \vreg, [dst, #\base + 32] - vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] - vstr d0, [dst, #\base + 40] - vldr d0, [src, #\base + 40] - vstr d1, [dst, #\base + 48] - vldr d1, [src, #\base + 48] - vstr d2, [dst, #\base + 56] - vldr d2, [src, #\base + 56] - .endm - - .macro cpy_tail_vfp vreg, base - vstr \vreg, [dst, #\base] - vldr \vreg, [src, #\base] - vstr d0, [dst, #\base + 8] - vldr d0, [src, #\base + 8] - vstr d1, [dst, #\base + 16] - vldr d1, [src, #\base + 16] - vstr d2, [dst, #\base + 24] - vldr d2, [src, #\base + 24] - vstr \vreg, [dst, #\base + 32] - vstr d0, [dst, #\base + 40] - vldr d0, [src, #\base + 40] - vstr d1, [dst, #\base + 48] - vldr d1, [src, #\base + 48] - vstr d2, [dst, #\base + 56] - vldr d2, [src, #\base + 56] - .endm -#endif - - .p2align 6 -ENTRY(memcpy) - - mov dst, dstin /* Preserve dstin, we need to return it. */ - cmp count, #64 - bge .Lcpy_not_short - /* Deal with small copies quickly by dropping straight into the - exit block. */ - -.Ltail63unaligned: -#ifdef USE_NEON - /* These need an extra layer of macro just to work around a - bug in the assembler's parser when an operand starts with - a {...}. http://sourceware.org/bugzilla/show_bug.cgi?id=15647 - tracks that bug; it was not fixed as of binutils-2.23.2. */ - .macro neon_load_d0 reg - vld1.8 {d0}, [\reg]! - .endm - .macro neon_store_d0 reg - vst1.8 {d0}, [\reg]! 
- .endm - - and tmp1, count, #0x38 - .macro dispatch_step i - neon_load_d0 src - neon_store_d0 dst - .endm - dispatch_7_dword - - tst count, #4 - ldrne tmp1, [src], #4 - strne tmp1, [dst], #4 -#else - /* Copy up to 15 full words of data. May not be aligned. */ - /* Cannot use VFP for unaligned data. */ - and tmp1, count, #0x3c - add dst, dst, tmp1 - add src, src, tmp1 - /* Jump directly into the sequence below at the correct offset. */ - .macro dispatch_step i - ldr tmp1, [src, #-(\i * 4)] - str tmp1, [dst, #-(\i * 4)] - .endm - dispatch_15_word -#endif - - lsls count, count, #31 - ldrhcs tmp1, [src], #2 - ldrbne src, [src] /* Src is dead, use as a scratch. */ - strhcs tmp1, [dst], #2 - strbne src, [dst] - bx lr - -.Lcpy_not_short: - /* At least 64 bytes to copy, but don't know the alignment yet. */ - str tmp2, [sp, #-FRAME_SIZE]! - cfi_adjust_cfa_offset (FRAME_SIZE) - cfi_rel_offset (tmp2, 0) - cfi_remember_state - and tmp2, src, #7 - and tmp1, dst, #7 - cmp tmp1, tmp2 - bne .Lcpy_notaligned - -#ifdef USE_VFP - /* Magic dust alert! Force VFP on Cortex-A9. Experiments show - that the FP pipeline is much better at streaming loads and - stores. This is outside the critical loop. */ - vmov.f32 s0, s0 -#endif - - /* SRC and DST have the same mutual 64-bit alignment, but we may - still need to pre-copy some bytes to get to natural alignment. - We bring SRC and DST into full 64-bit alignment. */ - lsls tmp2, dst, #29 - beq 1f - rsbs tmp2, tmp2, #0 - sub count, count, tmp2, lsr #29 - ldrmi tmp1, [src], #4 - strmi tmp1, [dst], #4 - lsls tmp2, tmp2, #2 - ldrhcs tmp1, [src], #2 - ldrbne tmp2, [src], #1 - strhcs tmp1, [dst], #2 - strbne tmp2, [dst], #1 - -1: - subs tmp2, count, #64 /* Use tmp2 for count. */ - blt .Ltail63aligned - - cmp tmp2, #512 - bge .Lcpy_body_long - -.Lcpy_body_medium: /* Count in tmp2. */ -#ifdef USE_VFP -1: - vldr d0, [src, #0] - subs tmp2, tmp2, #64 - vldr d1, [src, #8] - vstr d0, [dst, #0] - vldr d0, [src, #16] - vstr d1, [dst, #8] - vldr d1, [src, #24] - vstr d0, [dst, #16] - vldr d0, [src, #32] - vstr d1, [dst, #24] - vldr d1, [src, #40] - vstr d0, [dst, #32] - vldr d0, [src, #48] - vstr d1, [dst, #40] - vldr d1, [src, #56] - vstr d0, [dst, #48] - add src, src, #64 - vstr d1, [dst, #56] - add dst, dst, #64 - bge 1b - tst tmp2, #0x3f - beq .Ldone - -.Ltail63aligned: /* Count in tmp2. */ - and tmp1, tmp2, #0x38 - add dst, dst, tmp1 - add src, src, tmp1 - .macro dispatch_step i - vldr d0, [src, #-(\i * 8)] - vstr d0, [dst, #-(\i * 8)] - .endm - dispatch_7_dword -#else - sub src, src, #8 - sub dst, dst, #8 -1: - ldrd A_l, A_h, [src, #8] - strd A_l, A_h, [dst, #8] - ldrd A_l, A_h, [src, #16] - strd A_l, A_h, [dst, #16] - ldrd A_l, A_h, [src, #24] - strd A_l, A_h, [dst, #24] - ldrd A_l, A_h, [src, #32] - strd A_l, A_h, [dst, #32] - ldrd A_l, A_h, [src, #40] - strd A_l, A_h, [dst, #40] - ldrd A_l, A_h, [src, #48] - strd A_l, A_h, [dst, #48] - ldrd A_l, A_h, [src, #56] - strd A_l, A_h, [dst, #56] - ldrd A_l, A_h, [src, #64]! - strd A_l, A_h, [dst, #64]! - subs tmp2, tmp2, #64 - bge 1b - tst tmp2, #0x3f - bne 1f - ldr tmp2,[sp], #FRAME_SIZE - cfi_adjust_cfa_offset (-FRAME_SIZE) - cfi_restore (tmp2) - bx lr - - cfi_restore_state - cfi_remember_state -1: - add src, src, #8 - add dst, dst, #8 - -.Ltail63aligned: /* Count in tmp2. */ - /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but - we know that the src and dest are 64-bit aligned so we can use - LDRD/STRD to improve efficiency. */ - /* TMP2 is now negative, but we don't care about that. 
The bottom - six bits still tell us how many bytes are left to copy. */ - - and tmp1, tmp2, #0x38 - add dst, dst, tmp1 - add src, src, tmp1 - .macro dispatch_step i - ldrd A_l, A_h, [src, #-(\i * 8)] - strd A_l, A_h, [dst, #-(\i * 8)] - .endm - dispatch_7_dword -#endif - - tst tmp2, #4 - ldrne tmp1, [src], #4 - strne tmp1, [dst], #4 - lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ - ldrhcs tmp1, [src], #2 - ldrbne tmp2, [src] - strhcs tmp1, [dst], #2 - strbne tmp2, [dst] - -.Ldone: - ldr tmp2, [sp], #FRAME_SIZE - cfi_adjust_cfa_offset (-FRAME_SIZE) - cfi_restore (tmp2) - bx lr - - cfi_restore_state - cfi_remember_state - -.Lcpy_body_long: /* Count in tmp2. */ - - /* Long copy. We know that there's at least (prefetch_lines * 64) - bytes to go. */ -#ifdef USE_VFP - /* Don't use PLD. Instead, read some data in advance of the current - copy position into a register. This should act like a PLD - operation but we won't have to repeat the transfer. */ - - vldr d3, [src, #0] - vldr d4, [src, #64] - vldr d5, [src, #128] - vldr d6, [src, #192] - vldr d7, [src, #256] - - vldr d0, [src, #8] - vldr d1, [src, #16] - vldr d2, [src, #24] - add src, src, #32 - - subs tmp2, tmp2, #prefetch_lines * 64 * 2 - blt 2f -1: - cpy_line_vfp d3, 0 - cpy_line_vfp d4, 64 - cpy_line_vfp d5, 128 - add dst, dst, #3 * 64 - add src, src, #3 * 64 - cpy_line_vfp d6, 0 - cpy_line_vfp d7, 64 - add dst, dst, #2 * 64 - add src, src, #2 * 64 - subs tmp2, tmp2, #prefetch_lines * 64 - bge 1b - -2: - cpy_tail_vfp d3, 0 - cpy_tail_vfp d4, 64 - cpy_tail_vfp d5, 128 - add src, src, #3 * 64 - add dst, dst, #3 * 64 - cpy_tail_vfp d6, 0 - vstr d7, [dst, #64] - vldr d7, [src, #64] - vstr d0, [dst, #64 + 8] - vldr d0, [src, #64 + 8] - vstr d1, [dst, #64 + 16] - vldr d1, [src, #64 + 16] - vstr d2, [dst, #64 + 24] - vldr d2, [src, #64 + 24] - vstr d7, [dst, #64 + 32] - add src, src, #96 - vstr d0, [dst, #64 + 40] - vstr d1, [dst, #64 + 48] - vstr d2, [dst, #64 + 56] - add dst, dst, #128 - add tmp2, tmp2, #prefetch_lines * 64 - b .Lcpy_body_medium -#else - /* Long copy. Use an SMS style loop to maximize the I/O - bandwidth of the core. We don't have enough spare registers - to synthesise prefetching, so use PLD operations. */ - /* Pre-bias src and dst. */ - sub src, src, #8 - sub dst, dst, #8 - pld [src, #8] - pld [src, #72] - subs tmp2, tmp2, #64 - pld [src, #136] - ldrd A_l, A_h, [src, #8] - strd B_l, B_h, [sp, #8] - cfi_rel_offset (B_l, 8) - cfi_rel_offset (B_h, 12) - ldrd B_l, B_h, [src, #16] - strd C_l, C_h, [sp, #16] - cfi_rel_offset (C_l, 16) - cfi_rel_offset (C_h, 20) - ldrd C_l, C_h, [src, #24] - strd D_l, D_h, [sp, #24] - cfi_rel_offset (D_l, 24) - cfi_rel_offset (D_h, 28) - pld [src, #200] - ldrd D_l, D_h, [src, #32]! - b 1f - .p2align 6 -2: - pld [src, #232] - strd A_l, A_h, [dst, #40] - ldrd A_l, A_h, [src, #40] - strd B_l, B_h, [dst, #48] - ldrd B_l, B_h, [src, #48] - strd C_l, C_h, [dst, #56] - ldrd C_l, C_h, [src, #56] - strd D_l, D_h, [dst, #64]! - ldrd D_l, D_h, [src, #64]! - subs tmp2, tmp2, #64 -1: - strd A_l, A_h, [dst, #8] - ldrd A_l, A_h, [src, #8] - strd B_l, B_h, [dst, #16] - ldrd B_l, B_h, [src, #16] - strd C_l, C_h, [dst, #24] - ldrd C_l, C_h, [src, #24] - strd D_l, D_h, [dst, #32] - ldrd D_l, D_h, [src, #32] - bcs 2b - /* Save the remaining bytes and restore the callee-saved regs. 
*/ - strd A_l, A_h, [dst, #40] - add src, src, #40 - strd B_l, B_h, [dst, #48] - ldrd B_l, B_h, [sp, #8] - cfi_restore (B_l) - cfi_restore (B_h) - strd C_l, C_h, [dst, #56] - ldrd C_l, C_h, [sp, #16] - cfi_restore (C_l) - cfi_restore (C_h) - strd D_l, D_h, [dst, #64] - ldrd D_l, D_h, [sp, #24] - cfi_restore (D_l) - cfi_restore (D_h) - add dst, dst, #72 - tst tmp2, #0x3f - bne .Ltail63aligned - ldr tmp2, [sp], #FRAME_SIZE - cfi_adjust_cfa_offset (-FRAME_SIZE) - cfi_restore (tmp2) - bx lr -#endif - - cfi_restore_state - cfi_remember_state - -.Lcpy_notaligned: - pld [src, #0] - pld [src, #64] - /* There's at least 64 bytes to copy, but there is no mutual - alignment. */ - /* Bring DST to 64-bit alignment. */ - lsls tmp2, dst, #29 - pld [src, #(2 * 64)] - beq 1f - rsbs tmp2, tmp2, #0 - sub count, count, tmp2, lsr #29 - ldrmi tmp1, [src], #4 - strmi tmp1, [dst], #4 - lsls tmp2, tmp2, #2 - ldrbne tmp1, [src], #1 - ldrhcs tmp2, [src], #2 - strbne tmp1, [dst], #1 - strhcs tmp2, [dst], #2 -1: - pld [src, #(3 * 64)] - subs count, count, #64 - ldrmi tmp2, [sp], #FRAME_SIZE - bmi .Ltail63unaligned - pld [src, #(4 * 64)] - -#ifdef USE_NEON - /* These need an extra layer of macro just to work around a - bug in the assembler's parser when an operand starts with - a {...}. */ - .macro neon_load_multi reglist, basereg - vld1.8 {\reglist}, [\basereg]! - .endm - .macro neon_store_multi reglist, basereg - vst1.8 {\reglist}, [ALIGN (\basereg, 64)]! - .endm - - neon_load_multi d0-d3, src - neon_load_multi d4-d7, src - subs count, count, #64 - bmi 2f -1: - pld [src, #(4 * 64)] - neon_store_multi d0-d3, dst - neon_load_multi d0-d3, src - neon_store_multi d4-d7, dst - neon_load_multi d4-d7, src - subs count, count, #64 - bpl 1b -2: - neon_store_multi d0-d3, dst - neon_store_multi d4-d7, dst - ands count, count, #0x3f -#else - /* Use an SMS style loop to maximize the I/O bandwidth. */ - sub src, src, #4 - sub dst, dst, #8 - subs tmp2, count, #64 /* Use tmp2 for count. */ - ldr A_l, [src, #4] - ldr A_h, [src, #8] - strd B_l, B_h, [sp, #8] - cfi_rel_offset (B_l, 8) - cfi_rel_offset (B_h, 12) - ldr B_l, [src, #12] - ldr B_h, [src, #16] - strd C_l, C_h, [sp, #16] - cfi_rel_offset (C_l, 16) - cfi_rel_offset (C_h, 20) - ldr C_l, [src, #20] - ldr C_h, [src, #24] - strd D_l, D_h, [sp, #24] - cfi_rel_offset (D_l, 24) - cfi_rel_offset (D_h, 28) - ldr D_l, [src, #28] - ldr D_h, [src, #32]! - b 1f - .p2align 6 -2: - pld [src, #(5 * 64) - (32 - 4)] - strd A_l, A_h, [dst, #40] - ldr A_l, [src, #36] - ldr A_h, [src, #40] - strd B_l, B_h, [dst, #48] - ldr B_l, [src, #44] - ldr B_h, [src, #48] - strd C_l, C_h, [dst, #56] - ldr C_l, [src, #52] - ldr C_h, [src, #56] - strd D_l, D_h, [dst, #64]! - ldr D_l, [src, #60] - ldr D_h, [src, #64]! - subs tmp2, tmp2, #64 -1: - strd A_l, A_h, [dst, #8] - ldr A_l, [src, #4] - ldr A_h, [src, #8] - strd B_l, B_h, [dst, #16] - ldr B_l, [src, #12] - ldr B_h, [src, #16] - strd C_l, C_h, [dst, #24] - ldr C_l, [src, #20] - ldr C_h, [src, #24] - strd D_l, D_h, [dst, #32] - ldr D_l, [src, #28] - ldr D_h, [src, #32] - bcs 2b - - /* Save the remaining bytes and restore the callee-saved regs. 
*/ - strd A_l, A_h, [dst, #40] - add src, src, #36 - strd B_l, B_h, [dst, #48] - ldrd B_l, B_h, [sp, #8] - cfi_restore (B_l) - cfi_restore (B_h) - strd C_l, C_h, [dst, #56] - ldrd C_l, C_h, [sp, #16] - cfi_restore (C_l) - cfi_restore (C_h) - strd D_l, D_h, [dst, #64] - ldrd D_l, D_h, [sp, #24] - cfi_restore (D_l) - cfi_restore (D_h) - add dst, dst, #72 - ands count, tmp2, #0x3f -#endif - ldr tmp2, [sp], #FRAME_SIZE - cfi_adjust_cfa_offset (-FRAME_SIZE) - cfi_restore (tmp2) - bne .Ltail63unaligned - bx lr - -END(memcpy) -libc_hidden_builtin_def (memcpy) diff --git a/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/sysdeps/arm/armv7/multiarch/memcpy_neon.S deleted file mode 100644 index e60d1cc0e1..0000000000 --- a/sysdeps/arm/armv7/multiarch/memcpy_neon.S +++ /dev/null @@ -1,9 +0,0 @@ -#ifdef __ARM_NEON__ -/* Under __ARM_NEON__, this file defines memcpy directly. */ -libc_hidden_builtin_def (memcpy) -#else -# define memcpy __memcpy_neon -#endif - -#define MEMCPY_NEON -#include "memcpy_impl.S" diff --git a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S deleted file mode 100644 index e008c041ed..0000000000 --- a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S +++ /dev/null @@ -1,7 +0,0 @@ -/* Under __ARM_NEON__, memcpy_neon.S defines memcpy directly - and the __memcpy_vfp code will never be used. */ -#ifndef __ARM_NEON__ -# define MEMCPY_VFP -# define memcpy __memcpy_vfp -# include "memcpy_impl.S" -#endif diff --git a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S deleted file mode 100644 index 25d055754e..0000000000 --- a/sysdeps/arm/armv7/strcmp.S +++ /dev/null @@ -1,519 +0,0 @@ -/* strcmp implementation for ARMv7-A, optimized for Cortex-A15. - Copyright (C) 2012-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -#include <arm-features.h> -#include <sysdep.h> - -/* Implementation of strcmp for ARMv7 when DSP instructions are - available. Use ldrd to support wider loads, provided the data - is sufficiently aligned. Use saturating arithmetic to optimize - the compares. */ - -/* Build Options: - STRCMP_PRECHECK: Run a quick pre-check of the first byte in the - string. If comparing completely random strings the pre-check will - save time, since there is a very high probability of a mismatch in - the first character: we save significant overhead if this is the - common case. However, if strings are likely to be identical (e.g. - because we're verifying a hit in a hash table), then this check - is largely redundant. 
*/ - -#define STRCMP_PRECHECK 1 - - .syntax unified - -#ifdef __ARM_BIG_ENDIAN -# define S2LO lsl -# define S2LOEQ lsleq -# define S2HI lsr -# define MSB 0x000000ff -# define LSB 0xff000000 -# define BYTE0_OFFSET 24 -# define BYTE1_OFFSET 16 -# define BYTE2_OFFSET 8 -# define BYTE3_OFFSET 0 -#else /* not __ARM_BIG_ENDIAN */ -# define S2LO lsr -# define S2LOEQ lsreq -# define S2HI lsl -# define BYTE0_OFFSET 0 -# define BYTE1_OFFSET 8 -# define BYTE2_OFFSET 16 -# define BYTE3_OFFSET 24 -# define MSB 0xff000000 -# define LSB 0x000000ff -#endif /* not __ARM_BIG_ENDIAN */ - -/* Parameters and result. */ -#define src1 r0 -#define src2 r1 -#define result r0 /* Overlaps src1. */ - -/* Internal variables. */ -#define tmp1 r4 -#define tmp2 r5 -#define const_m1 r12 - -/* Additional internal variables for 64-bit aligned data. */ -#define data1a r2 -#define data1b r3 -#define data2a r6 -#define data2b r7 -#define syndrome_a tmp1 -#define syndrome_b tmp2 - -/* Additional internal variables for 32-bit aligned data. */ -#define data1 r2 -#define data2 r3 -#define syndrome tmp2 - - -#ifndef NO_THUMB -/* This code is best on Thumb. */ - .thumb - -/* In Thumb code we can't use MVN with a register shift, but we do have ORN. */ -.macro prepare_mask mask_reg, nbits_reg - S2HI \mask_reg, const_m1, \nbits_reg -.endm -.macro apply_mask data_reg, mask_reg - orn \data_reg, \data_reg, \mask_reg -.endm -#else -/* In ARM code we don't have ORN, but we can use MVN with a register shift. */ -.macro prepare_mask mask_reg, nbits_reg - mvn \mask_reg, const_m1, S2HI \nbits_reg -.endm -.macro apply_mask data_reg, mask_reg - orr \data_reg, \data_reg, \mask_reg -.endm - -/* These clobber the condition codes, which the real Thumb cbz/cbnz - instructions do not. But it doesn't matter for any of the uses here. */ -.macro cbz reg, label - cmp \reg, #0 - beq \label -.endm -.macro cbnz reg, label - cmp \reg, #0 - bne \label -.endm -#endif - - - /* Macro to compute and return the result value for word-aligned - cases. */ - .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 -#ifdef __ARM_BIG_ENDIAN - /* If data1 contains a zero byte, then syndrome will contain a 1 in - bit 7 of that byte. Otherwise, the highest set bit in the - syndrome will highlight the first different bit. It is therefore - sufficient to extract the eight bits starting with the syndrome - bit. */ - clz tmp1, \synd - lsl r1, \d2, tmp1 - .if \restore_r6 - ldrd r6, r7, [sp, #8] - .endif - lsl \d1, \d1, tmp1 - lsr result, \d1, #24 - ldrd r4, r5, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - cfi_restore (r6) - cfi_restore (r7) - sub result, result, r1, lsr #24 - bx lr -#else - /* To use the big-endian trick we'd have to reverse all three words. - that's slower than this approach. */ - rev \synd, \synd - clz tmp1, \synd - bic tmp1, tmp1, #7 - lsr r1, \d2, tmp1 - .if \restore_r6 - ldrd r6, r7, [sp, #8] - .endif - lsr \d1, \d1, tmp1 - and result, \d1, #255 - and r1, r1, #255 - ldrd r4, r5, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - cfi_restore (r6) - cfi_restore (r7) - sub result, result, r1 - - bx lr -#endif - .endm - - .text - .p2align 5 -.Lstrcmp_start_addr: -#if STRCMP_PRECHECK == 1 -.Lfastpath_exit: - sub r0, r2, r3 - bx lr - nop -#endif -ENTRY (strcmp) -#if STRCMP_PRECHECK == 1 - ldrb r2, [src1] - ldrb r3, [src2] - cmp r2, #1 - it cs - cmpcs r2, r3 - bne .Lfastpath_exit -#endif - strd r4, r5, [sp, #-16]! 
- cfi_def_cfa_offset (16) - cfi_offset (r4, -16) - cfi_offset (r5, -12) - orr tmp1, src1, src2 - strd r6, r7, [sp, #8] - cfi_offset (r6, -8) - cfi_offset (r7, -4) - mvn const_m1, #0 - lsl r2, tmp1, #29 - cbz r2, .Lloop_aligned8 - -.Lnot_aligned: - eor tmp1, src1, src2 - tst tmp1, #7 - bne .Lmisaligned8 - - /* Deal with mutual misalignment by aligning downwards and then - masking off the unwanted loaded data to prevent a difference. */ - and tmp1, src1, #7 - bic src1, src1, #7 - and tmp2, tmp1, #3 - bic src2, src2, #7 - lsl tmp2, tmp2, #3 /* Bytes -> bits. */ - ldrd data1a, data1b, [src1], #16 - tst tmp1, #4 - ldrd data2a, data2b, [src2], #16 - prepare_mask tmp1, tmp2 - apply_mask data1a, tmp1 - apply_mask data2a, tmp1 - beq .Lstart_realigned8 - apply_mask data1b, tmp1 - mov data1a, const_m1 - apply_mask data2b, tmp1 - mov data2a, const_m1 - b .Lstart_realigned8 - - /* Unwind the inner loop by a factor of 2, giving 16 bytes per - pass. */ - .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ - .p2align 2 /* Always word aligned. */ -.Lloop_aligned8: - ldrd data1a, data1b, [src1], #16 - ldrd data2a, data2b, [src2], #16 -.Lstart_realigned8: - uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ - eor syndrome_a, data1a, data2a - sel syndrome_a, syndrome_a, const_m1 - cbnz syndrome_a, .Ldiff_in_a - uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ - eor syndrome_b, data1b, data2b - sel syndrome_b, syndrome_b, const_m1 - cbnz syndrome_b, .Ldiff_in_b - - ldrd data1a, data1b, [src1, #-8] - ldrd data2a, data2b, [src2, #-8] - uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ - eor syndrome_a, data1a, data2a - sel syndrome_a, syndrome_a, const_m1 - uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ - eor syndrome_b, data1b, data2b - sel syndrome_b, syndrome_b, const_m1 - /* Can't use CBZ for backwards branch. */ - orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ - beq .Lloop_aligned8 - -.Ldiff_found: - cbnz syndrome_a, .Ldiff_in_a - -.Ldiff_in_b: - strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 - -.Ldiff_in_a: - cfi_restore_state - strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 - - cfi_restore_state -.Lmisaligned8: - tst tmp1, #3 - bne .Lmisaligned4 - ands tmp1, src1, #3 - bne .Lmutual_align4 - - /* Unrolled by a factor of 2, to reduce the number of post-increment - operations. */ -.Lloop_aligned4: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned4: - uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ - eor syndrome, data1, data2 - sel syndrome, syndrome, const_m1 - cbnz syndrome, .Laligned4_done - ldr data1, [src1, #-4] - ldr data2, [src2, #-4] - uadd8 syndrome, data1, const_m1 - eor syndrome, data1, data2 - sel syndrome, syndrome, const_m1 - cmp syndrome, #0 - beq .Lloop_aligned4 - -.Laligned4_done: - strcmp_epilogue_aligned syndrome, data1, data2, 0 - -.Lmutual_align4: - cfi_restore_state - /* Deal with mutual misalignment by aligning downwards and then - masking off the unwanted loaded data to prevent a difference. */ - lsl tmp1, tmp1, #3 /* Bytes -> bits. 
*/ - bic src1, src1, #3 - ldr data1, [src1], #8 - bic src2, src2, #3 - ldr data2, [src2], #8 - - prepare_mask tmp1, tmp1 - apply_mask data1, tmp1 - apply_mask data2, tmp1 - b .Lstart_realigned4 - -.Lmisaligned4: - ands tmp1, src1, #3 - beq .Lsrc1_aligned - sub src2, src2, tmp1 - bic src1, src1, #3 - lsls tmp1, tmp1, #31 - ldr data1, [src1], #4 - beq .Laligned_m2 - bcs .Laligned_m1 - -#if STRCMP_PRECHECK == 0 - ldrb data2, [src2, #1] - uxtb tmp1, data1, ror #BYTE1_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit - -.Laligned_m2: - ldrb data2, [src2, #2] - uxtb tmp1, data1, ror #BYTE2_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit - -.Laligned_m1: - ldrb data2, [src2, #3] - uxtb tmp1, data1, ror #BYTE3_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - add src2, src2, #4 - cbnz data2, .Lsrc1_aligned -#else /* STRCMP_PRECHECK */ - /* If we've done the pre-check, then we don't need to check the - first byte again here. */ - ldrb data2, [src2, #2] - uxtb tmp1, data1, ror #BYTE2_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit - -.Laligned_m2: - ldrb data2, [src2, #3] - uxtb tmp1, data1, ror #BYTE3_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbnz data2, .Laligned_m1 -#endif - -.Lmisaligned_exit: - mov result, tmp1 - ldr r4, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - cfi_restore (r6) - cfi_restore (r7) - bx lr - -#if STRCMP_PRECHECK == 1 -.Laligned_m1: - add src2, src2, #4 -#endif -.Lsrc1_aligned: - cfi_restore_state - /* src1 is word aligned, but src2 has no common alignment - with it. */ - ldr data1, [src1], #4 - lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ - - bic src2, src2, #3 - ldr data2, [src2], #4 - bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ - bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ - - /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ -.Loverlap3: - bic tmp1, data1, #MSB - uadd8 syndrome, data1, const_m1 - eors syndrome, tmp1, data2, S2LO #8 - sel syndrome, syndrome, const_m1 - bne 4f - cbnz syndrome, 5f - ldr data2, [src2], #4 - eor tmp1, tmp1, data1 - cmp tmp1, data2, S2HI #24 - bne 6f - ldr data1, [src1], #4 - b .Loverlap3 -4: - S2LO data2, data2, #8 - b .Lstrcmp_tail - -5: - bics syndrome, syndrome, #MSB - bne .Lstrcmp_done_equal - - /* We can only get here if the MSB of data1 contains 0, so - fast-path the exit. */ - ldrb result, [src2] - ldrd r4, r5, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - /* R6/7 Not used in this sequence. */ - cfi_restore (r6) - cfi_restore (r7) - neg result, result - bx lr - -6: - cfi_restore_state - S2LO data1, data1, #24 - and data2, data2, #LSB - b .Lstrcmp_tail - - .p2align 5,,12 /* Ensure at least 3 instructions in cache line. 
*/ -.Loverlap2: - and tmp1, data1, const_m1, S2LO #16 - uadd8 syndrome, data1, const_m1 - eors syndrome, tmp1, data2, S2LO #16 - sel syndrome, syndrome, const_m1 - bne 4f - cbnz syndrome, 5f - ldr data2, [src2], #4 - eor tmp1, tmp1, data1 - cmp tmp1, data2, S2HI #16 - bne 6f - ldr data1, [src1], #4 - b .Loverlap2 -4: - S2LO data2, data2, #16 - b .Lstrcmp_tail -5: - ands syndrome, syndrome, const_m1, S2LO #16 - bne .Lstrcmp_done_equal - - ldrh data2, [src2] - S2LO data1, data1, #16 -#ifdef __ARM_BIG_ENDIAN - lsl data2, data2, #16 -#endif - b .Lstrcmp_tail - -6: - S2LO data1, data1, #16 - and data2, data2, const_m1, S2LO #16 - b .Lstrcmp_tail - - .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ -.Loverlap1: - and tmp1, data1, #LSB - uadd8 syndrome, data1, const_m1 - eors syndrome, tmp1, data2, S2LO #24 - sel syndrome, syndrome, const_m1 - bne 4f - cbnz syndrome, 5f - ldr data2, [src2], #4 - eor tmp1, tmp1, data1 - cmp tmp1, data2, S2HI #8 - bne 6f - ldr data1, [src1], #4 - b .Loverlap1 -4: - S2LO data2, data2, #24 - b .Lstrcmp_tail -5: - tst syndrome, #LSB - bne .Lstrcmp_done_equal - ldr data2, [src2] -6: - S2LO data1, data1, #8 - bic data2, data2, #MSB - b .Lstrcmp_tail - -.Lstrcmp_done_equal: - mov result, #0 - ldrd r4, r5, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - /* R6/7 not used in this sequence. */ - cfi_restore (r6) - cfi_restore (r7) - bx lr - -.Lstrcmp_tail: - cfi_restore_state -#ifndef __ARM_BIG_ENDIAN - rev data1, data1 - rev data2, data2 - /* Now everything looks big-endian... */ -#endif - uadd8 tmp1, data1, const_m1 - eor tmp1, data1, data2 - sel syndrome, tmp1, const_m1 - clz tmp1, syndrome - lsl data1, data1, tmp1 - lsl data2, data2, tmp1 - lsr result, data1, #24 - ldrd r4, r5, [sp], #16 - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - /* R6/7 not used in this sequence. */ - cfi_restore (r6) - cfi_restore (r7) - sub result, result, data2, lsr #24 - bx lr -END (strcmp) -libc_hidden_builtin_def (strcmp) |
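
The strcmp.S removed above leans on the ARMv6 SIMD instructions uadd8 and sel: adding 0xffffffff byte-wise to a word of string data sets a GE flag for every non-zero byte lane, and sel then merges the per-byte XOR of the two strings' words with 0xff in the lanes that held a terminating NUL. The resulting "syndrome" is non-zero exactly when the word pair contains a difference or an end of string, and clz on the syndrome (byte-reversed first on little-endian) locates the first such byte. A portable C model of just the syndrome step (strcmp_syndrome is an illustrative name, not a symbol from the deleted file):

/* Each byte lane of the result is (b1 ^ b2) when b1 is non-zero, or 0xff
   when b1 is the terminating NUL, so a non-zero syndrome flags either a
   difference or end of string within one 4-byte chunk.  */
#include <inttypes.h>
#include <stdio.h>

static uint32_t
strcmp_syndrome (uint32_t data1, uint32_t data2)
{
  uint32_t syndrome = 0;
  for (int lane = 0; lane < 4; lane++)
    {
      uint32_t b1 = (data1 >> (8 * lane)) & 0xff;
      uint32_t b2 = (data2 >> (8 * lane)) & 0xff;
      /* uadd8 of data1 with 0xffffffff carries out of a lane exactly when
         the lane is non-zero, so sel keeps the XOR there and substitutes
         0xff for lanes holding a NUL.  */
      syndrome |= ((b1 != 0) ? (b1 ^ b2) : 0xffu) << (8 * lane);
    }
  return syndrome;
}

int
main (void)
{
  /* "abcd" vs "abce" as little-endian words: only the top lane differs.  */
  printf ("%08" PRIx32 "\n", strcmp_syndrome (0x64636261, 0x65636261));
  /* "ab" plus NUL padding: identical words, but the NUL lanes are flagged.  */
  printf ("%08" PRIx32 "\n", strcmp_syndrome (0x00006261, 0x00006261));
  return 0;
}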
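
Similarly, the dispatch_7_dword and dispatch_15_word macros in the deleted memcpy_impl.S finish a copy by computing a jump into a run of identical load/store steps, so that only as many steps execute as there are double-words or words left. A loose C analogue of the 7-step double-word tail, using switch fall-through in place of arithmetic on the PC (copy_dword_tail is a made-up name):

/* Sketch of the dispatch_7_dword idea: the low bits of the remaining count
   select how many 8-byte steps run, and the steps copy backwards from an
   advanced base, matching the "[src, #-(i * 8)]" addressing.  */
#include <stddef.h>
#include <string.h>

static void
copy_dword_tail (unsigned char *dst, const unsigned char *src, size_t count)
{
  size_t dwords = (count & 0x38) >> 3;   /* 0..7, like "and tmp1, count, #0x38" */
  dst += dwords * 8;                     /* advance, then index negatively */
  src += dwords * 8;

  switch (dwords)                        /* computed jump -> fall-through cases */
    {
    case 7: memcpy (dst - 56, src - 56, 8); /* fall through */
    case 6: memcpy (dst - 48, src - 48, 8); /* fall through */
    case 5: memcpy (dst - 40, src - 40, 8); /* fall through */
    case 4: memcpy (dst - 32, src - 32, 8); /* fall through */
    case 3: memcpy (dst - 24, src - 24, 8); /* fall through */
    case 2: memcpy (dst - 16, src - 16, 8); /* fall through */
    case 1: memcpy (dst - 8,  src - 8,  8); /* fall through */
    case 0: break;
    }
}

The 15-step word variant follows the same pattern with fifteen 4-byte cases; in the assembly, the last few bytes are then mopped up with conditional word, halfword and byte copies.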