Diffstat (limited to 'sysdeps/arm/armv7')
-rw-r--r--  sysdeps/arm/armv7/Implies                      |   2
-rw-r--r--  sysdeps/arm/armv7/multiarch/Makefile           |   3
-rw-r--r--  sysdeps/arm/armv7/multiarch/aeabi_memcpy.c     |   2
-rw-r--r--  sysdeps/arm/armv7/multiarch/ifunc-impl-list.c  |  56
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy.S           |  76
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_impl.S      | 728
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_neon.S      |   9
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_vfp.S       |   7
-rw-r--r--  sysdeps/arm/armv7/strcmp.S                     | 519
9 files changed, 0 insertions, 1402 deletions
diff --git a/sysdeps/arm/armv7/Implies b/sysdeps/arm/armv7/Implies
deleted file mode 100644
index c6cd0eb877..0000000000
--- a/sysdeps/arm/armv7/Implies
+++ /dev/null
@@ -1,2 +0,0 @@
-# We can do everything that 6T2 can
-arm/armv6t2
diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile
deleted file mode 100644
index e834cc937f..0000000000
--- a/sysdeps/arm/armv7/multiarch/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-ifeq ($(subdir),string)
-sysdep_routines += memcpy_neon memcpy_vfp
-endif
diff --git a/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c b/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
deleted file mode 100644
index c6a2a98a55..0000000000
--- a/sysdeps/arm/armv7/multiarch/aeabi_memcpy.c
+++ /dev/null
@@ -1,2 +0,0 @@
-/* Empty file to override sysdeps/arm version. See memcpy.S for definitions
- of these functions. */
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
deleted file mode 100644
index b8094fd393..0000000000
--- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Enumerate available IFUNC implementations of a function. ARM version.
- Copyright (C) 2013-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <stdbool.h>
-#include <string.h>
-#include <ldsodefs.h>
-#include <sysdep.h>
-#include <ifunc-impl-list.h>
-
-/* Fill ARRAY of MAX elements with IFUNC implementations for function
- NAME and return the number of valid entries. */
-
-size_t
-__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- size_t max)
-{
- size_t i = 0;
-
- bool use_neon = true;
-#ifdef __ARM_NEON__
-# define __memcpy_neon memcpy
-#else
- use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
-#endif
-
-#ifndef __ARM_NEON__
- bool use_vfp = true;
-# ifdef __SOFTFP__
- use_vfp = (GLRO(dl_hwcap) & HWCAP_ARM_VFP) != 0;
-# endif
-#endif
-
- IFUNC_IMPL (i, name, memcpy,
- IFUNC_IMPL_ADD (array, i, memcpy, use_neon, __memcpy_neon)
-#ifndef __ARM_NEON__
- IFUNC_IMPL_ADD (array, i, memcpy, use_vfp, __memcpy_vfp)
-#endif
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
-
- return i;
-}
diff --git a/sysdeps/arm/armv7/multiarch/memcpy.S b/sysdeps/arm/armv7/multiarch/memcpy.S
deleted file mode 100644
index 8a53bdaf91..0000000000
--- a/sysdeps/arm/armv7/multiarch/memcpy.S
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Multiple versions of memcpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2013-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Thumb requires excess IT instructions here. */
-#define NO_THUMB
-#include <sysdep.h>
-#include <rtld-global-offsets.h>
-
-#if IS_IN (libc)
-/* Under __ARM_NEON__, memcpy_neon.S defines the name memcpy. */
-# ifndef __ARM_NEON__
- .text
-ENTRY(memcpy)
- .type memcpy, %gnu_indirect_function
-# ifdef __SOFTFP__
- ldr r1, .Lmemcpy_arm
- tst r0, #HWCAP_ARM_VFP
- ldrne r1, .Lmemcpy_vfp
-# else
- ldr r1, .Lmemcpy_vfp
-# endif
- tst r0, #HWCAP_ARM_NEON
- ldrne r1, .Lmemcpy_neon
-1:
- add r0, r1, pc
- DO_RET(lr)
-
-# ifdef __SOFTFP__
-.Lmemcpy_arm:
- .long C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS
-# endif
-.Lmemcpy_neon:
- .long C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS
-.Lmemcpy_vfp:
- .long C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS
-
-END(memcpy)
-
-libc_hidden_builtin_def (memcpy)
-#endif /* Not __ARM_NEON__. */
-
-/* These versions of memcpy are defined not to clobber any VFP or NEON
- registers so they must always call the ARM variant of the memcpy code. */
-strong_alias (__memcpy_arm, __aeabi_memcpy)
-strong_alias (__memcpy_arm, __aeabi_memcpy4)
-strong_alias (__memcpy_arm, __aeabi_memcpy8)
-libc_hidden_def (__memcpy_arm)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-#undef weak_alias
-#define weak_alias(x, y)
-#undef libc_hidden_def
-#define libc_hidden_def(name)
-
-#define memcpy __memcpy_arm
-
-#endif
-
-#include "memcpy_impl.S"
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
deleted file mode 100644
index c1b9fb0ab5..0000000000
--- a/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+++ /dev/null
@@ -1,728 +0,0 @@
-/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
- Copyright (C) 2013-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>.
-
- This memcpy routine is optimised for Cortex-A15 cores and takes advantage
- of VFP or NEON when built with the appropriate flags.
-
- Assumptions:
-
- ARMv6 (ARMv7-a if using Neon)
- ARM state
- Unaligned accesses
-
- */
-
-/* Thumb cannot encode negative immediate offsets in memory operations. */
-#ifndef NO_THUMB
-#define NO_THUMB
-#endif
-#include <sysdep.h>
-#include <arm-features.h>
-
- .syntax unified
- /* This implementation requires ARM state. */
- .arm
-
-#ifdef MEMCPY_NEON
-
- .fpu neon
- .arch armv7-a
-# define FRAME_SIZE 4
-# define USE_VFP
-# define USE_NEON
-
-#elif defined (MEMCPY_VFP)
-
- .arch armv6
- .fpu vfpv2
-# define FRAME_SIZE 32
-# define USE_VFP
-
-#else
- .arch armv6
-# define FRAME_SIZE 32
-
-#endif
-
-#define ALIGN(addr, align) addr:align
-
-#define INSN_SIZE 4
-
-/* Call parameters. */
-#define dstin r0
-#define src r1
-#define count r2
-
-/* Locals. */
-#define tmp1 r3
-#define dst ip
-#define tmp2 r8
-
-/* These two macros both work by repeated invocation of the macro
- dispatch_step (not defined here). That macro performs one "step",
- doing one load instruction and one store instruction to copy one
- "unit". On entry, TMP1 contains the number of bytes to be copied,
- a multiple of the unit size. The macro clobbers TMP1 in the
- process of doing a computed jump to the tail containing the
- appropriate number of steps.
-
- In dispatch_7_dword, dispatch_step is invoked seven times, with an
- argument that is 7 for the first and 1 for the last. Units are
- double-words (8 bytes). TMP1 is at most 56.
-
- In dispatch_15_word, dispatch_step is invoked fifteen times,
- with an argument that is 15 for the first and 1 for the last.
- Units are words (4 bytes). TMP1 is at most 60. */
-
-#ifndef ARM_ALWAYS_BX
-# if ARM_BX_ALIGN_LOG2 != 2
-# error case not handled
-# endif
- .macro dispatch_7_dword
- rsb tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
- add pc, pc, tmp1
- dispatch_step 7
- dispatch_step 6
- dispatch_step 5
- dispatch_step 4
- dispatch_step 3
- dispatch_step 2
- dispatch_step 1
- .purgem dispatch_step
- .endm
-
- .macro dispatch_15_word
- rsb tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
- add pc, pc, tmp1, lsl #1
- dispatch_step 15
- dispatch_step 14
- dispatch_step 13
- dispatch_step 12
- dispatch_step 11
- dispatch_step 10
- dispatch_step 9
- dispatch_step 8
- dispatch_step 7
- dispatch_step 6
- dispatch_step 5
- dispatch_step 4
- dispatch_step 3
- dispatch_step 2
- dispatch_step 1
- .purgem dispatch_step
- .endm
-#else
-# if ARM_BX_ALIGN_LOG2 < 3
-# error case not handled
-# endif
- .macro dispatch_helper steps, log2_bytes_per_step
- /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
- (STEPS << LOG2_BYTES_PER_STEP).
- So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
- Then it needs further adjustment to compensate for the
- distance between the PC value taken below (0f + PC_OFS)
- and the first step's instructions (1f). */
- rsb tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
- + ((1f - PC_OFS - 0f) \
- >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
- /* Shifting down LOG2_BYTES_PER_STEP gives us the number of
- steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
- the (byte) distance to add to the PC. */
-0: add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
- bx tmp1
- .p2align ARM_BX_ALIGN_LOG2
-1:
- .endm
-
- .macro dispatch_7_dword
- dispatch_helper 7, 3
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 7
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 6
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 5
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 4
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 3
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 2
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 1
- .p2align ARM_BX_ALIGN_LOG2
- .purgem dispatch_step
- .endm
-
- .macro dispatch_15_word
- dispatch_helper 15, 2
- dispatch_step 15
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 14
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 13
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 12
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 11
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 10
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 9
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 8
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 7
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 6
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 5
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 4
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 3
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 2
- .p2align ARM_BX_ALIGN_LOG2
- dispatch_step 1
- .p2align ARM_BX_ALIGN_LOG2
- .purgem dispatch_step
- .endm
-
-#endif
-
-#ifndef USE_NEON
-/* For bulk copies using GP registers. */
-#define A_l r2 /* Call-clobbered. */
-#define A_h r3 /* Call-clobbered. */
-#define B_l r4
-#define B_h r5
-#define C_l r6
-#define C_h r7
-/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved. */
-#define D_l r10
-#define D_h r11
-#endif
-
-/* Number of lines ahead to pre-fetch data. If you change this the code
- below will need adjustment to compensate. */
-
-#define prefetch_lines 5
-
-#ifdef USE_VFP
- .macro cpy_line_vfp vreg, base
- vstr \vreg, [dst, #\base]
- vldr \vreg, [src, #\base]
- vstr d0, [dst, #\base + 8]
- vldr d0, [src, #\base + 8]
- vstr d1, [dst, #\base + 16]
- vldr d1, [src, #\base + 16]
- vstr d2, [dst, #\base + 24]
- vldr d2, [src, #\base + 24]
- vstr \vreg, [dst, #\base + 32]
- vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
- vstr d0, [dst, #\base + 40]
- vldr d0, [src, #\base + 40]
- vstr d1, [dst, #\base + 48]
- vldr d1, [src, #\base + 48]
- vstr d2, [dst, #\base + 56]
- vldr d2, [src, #\base + 56]
- .endm
-
- .macro cpy_tail_vfp vreg, base
- vstr \vreg, [dst, #\base]
- vldr \vreg, [src, #\base]
- vstr d0, [dst, #\base + 8]
- vldr d0, [src, #\base + 8]
- vstr d1, [dst, #\base + 16]
- vldr d1, [src, #\base + 16]
- vstr d2, [dst, #\base + 24]
- vldr d2, [src, #\base + 24]
- vstr \vreg, [dst, #\base + 32]
- vstr d0, [dst, #\base + 40]
- vldr d0, [src, #\base + 40]
- vstr d1, [dst, #\base + 48]
- vldr d1, [src, #\base + 48]
- vstr d2, [dst, #\base + 56]
- vldr d2, [src, #\base + 56]
- .endm
-#endif
-
- .p2align 6
-ENTRY(memcpy)
-
- mov dst, dstin /* Preserve dstin, we need to return it. */
- cmp count, #64
- bge .Lcpy_not_short
- /* Deal with small copies quickly by dropping straight into the
- exit block. */
-
-.Ltail63unaligned:
-#ifdef USE_NEON
- /* These need an extra layer of macro just to work around a
- bug in the assembler's parser when an operand starts with
- a {...}. http://sourceware.org/bugzilla/show_bug.cgi?id=15647
- tracks that bug; it was not fixed as of binutils-2.23.2. */
- .macro neon_load_d0 reg
- vld1.8 {d0}, [\reg]!
- .endm
- .macro neon_store_d0 reg
- vst1.8 {d0}, [\reg]!
- .endm
-
- and tmp1, count, #0x38
- .macro dispatch_step i
- neon_load_d0 src
- neon_store_d0 dst
- .endm
- dispatch_7_dword
-
- tst count, #4
- ldrne tmp1, [src], #4
- strne tmp1, [dst], #4
-#else
- /* Copy up to 15 full words of data. May not be aligned. */
- /* Cannot use VFP for unaligned data. */
- and tmp1, count, #0x3c
- add dst, dst, tmp1
- add src, src, tmp1
- /* Jump directly into the sequence below at the correct offset. */
- .macro dispatch_step i
- ldr tmp1, [src, #-(\i * 4)]
- str tmp1, [dst, #-(\i * 4)]
- .endm
- dispatch_15_word
-#endif
-
- lsls count, count, #31
- ldrhcs tmp1, [src], #2
- ldrbne src, [src] /* Src is dead, use as a scratch. */
- strhcs tmp1, [dst], #2
- strbne src, [dst]
- bx lr
-
-.Lcpy_not_short:
- /* At least 64 bytes to copy, but don't know the alignment yet. */
- str tmp2, [sp, #-FRAME_SIZE]!
- cfi_adjust_cfa_offset (FRAME_SIZE)
- cfi_rel_offset (tmp2, 0)
- cfi_remember_state
- and tmp2, src, #7
- and tmp1, dst, #7
- cmp tmp1, tmp2
- bne .Lcpy_notaligned
-
-#ifdef USE_VFP
- /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
- that the FP pipeline is much better at streaming loads and
- stores. This is outside the critical loop. */
- vmov.f32 s0, s0
-#endif
-
- /* SRC and DST have the same mutual 64-bit alignment, but we may
- still need to pre-copy some bytes to get to natural alignment.
- We bring SRC and DST into full 64-bit alignment. */
- lsls tmp2, dst, #29
- beq 1f
- rsbs tmp2, tmp2, #0
- sub count, count, tmp2, lsr #29
- ldrmi tmp1, [src], #4
- strmi tmp1, [dst], #4
- lsls tmp2, tmp2, #2
- ldrhcs tmp1, [src], #2
- ldrbne tmp2, [src], #1
- strhcs tmp1, [dst], #2
- strbne tmp2, [dst], #1
-
-1:
- subs tmp2, count, #64 /* Use tmp2 for count. */
- blt .Ltail63aligned
-
- cmp tmp2, #512
- bge .Lcpy_body_long
-
-.Lcpy_body_medium: /* Count in tmp2. */
-#ifdef USE_VFP
-1:
- vldr d0, [src, #0]
- subs tmp2, tmp2, #64
- vldr d1, [src, #8]
- vstr d0, [dst, #0]
- vldr d0, [src, #16]
- vstr d1, [dst, #8]
- vldr d1, [src, #24]
- vstr d0, [dst, #16]
- vldr d0, [src, #32]
- vstr d1, [dst, #24]
- vldr d1, [src, #40]
- vstr d0, [dst, #32]
- vldr d0, [src, #48]
- vstr d1, [dst, #40]
- vldr d1, [src, #56]
- vstr d0, [dst, #48]
- add src, src, #64
- vstr d1, [dst, #56]
- add dst, dst, #64
- bge 1b
- tst tmp2, #0x3f
- beq .Ldone
-
-.Ltail63aligned: /* Count in tmp2. */
- and tmp1, tmp2, #0x38
- add dst, dst, tmp1
- add src, src, tmp1
- .macro dispatch_step i
- vldr d0, [src, #-(\i * 8)]
- vstr d0, [dst, #-(\i * 8)]
- .endm
- dispatch_7_dword
-#else
- sub src, src, #8
- sub dst, dst, #8
-1:
- ldrd A_l, A_h, [src, #8]
- strd A_l, A_h, [dst, #8]
- ldrd A_l, A_h, [src, #16]
- strd A_l, A_h, [dst, #16]
- ldrd A_l, A_h, [src, #24]
- strd A_l, A_h, [dst, #24]
- ldrd A_l, A_h, [src, #32]
- strd A_l, A_h, [dst, #32]
- ldrd A_l, A_h, [src, #40]
- strd A_l, A_h, [dst, #40]
- ldrd A_l, A_h, [src, #48]
- strd A_l, A_h, [dst, #48]
- ldrd A_l, A_h, [src, #56]
- strd A_l, A_h, [dst, #56]
- ldrd A_l, A_h, [src, #64]!
- strd A_l, A_h, [dst, #64]!
- subs tmp2, tmp2, #64
- bge 1b
- tst tmp2, #0x3f
- bne 1f
- ldr tmp2,[sp], #FRAME_SIZE
- cfi_adjust_cfa_offset (-FRAME_SIZE)
- cfi_restore (tmp2)
- bx lr
-
- cfi_restore_state
- cfi_remember_state
-1:
- add src, src, #8
- add dst, dst, #8
-
-.Ltail63aligned: /* Count in tmp2. */
- /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
- we know that the src and dest are 64-bit aligned so we can use
- LDRD/STRD to improve efficiency. */
- /* TMP2 is now negative, but we don't care about that. The bottom
- six bits still tell us how many bytes are left to copy. */
-
- and tmp1, tmp2, #0x38
- add dst, dst, tmp1
- add src, src, tmp1
- .macro dispatch_step i
- ldrd A_l, A_h, [src, #-(\i * 8)]
- strd A_l, A_h, [dst, #-(\i * 8)]
- .endm
- dispatch_7_dword
-#endif
-
- tst tmp2, #4
- ldrne tmp1, [src], #4
- strne tmp1, [dst], #4
- lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
- ldrhcs tmp1, [src], #2
- ldrbne tmp2, [src]
- strhcs tmp1, [dst], #2
- strbne tmp2, [dst]
-
-.Ldone:
- ldr tmp2, [sp], #FRAME_SIZE
- cfi_adjust_cfa_offset (-FRAME_SIZE)
- cfi_restore (tmp2)
- bx lr
-
- cfi_restore_state
- cfi_remember_state
-
-.Lcpy_body_long: /* Count in tmp2. */
-
- /* Long copy. We know that there's at least (prefetch_lines * 64)
- bytes to go. */
-#ifdef USE_VFP
- /* Don't use PLD. Instead, read some data in advance of the current
- copy position into a register. This should act like a PLD
- operation but we won't have to repeat the transfer. */
-
- vldr d3, [src, #0]
- vldr d4, [src, #64]
- vldr d5, [src, #128]
- vldr d6, [src, #192]
- vldr d7, [src, #256]
-
- vldr d0, [src, #8]
- vldr d1, [src, #16]
- vldr d2, [src, #24]
- add src, src, #32
-
- subs tmp2, tmp2, #prefetch_lines * 64 * 2
- blt 2f
-1:
- cpy_line_vfp d3, 0
- cpy_line_vfp d4, 64
- cpy_line_vfp d5, 128
- add dst, dst, #3 * 64
- add src, src, #3 * 64
- cpy_line_vfp d6, 0
- cpy_line_vfp d7, 64
- add dst, dst, #2 * 64
- add src, src, #2 * 64
- subs tmp2, tmp2, #prefetch_lines * 64
- bge 1b
-
-2:
- cpy_tail_vfp d3, 0
- cpy_tail_vfp d4, 64
- cpy_tail_vfp d5, 128
- add src, src, #3 * 64
- add dst, dst, #3 * 64
- cpy_tail_vfp d6, 0
- vstr d7, [dst, #64]
- vldr d7, [src, #64]
- vstr d0, [dst, #64 + 8]
- vldr d0, [src, #64 + 8]
- vstr d1, [dst, #64 + 16]
- vldr d1, [src, #64 + 16]
- vstr d2, [dst, #64 + 24]
- vldr d2, [src, #64 + 24]
- vstr d7, [dst, #64 + 32]
- add src, src, #96
- vstr d0, [dst, #64 + 40]
- vstr d1, [dst, #64 + 48]
- vstr d2, [dst, #64 + 56]
- add dst, dst, #128
- add tmp2, tmp2, #prefetch_lines * 64
- b .Lcpy_body_medium
-#else
- /* Long copy. Use an SMS style loop to maximize the I/O
- bandwidth of the core. We don't have enough spare registers
- to synthesise prefetching, so use PLD operations. */
- /* Pre-bias src and dst. */
- sub src, src, #8
- sub dst, dst, #8
- pld [src, #8]
- pld [src, #72]
- subs tmp2, tmp2, #64
- pld [src, #136]
- ldrd A_l, A_h, [src, #8]
- strd B_l, B_h, [sp, #8]
- cfi_rel_offset (B_l, 8)
- cfi_rel_offset (B_h, 12)
- ldrd B_l, B_h, [src, #16]
- strd C_l, C_h, [sp, #16]
- cfi_rel_offset (C_l, 16)
- cfi_rel_offset (C_h, 20)
- ldrd C_l, C_h, [src, #24]
- strd D_l, D_h, [sp, #24]
- cfi_rel_offset (D_l, 24)
- cfi_rel_offset (D_h, 28)
- pld [src, #200]
- ldrd D_l, D_h, [src, #32]!
- b 1f
- .p2align 6
-2:
- pld [src, #232]
- strd A_l, A_h, [dst, #40]
- ldrd A_l, A_h, [src, #40]
- strd B_l, B_h, [dst, #48]
- ldrd B_l, B_h, [src, #48]
- strd C_l, C_h, [dst, #56]
- ldrd C_l, C_h, [src, #56]
- strd D_l, D_h, [dst, #64]!
- ldrd D_l, D_h, [src, #64]!
- subs tmp2, tmp2, #64
-1:
- strd A_l, A_h, [dst, #8]
- ldrd A_l, A_h, [src, #8]
- strd B_l, B_h, [dst, #16]
- ldrd B_l, B_h, [src, #16]
- strd C_l, C_h, [dst, #24]
- ldrd C_l, C_h, [src, #24]
- strd D_l, D_h, [dst, #32]
- ldrd D_l, D_h, [src, #32]
- bcs 2b
- /* Save the remaining bytes and restore the callee-saved regs. */
- strd A_l, A_h, [dst, #40]
- add src, src, #40
- strd B_l, B_h, [dst, #48]
- ldrd B_l, B_h, [sp, #8]
- cfi_restore (B_l)
- cfi_restore (B_h)
- strd C_l, C_h, [dst, #56]
- ldrd C_l, C_h, [sp, #16]
- cfi_restore (C_l)
- cfi_restore (C_h)
- strd D_l, D_h, [dst, #64]
- ldrd D_l, D_h, [sp, #24]
- cfi_restore (D_l)
- cfi_restore (D_h)
- add dst, dst, #72
- tst tmp2, #0x3f
- bne .Ltail63aligned
- ldr tmp2, [sp], #FRAME_SIZE
- cfi_adjust_cfa_offset (-FRAME_SIZE)
- cfi_restore (tmp2)
- bx lr
-#endif
-
- cfi_restore_state
- cfi_remember_state
-
-.Lcpy_notaligned:
- pld [src, #0]
- pld [src, #64]
- /* There's at least 64 bytes to copy, but there is no mutual
- alignment. */
- /* Bring DST to 64-bit alignment. */
- lsls tmp2, dst, #29
- pld [src, #(2 * 64)]
- beq 1f
- rsbs tmp2, tmp2, #0
- sub count, count, tmp2, lsr #29
- ldrmi tmp1, [src], #4
- strmi tmp1, [dst], #4
- lsls tmp2, tmp2, #2
- ldrbne tmp1, [src], #1
- ldrhcs tmp2, [src], #2
- strbne tmp1, [dst], #1
- strhcs tmp2, [dst], #2
-1:
- pld [src, #(3 * 64)]
- subs count, count, #64
- ldrmi tmp2, [sp], #FRAME_SIZE
- bmi .Ltail63unaligned
- pld [src, #(4 * 64)]
-
-#ifdef USE_NEON
- /* These need an extra layer of macro just to work around a
- bug in the assembler's parser when an operand starts with
- a {...}. */
- .macro neon_load_multi reglist, basereg
- vld1.8 {\reglist}, [\basereg]!
- .endm
- .macro neon_store_multi reglist, basereg
- vst1.8 {\reglist}, [ALIGN (\basereg, 64)]!
- .endm
-
- neon_load_multi d0-d3, src
- neon_load_multi d4-d7, src
- subs count, count, #64
- bmi 2f
-1:
- pld [src, #(4 * 64)]
- neon_store_multi d0-d3, dst
- neon_load_multi d0-d3, src
- neon_store_multi d4-d7, dst
- neon_load_multi d4-d7, src
- subs count, count, #64
- bpl 1b
-2:
- neon_store_multi d0-d3, dst
- neon_store_multi d4-d7, dst
- ands count, count, #0x3f
-#else
- /* Use an SMS style loop to maximize the I/O bandwidth. */
- sub src, src, #4
- sub dst, dst, #8
- subs tmp2, count, #64 /* Use tmp2 for count. */
- ldr A_l, [src, #4]
- ldr A_h, [src, #8]
- strd B_l, B_h, [sp, #8]
- cfi_rel_offset (B_l, 8)
- cfi_rel_offset (B_h, 12)
- ldr B_l, [src, #12]
- ldr B_h, [src, #16]
- strd C_l, C_h, [sp, #16]
- cfi_rel_offset (C_l, 16)
- cfi_rel_offset (C_h, 20)
- ldr C_l, [src, #20]
- ldr C_h, [src, #24]
- strd D_l, D_h, [sp, #24]
- cfi_rel_offset (D_l, 24)
- cfi_rel_offset (D_h, 28)
- ldr D_l, [src, #28]
- ldr D_h, [src, #32]!
- b 1f
- .p2align 6
-2:
- pld [src, #(5 * 64) - (32 - 4)]
- strd A_l, A_h, [dst, #40]
- ldr A_l, [src, #36]
- ldr A_h, [src, #40]
- strd B_l, B_h, [dst, #48]
- ldr B_l, [src, #44]
- ldr B_h, [src, #48]
- strd C_l, C_h, [dst, #56]
- ldr C_l, [src, #52]
- ldr C_h, [src, #56]
- strd D_l, D_h, [dst, #64]!
- ldr D_l, [src, #60]
- ldr D_h, [src, #64]!
- subs tmp2, tmp2, #64
-1:
- strd A_l, A_h, [dst, #8]
- ldr A_l, [src, #4]
- ldr A_h, [src, #8]
- strd B_l, B_h, [dst, #16]
- ldr B_l, [src, #12]
- ldr B_h, [src, #16]
- strd C_l, C_h, [dst, #24]
- ldr C_l, [src, #20]
- ldr C_h, [src, #24]
- strd D_l, D_h, [dst, #32]
- ldr D_l, [src, #28]
- ldr D_h, [src, #32]
- bcs 2b
-
- /* Save the remaining bytes and restore the callee-saved regs. */
- strd A_l, A_h, [dst, #40]
- add src, src, #36
- strd B_l, B_h, [dst, #48]
- ldrd B_l, B_h, [sp, #8]
- cfi_restore (B_l)
- cfi_restore (B_h)
- strd C_l, C_h, [dst, #56]
- ldrd C_l, C_h, [sp, #16]
- cfi_restore (C_l)
- cfi_restore (C_h)
- strd D_l, D_h, [dst, #64]
- ldrd D_l, D_h, [sp, #24]
- cfi_restore (D_l)
- cfi_restore (D_h)
- add dst, dst, #72
- ands count, tmp2, #0x3f
-#endif
- ldr tmp2, [sp], #FRAME_SIZE
- cfi_adjust_cfa_offset (-FRAME_SIZE)
- cfi_restore (tmp2)
- bne .Ltail63unaligned
- bx lr
-
-END(memcpy)
-libc_hidden_builtin_def (memcpy)
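
The dispatch_7_dword / dispatch_15_word macros in the file above implement a computed-jump tail: advance the pointers past the remainder, then branch into a run of load/store steps so that exactly (count / unit size) of them execute.  A hedged C analogue in the style of Duff's device is sketched below; copy_tail_dwords is an illustrative name, and the 8-byte memcpy calls stand in for the ldrd/strd (or vldr/vstr) pairs.

/* Hedged C analogue of dispatch_7_dword: copy the double-word portion
   (n & 0x38 bytes) of a sub-64-byte tail by jumping into a fall-through
   switch; the remaining n & 7 bytes are handled separately, as in the
   assembly.  */
#include <stddef.h>
#include <string.h>

void
copy_tail_dwords (unsigned char *dst, const unsigned char *src, size_t n)
{
  size_t bytes = n & 0x38;          /* 0..56, a multiple of 8.  */
  dst += bytes;                     /* Advance past the tail...  */
  src += bytes;
  switch (bytes >> 3)               /* ...then step back from the end.  */
    {
    case 7: memcpy (dst - 56, src - 56, 8); /* FALLTHRU */
    case 6: memcpy (dst - 48, src - 48, 8); /* FALLTHRU */
    case 5: memcpy (dst - 40, src - 40, 8); /* FALLTHRU */
    case 4: memcpy (dst - 32, src - 32, 8); /* FALLTHRU */
    case 3: memcpy (dst - 24, src - 24, 8); /* FALLTHRU */
    case 2: memcpy (dst - 16, src - 16, 8); /* FALLTHRU */
    case 1: memcpy (dst - 8,  src - 8,  8); /* FALLTHRU */
    case 0: break;
    }
}

The assembly gets the same effect with a single add pc, pc, tmp1 (or, when ARM_ALWAYS_BX is in force, a bx into a table of ARM_BX_ALIGN_LOG2-aligned steps), so no per-step branch is needed.
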
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/sysdeps/arm/armv7/multiarch/memcpy_neon.S
deleted file mode 100644
index e60d1cc0e1..0000000000
--- a/sysdeps/arm/armv7/multiarch/memcpy_neon.S
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifdef __ARM_NEON__
-/* Under __ARM_NEON__, this file defines memcpy directly. */
-libc_hidden_builtin_def (memcpy)
-#else
-# define memcpy __memcpy_neon
-#endif
-
-#define MEMCPY_NEON
-#include "memcpy_impl.S"
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
deleted file mode 100644
index e008c041ed..0000000000
--- a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* Under __ARM_NEON__, memcpy_neon.S defines memcpy directly
- and the __memcpy_vfp code will never be used. */
-#ifndef __ARM_NEON__
-# define MEMCPY_VFP
-# define memcpy __memcpy_vfp
-# include "memcpy_impl.S"
-#endif
diff --git a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S
deleted file mode 100644
index 25d055754e..0000000000
--- a/sysdeps/arm/armv7/strcmp.S
+++ /dev/null
@@ -1,519 +0,0 @@
-/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
- Copyright (C) 2012-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <arm-features.h>
-#include <sysdep.h>
-
-/* Implementation of strcmp for ARMv7 when DSP instructions are
- available. Use ldrd to support wider loads, provided the data
- is sufficiently aligned. Use saturating arithmetic to optimize
- the compares. */
-
-/* Build Options:
- STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
- string. If comparing completely random strings the pre-check will
- save time, since there is a very high probability of a mismatch in
- the first character: we save significant overhead if this is the
- common case. However, if strings are likely to be identical (e.g.
- because we're verifying a hit in a hash table), then this check
- is largely redundant. */
-
-#define STRCMP_PRECHECK 1
-
- .syntax unified
-
-#ifdef __ARM_BIG_ENDIAN
-# define S2LO lsl
-# define S2LOEQ lsleq
-# define S2HI lsr
-# define MSB 0x000000ff
-# define LSB 0xff000000
-# define BYTE0_OFFSET 24
-# define BYTE1_OFFSET 16
-# define BYTE2_OFFSET 8
-# define BYTE3_OFFSET 0
-#else /* not __ARM_BIG_ENDIAN */
-# define S2LO lsr
-# define S2LOEQ lsreq
-# define S2HI lsl
-# define BYTE0_OFFSET 0
-# define BYTE1_OFFSET 8
-# define BYTE2_OFFSET 16
-# define BYTE3_OFFSET 24
-# define MSB 0xff000000
-# define LSB 0x000000ff
-#endif /* not __ARM_BIG_ENDIAN */
-
-/* Parameters and result. */
-#define src1 r0
-#define src2 r1
-#define result r0 /* Overlaps src1. */
-
-/* Internal variables. */
-#define tmp1 r4
-#define tmp2 r5
-#define const_m1 r12
-
-/* Additional internal variables for 64-bit aligned data. */
-#define data1a r2
-#define data1b r3
-#define data2a r6
-#define data2b r7
-#define syndrome_a tmp1
-#define syndrome_b tmp2
-
-/* Additional internal variables for 32-bit aligned data. */
-#define data1 r2
-#define data2 r3
-#define syndrome tmp2
-
-
-#ifndef NO_THUMB
-/* This code is best on Thumb. */
- .thumb
-
-/* In Thumb code we can't use MVN with a register shift, but we do have ORN. */
-.macro prepare_mask mask_reg, nbits_reg
- S2HI \mask_reg, const_m1, \nbits_reg
-.endm
-.macro apply_mask data_reg, mask_reg
- orn \data_reg, \data_reg, \mask_reg
-.endm
-#else
-/* In ARM code we don't have ORN, but we can use MVN with a register shift. */
-.macro prepare_mask mask_reg, nbits_reg
- mvn \mask_reg, const_m1, S2HI \nbits_reg
-.endm
-.macro apply_mask data_reg, mask_reg
- orr \data_reg, \data_reg, \mask_reg
-.endm
-
-/* These clobber the condition codes, which the real Thumb cbz/cbnz
- instructions do not. But it doesn't matter for any of the uses here. */
-.macro cbz reg, label
- cmp \reg, #0
- beq \label
-.endm
-.macro cbnz reg, label
- cmp \reg, #0
- bne \label
-.endm
-#endif
-
-
- /* Macro to compute and return the result value for word-aligned
- cases. */
- .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
-#ifdef __ARM_BIG_ENDIAN
- /* If data1 contains a zero byte, then syndrome will contain a 1 in
- bit 7 of that byte. Otherwise, the highest set bit in the
- syndrome will highlight the first different bit. It is therefore
- sufficient to extract the eight bits starting with the syndrome
- bit. */
- clz tmp1, \synd
- lsl r1, \d2, tmp1
- .if \restore_r6
- ldrd r6, r7, [sp, #8]
- .endif
- lsl \d1, \d1, tmp1
- lsr result, \d1, #24
- ldrd r4, r5, [sp], #16
- cfi_remember_state
- cfi_def_cfa_offset (0)
- cfi_restore (r4)
- cfi_restore (r5)
- cfi_restore (r6)
- cfi_restore (r7)
- sub result, result, r1, lsr #24
- bx lr
-#else
- /* To use the big-endian trick we'd have to reverse all three words.
- that's slower than this approach. */
- rev \synd, \synd
- clz tmp1, \synd
- bic tmp1, tmp1, #7
- lsr r1, \d2, tmp1
- .if \restore_r6
- ldrd r6, r7, [sp, #8]
- .endif
- lsr \d1, \d1, tmp1
- and result, \d1, #255
- and r1, r1, #255
- ldrd r4, r5, [sp], #16
- cfi_remember_state
- cfi_def_cfa_offset (0)
- cfi_restore (r4)
- cfi_restore (r5)
- cfi_restore (r6)
- cfi_restore (r7)
- sub result, result, r1
-
- bx lr
-#endif
- .endm
-
- .text
- .p2align 5
-.Lstrcmp_start_addr:
-#if STRCMP_PRECHECK == 1
-.Lfastpath_exit:
- sub r0, r2, r3
- bx lr
- nop
-#endif
-ENTRY (strcmp)
-#if STRCMP_PRECHECK == 1
- ldrb r2, [src1]
- ldrb r3, [src2]
- cmp r2, #1
- it cs
- cmpcs r2, r3
- bne .Lfastpath_exit
-#endif
- strd r4, r5, [sp, #-16]!
- cfi_def_cfa_offset (16)
- cfi_offset (r4, -16)
- cfi_offset (r5, -12)
- orr tmp1, src1, src2
- strd r6, r7, [sp, #8]
- cfi_offset (r6, -8)
- cfi_offset (r7, -4)
- mvn const_m1, #0
- lsl r2, tmp1, #29
- cbz r2, .Lloop_aligned8
-
-.Lnot_aligned:
- eor tmp1, src1, src2
- tst tmp1, #7
- bne .Lmisaligned8
-
- /* Deal with mutual misalignment by aligning downwards and then
- masking off the unwanted loaded data to prevent a difference. */
- and tmp1, src1, #7
- bic src1, src1, #7
- and tmp2, tmp1, #3
- bic src2, src2, #7
- lsl tmp2, tmp2, #3 /* Bytes -> bits. */
- ldrd data1a, data1b, [src1], #16
- tst tmp1, #4
- ldrd data2a, data2b, [src2], #16
- prepare_mask tmp1, tmp2
- apply_mask data1a, tmp1
- apply_mask data2a, tmp1
- beq .Lstart_realigned8
- apply_mask data1b, tmp1
- mov data1a, const_m1
- apply_mask data2b, tmp1
- mov data2a, const_m1
- b .Lstart_realigned8
-
- /* Unwind the inner loop by a factor of 2, giving 16 bytes per
- pass. */
- .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
- .p2align 2 /* Always word aligned. */
-.Lloop_aligned8:
- ldrd data1a, data1b, [src1], #16
- ldrd data2a, data2b, [src2], #16
-.Lstart_realigned8:
- uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
- eor syndrome_a, data1a, data2a
- sel syndrome_a, syndrome_a, const_m1
- cbnz syndrome_a, .Ldiff_in_a
- uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
- eor syndrome_b, data1b, data2b
- sel syndrome_b, syndrome_b, const_m1
- cbnz syndrome_b, .Ldiff_in_b
-
- ldrd data1a, data1b, [src1, #-8]
- ldrd data2a, data2b, [src2, #-8]
- uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
- eor syndrome_a, data1a, data2a
- sel syndrome_a, syndrome_a, const_m1
- uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
- eor syndrome_b, data1b, data2b
- sel syndrome_b, syndrome_b, const_m1
- /* Can't use CBZ for backwards branch. */
- orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
- beq .Lloop_aligned8
-
-.Ldiff_found:
- cbnz syndrome_a, .Ldiff_in_a
-
-.Ldiff_in_b:
- strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
-
-.Ldiff_in_a:
- cfi_restore_state
- strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
-
- cfi_restore_state
-.Lmisaligned8:
- tst tmp1, #3
- bne .Lmisaligned4
- ands tmp1, src1, #3
- bne .Lmutual_align4
-
- /* Unrolled by a factor of 2, to reduce the number of post-increment
- operations. */
-.Lloop_aligned4:
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-.Lstart_realigned4:
- uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
- eor syndrome, data1, data2
- sel syndrome, syndrome, const_m1
- cbnz syndrome, .Laligned4_done
- ldr data1, [src1, #-4]
- ldr data2, [src2, #-4]
- uadd8 syndrome, data1, const_m1
- eor syndrome, data1, data2
- sel syndrome, syndrome, const_m1
- cmp syndrome, #0
- beq .Lloop_aligned4
-
-.Laligned4_done:
- strcmp_epilogue_aligned syndrome, data1, data2, 0
-
-.Lmutual_align4:
- cfi_restore_state
- /* Deal with mutual misalignment by aligning downwards and then
- masking off the unwanted loaded data to prevent a difference. */
- lsl tmp1, tmp1, #3 /* Bytes -> bits. */
- bic src1, src1, #3
- ldr data1, [src1], #8
- bic src2, src2, #3
- ldr data2, [src2], #8
-
- prepare_mask tmp1, tmp1
- apply_mask data1, tmp1
- apply_mask data2, tmp1
- b .Lstart_realigned4
-
-.Lmisaligned4:
- ands tmp1, src1, #3
- beq .Lsrc1_aligned
- sub src2, src2, tmp1
- bic src1, src1, #3
- lsls tmp1, tmp1, #31
- ldr data1, [src1], #4
- beq .Laligned_m2
- bcs .Laligned_m1
-
-#if STRCMP_PRECHECK == 0
- ldrb data2, [src2, #1]
- uxtb tmp1, data1, ror #BYTE1_OFFSET
- subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbz data2, .Lmisaligned_exit
-
-.Laligned_m2:
- ldrb data2, [src2, #2]
- uxtb tmp1, data1, ror #BYTE2_OFFSET
- subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbz data2, .Lmisaligned_exit
-
-.Laligned_m1:
- ldrb data2, [src2, #3]
- uxtb tmp1, data1, ror #BYTE3_OFFSET
- subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- add src2, src2, #4
- cbnz data2, .Lsrc1_aligned
-#else /* STRCMP_PRECHECK */
- /* If we've done the pre-check, then we don't need to check the
- first byte again here. */
- ldrb data2, [src2, #2]
- uxtb tmp1, data1, ror #BYTE2_OFFSET
- subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbz data2, .Lmisaligned_exit
-
-.Laligned_m2:
- ldrb data2, [src2, #3]
- uxtb tmp1, data1, ror #BYTE3_OFFSET
- subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbnz data2, .Laligned_m1
-#endif
-
-.Lmisaligned_exit:
- mov result, tmp1
- ldr r4, [sp], #16
- cfi_remember_state
- cfi_def_cfa_offset (0)
- cfi_restore (r4)
- cfi_restore (r5)
- cfi_restore (r6)
- cfi_restore (r7)
- bx lr
-
-#if STRCMP_PRECHECK == 1
-.Laligned_m1:
- add src2, src2, #4
-#endif
-.Lsrc1_aligned:
- cfi_restore_state
- /* src1 is word aligned, but src2 has no common alignment
- with it. */
- ldr data1, [src1], #4
- lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
-
- bic src2, src2, #3
- ldr data2, [src2], #4
- bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
- bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
-
- /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
-.Loverlap3:
- bic tmp1, data1, #MSB
- uadd8 syndrome, data1, const_m1
- eors syndrome, tmp1, data2, S2LO #8
- sel syndrome, syndrome, const_m1
- bne 4f
- cbnz syndrome, 5f
- ldr data2, [src2], #4
- eor tmp1, tmp1, data1
- cmp tmp1, data2, S2HI #24
- bne 6f
- ldr data1, [src1], #4
- b .Loverlap3
-4:
- S2LO data2, data2, #8
- b .Lstrcmp_tail
-
-5:
- bics syndrome, syndrome, #MSB
- bne .Lstrcmp_done_equal
-
- /* We can only get here if the MSB of data1 contains 0, so
- fast-path the exit. */
- ldrb result, [src2]
- ldrd r4, r5, [sp], #16
- cfi_remember_state
- cfi_def_cfa_offset (0)
- cfi_restore (r4)
- cfi_restore (r5)
- /* R6/7 Not used in this sequence. */
- cfi_restore (r6)
- cfi_restore (r7)
- neg result, result
- bx lr
-
-6:
- cfi_restore_state
- S2LO data1, data1, #24
- and data2, data2, #LSB
- b .Lstrcmp_tail
-
- .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
-.Loverlap2:
- and tmp1, data1, const_m1, S2LO #16
- uadd8 syndrome, data1, const_m1
- eors syndrome, tmp1, data2, S2LO #16
- sel syndrome, syndrome, const_m1
- bne 4f
- cbnz syndrome, 5f
- ldr data2, [src2], #4
- eor tmp1, tmp1, data1
- cmp tmp1, data2, S2HI #16
- bne 6f
- ldr data1, [src1], #4
- b .Loverlap2
-4:
- S2LO data2, data2, #16
- b .Lstrcmp_tail
-5:
- ands syndrome, syndrome, const_m1, S2LO #16
- bne .Lstrcmp_done_equal
-
- ldrh data2, [src2]
- S2LO data1, data1, #16
-#ifdef __ARM_BIG_ENDIAN
- lsl data2, data2, #16
-#endif
- b .Lstrcmp_tail
-
-6:
- S2LO data1, data1, #16
- and data2, data2, const_m1, S2LO #16
- b .Lstrcmp_tail
-
- .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
-.Loverlap1:
- and tmp1, data1, #LSB
- uadd8 syndrome, data1, const_m1
- eors syndrome, tmp1, data2, S2LO #24
- sel syndrome, syndrome, const_m1
- bne 4f
- cbnz syndrome, 5f
- ldr data2, [src2], #4
- eor tmp1, tmp1, data1
- cmp tmp1, data2, S2HI #8
- bne 6f
- ldr data1, [src1], #4
- b .Loverlap1
-4:
- S2LO data2, data2, #24
- b .Lstrcmp_tail
-5:
- tst syndrome, #LSB
- bne .Lstrcmp_done_equal
- ldr data2, [src2]
-6:
- S2LO data1, data1, #8
- bic data2, data2, #MSB
- b .Lstrcmp_tail
-
-.Lstrcmp_done_equal:
- mov result, #0
- ldrd r4, r5, [sp], #16
- cfi_remember_state
- cfi_def_cfa_offset (0)
- cfi_restore (r4)
- cfi_restore (r5)
- /* R6/7 not used in this sequence. */
- cfi_restore (r6)
- cfi_restore (r7)
- bx lr
-
-.Lstrcmp_tail:
- cfi_restore_state
-#ifndef __ARM_BIG_ENDIAN
- rev data1, data1
- rev data2, data2
- /* Now everything looks big-endian... */
-#endif
- uadd8 tmp1, data1, const_m1
- eor tmp1, data1, data2
- sel syndrome, tmp1, const_m1
- clz tmp1, syndrome
- lsl data1, data1, tmp1
- lsl data2, data2, tmp1
- lsr result, data1, #24
- ldrd r4, r5, [sp], #16
- cfi_def_cfa_offset (0)
- cfi_restore (r4)
- cfi_restore (r5)
- /* R6/7 not used in this sequence. */
- cfi_restore (r6)
- cfi_restore (r7)
- sub result, result, data2, lsr #24
- bx lr
-END (strcmp)
-libc_hidden_builtin_def (strcmp)
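
The deleted strcmp above leans on the ARM DSP uadd8/sel pair to build a per-word "syndrome" that becomes nonzero as soon as a loaded word either differs from its counterpart or contains the terminating NUL; clz (after a rev on little-endian) then locates the first interesting byte.  Below is a portable, hedged sketch of the same word-at-a-time idea: strcmp_aligned_sketch and has_zero_byte are illustrative names, a byte-wise tail replaces the clz-based epilogue, and only the mutually word-aligned case is handled.

/* Minimal portable sketch of the word-at-a-time comparison used above:
   scan both strings a 32-bit word at a time while the words are equal
   and NUL-free, then finish byte by byte.  Assumes both pointers are
   4-byte aligned; the real code handles every misalignment case and,
   like this sketch, may read a few bytes past the terminator inside
   the final word.  */
#include <stdint.h>
#include <string.h>

/* Classic trick: nonzero iff some byte of w is 0x00.  */
static uint32_t
has_zero_byte (uint32_t w)
{
  return (w - 0x01010101u) & ~w & 0x80808080u;
}

int
strcmp_aligned_sketch (const char *s1, const char *s2)
{
  for (;;)
    {
      uint32_t w1, w2;
      memcpy (&w1, s1, 4);              /* Aligned 4-byte loads.  */
      memcpy (&w2, s2, 4);

      /* "Syndrome": nonzero once the words differ or w1 holds a NUL;
         the ARM code computes this with uadd8/sel in two instructions.  */
      if ((w1 ^ w2) | has_zero_byte (w1))
        break;
      s1 += 4;
      s2 += 4;
    }

  /* Finish byte by byte inside the word that triggered the syndrome.  */
  for (;;)
    {
      unsigned char c1 = (unsigned char) *s1++;
      unsigned char c2 = (unsigned char) *s2++;
      if (c1 != c2 || c1 == '\0')
        return c1 - c2;
    }
}
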