diff options
-rw-r--r-- | ChangeLog | 16 | ||||
-rw-r--r-- | sysdeps/sparc/sparc32/dl-machine.h | 2 | ||||
-rw-r--r-- | sysdeps/sparc/sparc32/dl-plt.h | 47 | ||||
-rw-r--r-- | sysdeps/sparc/sparc64/dl-machine.h | 2 | ||||
-rw-r--r-- | sysdeps/sparc/sparc64/dl-plt.h | 29 |
5 files changed, 83 insertions, 13 deletions
@@ -1,3 +1,19 @@ +2010-03-03 David S. Miller <davem@davemloft.net> + + * sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must + pass '1' for 't' argument to sparc_fixup_plt. + * sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel): + Likewise. + * sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define. + (sparc_fixup_plt): Document 't' argument. Enable branch + optimization and use v9 branches when possible. Explain why we + cannot unconditionally patch the branch into the first PLT + instruction. + * sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't' + argument. Use v9 branches when possible. Explain why we can in + fact unconditionally use a branch in the first PLT instruction + here. + 2010-02-28 Roland McGrath <roland@redhat.com> * elf/elf.h (NT_X86_XSTATE): New macro. diff --git a/sysdeps/sparc/sparc32/dl-machine.h b/sysdeps/sparc/sparc32/dl-machine.h index 53257104a6..9631db32e1 100644 --- a/sysdeps/sparc/sparc32/dl-machine.h +++ b/sysdeps/sparc/sparc32/dl-machine.h @@ -563,7 +563,7 @@ elf_machine_lazy_rel (struct link_map *map, { Elf32_Addr value = map->l_addr + reloc->r_addend; value = ((Elf32_Addr (*) (void)) value) (); - sparc_fixup_plt (reloc, reloc_addr, value, 0, 1); + sparc_fixup_plt (reloc, reloc_addr, value, 1, 1); } else if (r_type == R_SPARC_NONE) ; diff --git a/sysdeps/sparc/sparc32/dl-plt.h b/sysdeps/sparc/sparc32/dl-plt.h index edcc5c1374..bfb891fe69 100644 --- a/sysdeps/sparc/sparc32/dl-plt.h +++ b/sysdeps/sparc/sparc32/dl-plt.h @@ -25,19 +25,55 @@ #define OPCODE_JMP_G1 0x81c06000 /* jmp %g1+?; add lo 10 bits of value */ #define OPCODE_SAVE_SP 0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */ #define OPCODE_BA 0x30800000 /* b,a ?; add PC-rel word address */ +#define OPCODE_BA_PT 0x30480000 /* ba,a,pt %icc, ?; add PC-rel word address */ static inline __attribute__ ((always_inline)) Elf32_Addr sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr, Elf32_Addr value, int t, int do_flush) { - Elf32_Sword disp = value - (Elf32_Addr) reloc_addr; + Elf32_Sword disp; - if (0 && disp >= -0x800000 && disp < 0x800000) + /* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap, + in which case we'll be resolving all PLT entries and thus can + optimize by overwriting instructions starting at the first PLT entry + instruction and we need not be mindful of thread safety. + + Otherwise, 't' is '1'. */ + reloc_addr += t; + disp = value - (Elf32_Addr) reloc_addr; + + if (disp >= -0x800000 && disp < 0x800000) { - /* Don't need to worry about thread safety. We're writing just one - instruction. */ + unsigned int insn = OPCODE_BA | ((disp >> 2) & 0x3fffff); + +#ifdef __sparc_v9__ + /* On V9 we can do even better by using a branch with + prediction if we fit into the even smaller 19-bit + displacement field. */ + if (disp >= -0x100000 && disp < 0x100000) + insn = OPCODE_BA_PT | ((disp >> 2) & 0x07ffff); +#endif + + /* Even if we are writing just a single branch, we must not + ignore the 't' offset. Consider a case where we have some + PLT slots which can be optimized into a single branch and + some which cannot. Then we can end up with a PLT which looks + like: + + PLT4.0: sethi %(PLT_4_INDEX), %g1 + sethi %(fully_resolved_sym_4), %g1 + jmp %g1 + %lo(fully_resolved_sym_4) + PLT5.0: ba,a fully_resolved_sym_5 + ba,a PLT0.0 + ... + + The delay slot of that jmp must always be either a sethi to + %g1 or a nop. But if we try to place this displacement + branch there, PLT4.0 will jump to fully_resolved_sym_4 for 1 + instruction and then go immediately to + fully_resolved_sym_5. */ - reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff); + reloc_addr[0] = insn; if (do_flush) __asm __volatile ("flush %0" : : "r"(reloc_addr)); } @@ -48,7 +84,6 @@ sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr, need not be done during bootstrapping, since there are no threads. But we also can't tell if we _can_ use flush, so don't. */ - reloc_addr += t; reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff); if (do_flush) __asm __volatile ("flush %0+4" : : "r"(reloc_addr)); diff --git a/sysdeps/sparc/sparc64/dl-machine.h b/sysdeps/sparc/sparc64/dl-machine.h index 4c915eb586..fcfbb06ac2 100644 --- a/sysdeps/sparc/sparc64/dl-machine.h +++ b/sysdeps/sparc/sparc64/dl-machine.h @@ -661,7 +661,7 @@ elf_machine_lazy_rel (struct link_map *map, { /* 'high' is always zero, for large PLT entries the linker emits an R_SPARC_IRELATIVE. */ - sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0); + sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 1); } else *reloc_addr = value; diff --git a/sysdeps/sparc/sparc64/dl-plt.h b/sysdeps/sparc/sparc64/dl-plt.h index e06be43a0a..ca2fe3bbd8 100644 --- a/sysdeps/sparc/sparc64/dl-plt.h +++ b/sysdeps/sparc/sparc64/dl-plt.h @@ -28,7 +28,14 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc, Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr; Elf64_Sxword disp = value - plt_vaddr; - /* Now move plt_vaddr up to the call instruction. */ + /* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap, + in which case we'll be resolving all PLT entries and thus can + optimize by overwriting instructions starting at the first PLT entry + instruction and we need not be mindful of thread safety. + + Otherwise, 't' is '1'. + + Now move plt_vaddr up to the call instruction. */ plt_vaddr += ((t + 1) * 4); /* PLT entries .PLT32768 and above look always the same. */ @@ -39,10 +46,22 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc, /* Near destination. */ else if (disp >= -0x800000 && disp < 0x800000) { - /* As this is just one instruction, it is thread safe and so - we can avoid the unnecessary sethi FOO, %g1. - b,a target */ - insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff); + unsigned int insn; + + /* ba,a */ + insn = 0x30800000 | ((disp >> 2) & 0x3fffff); + + if (disp >= -0x100000 && disp < 0x100000) + { + /* ba,a,pt %icc */ + insn = 0x30480000 | ((disp >> 2) & 0x07ffff); + } + + /* As this is just one instruction, it is thread safe and so we + can avoid the unnecessary sethi FOO, %g1. Each 64-bit PLT + entry is 8 instructions long, so we can't run into the 'jmp' + delay slot problems 32-bit PLTs can. */ + insns[0] = insn; __asm __volatile ("flush %0" : : "r" (insns)); } /* 32-bit Sparc style, the target is in the lower 32-bits of |