aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog16
-rw-r--r--sysdeps/sparc/sparc32/dl-machine.h2
-rw-r--r--sysdeps/sparc/sparc32/dl-plt.h47
-rw-r--r--sysdeps/sparc/sparc64/dl-machine.h2
-rw-r--r--sysdeps/sparc/sparc64/dl-plt.h29
5 files changed, 83 insertions, 13 deletions
diff --git a/ChangeLog b/ChangeLog
index b8118dd8f0..bd9f6ecab4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2010-03-03 David S. Miller <davem@davemloft.net>
+
+ * sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must
+ pass '1' for 't' argument to sparc_fixup_plt.
+ * sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel):
+ Likewise.
+ * sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define.
+ (sparc_fixup_plt): Document 't' argument. Enable branch
+ optimization and use v9 branches when possible. Explain why we
+ cannot unconditionally patch the branch into the first PLT
+ instruction.
+ * sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't'
+ argument. Use v9 branches when possible. Explain why we can in
+ fact unconditionally use a branch in the first PLT instruction
+ here.
+
2010-02-28 Roland McGrath <roland@redhat.com>
* elf/elf.h (NT_X86_XSTATE): New macro.
diff --git a/sysdeps/sparc/sparc32/dl-machine.h b/sysdeps/sparc/sparc32/dl-machine.h
index 53257104a6..9631db32e1 100644
--- a/sysdeps/sparc/sparc32/dl-machine.h
+++ b/sysdeps/sparc/sparc32/dl-machine.h
@@ -563,7 +563,7 @@ elf_machine_lazy_rel (struct link_map *map,
{
Elf32_Addr value = map->l_addr + reloc->r_addend;
value = ((Elf32_Addr (*) (void)) value) ();
- sparc_fixup_plt (reloc, reloc_addr, value, 0, 1);
+ sparc_fixup_plt (reloc, reloc_addr, value, 1, 1);
}
else if (r_type == R_SPARC_NONE)
;
diff --git a/sysdeps/sparc/sparc32/dl-plt.h b/sysdeps/sparc/sparc32/dl-plt.h
index edcc5c1374..bfb891fe69 100644
--- a/sysdeps/sparc/sparc32/dl-plt.h
+++ b/sysdeps/sparc/sparc32/dl-plt.h
@@ -25,19 +25,55 @@
#define OPCODE_JMP_G1 0x81c06000 /* jmp %g1+?; add lo 10 bits of value */
#define OPCODE_SAVE_SP 0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */
#define OPCODE_BA 0x30800000 /* b,a ?; add PC-rel word address */
+#define OPCODE_BA_PT 0x30480000 /* ba,a,pt %icc, ?; add PC-rel word address */
static inline __attribute__ ((always_inline)) Elf32_Addr
sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
Elf32_Addr value, int t, int do_flush)
{
- Elf32_Sword disp = value - (Elf32_Addr) reloc_addr;
+ Elf32_Sword disp;
- if (0 && disp >= -0x800000 && disp < 0x800000)
+ /* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap,
+ in which case we'll be resolving all PLT entries and thus can
+ optimize by overwriting instructions starting at the first PLT entry
+ instruction and we need not be mindful of thread safety.
+
+ Otherwise, 't' is '1'. */
+ reloc_addr += t;
+ disp = value - (Elf32_Addr) reloc_addr;
+
+ if (disp >= -0x800000 && disp < 0x800000)
{
- /* Don't need to worry about thread safety. We're writing just one
- instruction. */
+ unsigned int insn = OPCODE_BA | ((disp >> 2) & 0x3fffff);
+
+#ifdef __sparc_v9__
+ /* On V9 we can do even better by using a branch with
+ prediction if we fit into the even smaller 19-bit
+ displacement field. */
+ if (disp >= -0x100000 && disp < 0x100000)
+ insn = OPCODE_BA_PT | ((disp >> 2) & 0x07ffff);
+#endif
+
+ /* Even if we are writing just a single branch, we must not
+ ignore the 't' offset. Consider a case where we have some
+ PLT slots which can be optimized into a single branch and
+ some which cannot. Then we can end up with a PLT which looks
+ like:
+
+ PLT4.0: sethi %(PLT_4_INDEX), %g1
+ sethi %(fully_resolved_sym_4), %g1
+ jmp %g1 + %lo(fully_resolved_sym_4)
+ PLT5.0: ba,a fully_resolved_sym_5
+ ba,a PLT0.0
+ ...
+
+ The delay slot of that jmp must always be either a sethi to
+ %g1 or a nop. But if we try to place this displacement
+ branch there, PLT4.0 will jump to fully_resolved_sym_4 for 1
+ instruction and then go immediately to
+ fully_resolved_sym_5. */
- reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff);
+ reloc_addr[0] = insn;
if (do_flush)
__asm __volatile ("flush %0" : : "r"(reloc_addr));
}
@@ -48,7 +84,6 @@ sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
need not be done during bootstrapping, since there are no threads.
But we also can't tell if we _can_ use flush, so don't. */
- reloc_addr += t;
reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff);
if (do_flush)
__asm __volatile ("flush %0+4" : : "r"(reloc_addr));
diff --git a/sysdeps/sparc/sparc64/dl-machine.h b/sysdeps/sparc/sparc64/dl-machine.h
index 4c915eb586..fcfbb06ac2 100644
--- a/sysdeps/sparc/sparc64/dl-machine.h
+++ b/sysdeps/sparc/sparc64/dl-machine.h
@@ -661,7 +661,7 @@ elf_machine_lazy_rel (struct link_map *map,
{
/* 'high' is always zero, for large PLT entries the linker
emits an R_SPARC_IRELATIVE. */
- sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0);
+ sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 1);
}
else
*reloc_addr = value;
diff --git a/sysdeps/sparc/sparc64/dl-plt.h b/sysdeps/sparc/sparc64/dl-plt.h
index e06be43a0a..ca2fe3bbd8 100644
--- a/sysdeps/sparc/sparc64/dl-plt.h
+++ b/sysdeps/sparc/sparc64/dl-plt.h
@@ -28,7 +28,14 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
Elf64_Sxword disp = value - plt_vaddr;
- /* Now move plt_vaddr up to the call instruction. */
+ /* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap,
+ in which case we'll be resolving all PLT entries and thus can
+ optimize by overwriting instructions starting at the first PLT entry
+ instruction and we need not be mindful of thread safety.
+
+ Otherwise, 't' is '1'.
+
+ Now move plt_vaddr up to the call instruction. */
plt_vaddr += ((t + 1) * 4);
/* PLT entries .PLT32768 and above look always the same. */
@@ -39,10 +46,22 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
/* Near destination. */
else if (disp >= -0x800000 && disp < 0x800000)
{
- /* As this is just one instruction, it is thread safe and so
- we can avoid the unnecessary sethi FOO, %g1.
- b,a target */
- insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
+ unsigned int insn;
+
+ /* ba,a */
+ insn = 0x30800000 | ((disp >> 2) & 0x3fffff);
+
+ if (disp >= -0x100000 && disp < 0x100000)
+ {
+ /* ba,a,pt %icc */
+ insn = 0x30480000 | ((disp >> 2) & 0x07ffff);
+ }
+
+ /* As this is just one instruction, it is thread safe and so we
+ can avoid the unnecessary sethi FOO, %g1. Each 64-bit PLT
+ entry is 8 instructions long, so we can't run into the 'jmp'
+ delay slot problems 32-bit PLTs can. */
+ insns[0] = insn;
__asm __volatile ("flush %0" : : "r" (insns));
}
/* 32-bit Sparc style, the target is in the lower 32-bits of