4 files changed, 70 insertions, 63 deletions
diff --git a/elf/dl-close.c b/elf/dl-close.c
index b887a44888..1c7a861db1 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -703,7 +703,7 @@ _dl_close_worker (struct link_map *map, bool force)
       if (__glibc_unlikely (newgen == 0))
 	_dl_fatal_printf ("TLS generation counter wrapped!  Please report as described in "REPORT_BUGS_TO".\n");
       /* Can be read concurrently.  */
-      atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+      atomic_store_release (&GL(dl_tls_generation), newgen);
 
       if (tls_free_end == GL(dl_tls_static_used))
 	GL(dl_tls_static_used) = tls_free_start;
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 2d985e21d8..351931af04 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -405,7 +405,7 @@ update_tls_slotinfo (struct link_map *new)
     _dl_fatal_printf (N_("\
 TLS generation counter wrapped!  Please report this."));
   /* Can be read concurrently.  */
-  atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+  atomic_store_release (&GL(dl_tls_generation), newgen);
 
   /* We need a second pass for static tls data, because
      _dl_update_slotinfo must not be run while calls to
@@ -422,8 +422,8 @@ TLS generation counter wrapped!  Please report this."));
 	     now, but we can delay updating the DTV.  */
 	  imap->l_need_tls_init = 0;
 #ifdef SHARED
-	  /* Update the slot information data for at least the
-	     generation of the DSO we are allocating data for.  */
+	  /* Update the slot information data for the current
+	     generation.  */
 
 	  /* FIXME: This can terminate the process on memory
 	     allocation failure.  It is not possible to raise
@@ -431,7 +431,7 @@ TLS generation counter wrapped!  Please report this."));
 	     _dl_update_slotinfo would have to be split into two
 	     operations, similar to resize_scopes and update_scopes
 	     above.  This is related to bug 16134.  */
-	  _dl_update_slotinfo (imap->l_tls_modid);
+	  _dl_update_slotinfo (imap->l_tls_modid, newgen);
 #endif
 
 	  dl_init_static_tls (imap);
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
index 1d558c1e0c..e5c555d82c 100644
--- a/elf/dl-reloc.c
+++ b/elf/dl-reloc.c
@@ -112,11 +112,11 @@ _dl_try_allocate_static_tls (struct link_map *map, bool optional)
   if (map->l_real->l_relocated)
     {
 #ifdef SHARED
+      /* Update the DTV of the current thread.  Note: GL(dl_load_tls_lock)
+	 is held here so normal load of the generation counter is valid.  */
       if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation),
 			    0))
-	/* Update the slot information data for at least the generation of
-	   the DSO we are allocating data for.  */
-	(void) _dl_update_slotinfo (map->l_tls_modid);
+	(void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation));
 #endif
 
       dl_init_static_tls (map);
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 99b83ca696..c192b5a13a 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -715,57 +715,57 @@ allocate_and_init (struct link_map *map)
 
 
 struct link_map *
-_dl_update_slotinfo (unsigned long int req_modid)
+_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
 {
   struct link_map *the_map = NULL;
   dtv_t *dtv = THREAD_DTV ();
 
-  /* The global dl_tls_dtv_slotinfo array contains for each module
-     index the generation counter current when the entry was created.
+  /* CONCURRENCY NOTES:
+
+     The global dl_tls_dtv_slotinfo_list array contains for each module
+     index the generation counter current when that entry was updated.
      This array never shrinks so that all module indices which were
-     valid at some time can be used to access it.  Before the first
-     use of a new module index in this function the array was extended
-     appropriately.  Access also does not have to be guarded against
-     modifications of the array.  It is assumed that pointer-size
-     values can be read atomically even in SMP environments.  It is
-     possible that other threads at the same time dynamically load
-     code and therefore add to the slotinfo list.  This is a problem
-     since we must not pick up any information about incomplete work.
-     The solution to this is to ignore all dtv slots which were
-     created after the one we are currently interested.  We know that
-     dynamic loading for this module is completed and this is the last
-     load operation we know finished.  */
-  unsigned long int idx = req_modid;
+     valid at some time can be used to access it.  Concurrent loading
+     and unloading of modules can update slotinfo entries or extend
+     the array.  The updates happen under the GL(dl_load_tls_lock) and
+     finish with the release store of the generation counter to
+     GL(dl_tls_generation) which is synchronized with the load of
+     new_gen in the caller.  So updates up to new_gen are synchronized
+     but updates for later generations may not be.
+
+     Here we update the thread dtv from old_gen (== dtv[0].counter) to
+     new_gen generation.  For this, each dtv[i] entry is either set to
+     an unallocated state (set), or left unmodified (nop).  Where (set)
+     may resize the dtv first if modid i >= dtv[-1].counter. The rules
+     for the decision between (set) and (nop) are
+
+     (1) If slotinfo entry i is concurrently updated then either (set)
+         or (nop) is valid: TLS access cannot use dtv[i] unless it is
+         synchronized with a generation > new_gen.
+
+     Otherwise, if the generation of slotinfo entry i is gen and the
+     loaded module for this entry is map then
+
+     (2) If gen <= old_gen then do (nop).
+
+     (3) If old_gen < gen <= new_gen then
+         (3.1) if map != 0 then (set)
+         (3.2) if map == 0 then either (set) or (nop).
+
+     Note that (1) cannot be reliably detected, but since both actions
+     are valid it does not have to be.  Only (2) and (3.1) cases need
+     to be distinguished for which relaxed mo access of gen and map is
+     enough: their value is synchronized when it matters.
+
+     Note that a relaxed mo load may give an out-of-thin-air value since
+     it is used in decisions that can affect concurrent stores.  But this
+     should only happen if the OOTA value causes UB that justifies the
+     concurrent store of the value.  This is not expected to be an issue
+     in practice.  */
   struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
 
-  while (idx >= listp->len)
+  if (dtv[0].counter < new_gen)
     {
-      idx -= listp->len;
-      listp = listp->next;
-    }
-
-  if (dtv[0].counter < listp->slotinfo[idx].gen)
-    {
-      /* CONCURRENCY NOTES:
-
-	 Here the dtv needs to be updated to new_gen generation count.
-
-	 This code may be called during TLS access when GL(dl_load_tls_lock)
-	 is not held.  In that case the user code has to synchronize with
-	 dlopen and dlclose calls of relevant modules.  A module m is
-	 relevant if the generation of m <= new_gen and dlclose of m is
-	 synchronized: a memory access here happens after the dlopen and
-	 before the dlclose of relevant modules.  The dtv entries for
-	 relevant modules need to be updated, other entries can be
-	 arbitrary.
-
-	 This e.g. means that the first part of the slotinfo list can be
-	 accessed race free, but the tail may be concurrently extended.
-	 Similarly relevant slotinfo entries can be read race free, but
-	 other entries are racy.  However updating a non-relevant dtv
-	 entry does not affect correctness.  For a relevant module m,
-	 max_modid >= modid of m.  */
-      size_t new_gen = listp->slotinfo[idx].gen;
       size_t total = 0;
       size_t max_modid  = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
       assert (max_modid >= req_modid);
@@ -778,31 +778,33 @@ _dl_update_slotinfo (unsigned long int req_modid)
 	    {
 	      size_t modid = total + cnt;
 
-	      /* Later entries are not relevant.  */
+	      /* Case (1) for all later modids.  */
 	      if (modid > max_modid)
 		break;
 
 	      size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen);
 
+	      /* Case (1).  */
 	      if (gen > new_gen)
-		/* Not relevant.  */
 		continue;
 
-	      /* If the entry is older than the current dtv layout we
-		 know we don't have to handle it.  */
+	      /* Case (2) or (1).  */
 	      if (gen <= dtv[0].counter)
 		continue;
 
+	      /* Case (3) or (1).  */
+
 	      /* If there is no map this means the entry is empty.  */
 	      struct link_map *map
 		= atomic_load_relaxed (&listp->slotinfo[cnt].map);
 	      /* Check whether the current dtv array is large enough.  */
 	      if (dtv[-1].counter < modid)
 		{
+		  /* Case (3.2) or (1).  */
 		  if (map == NULL)
 		    continue;
 
-		  /* Resize the dtv.  */
+		  /* Resizing the dtv aborts on failure: bug 16134.  */
 		  dtv = _dl_resize_dtv (dtv, max_modid);
 
 		  assert (modid <= dtv[-1].counter);
@@ -813,7 +815,7 @@ _dl_update_slotinfo (unsigned long int req_modid)
 		}
 
 	      /* If there is currently memory allocate for this
-		 dtv entry free it.  */
+		 dtv entry free it.  Note: this is not AS-safe.  */
 	      /* XXX Ideally we will at some point create a memory
 		 pool.  */
 	      free (dtv[modid].pointer.to_free);
@@ -908,9 +910,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)
 
 static struct link_map *
 __attribute_noinline__
-update_get_addr (GET_ADDR_ARGS)
+update_get_addr (GET_ADDR_ARGS, size_t gen)
 {
-  struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE);
+  struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen);
   dtv_t *dtv = THREAD_DTV ();
 
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -940,12 +942,17 @@ __tls_get_addr (GET_ADDR_ARGS)
   dtv_t *dtv = THREAD_DTV ();
 
   /* Update is needed if dtv[0].counter < the generation of the accessed
-     module.  The global generation counter is used here as it is easier
-     to check.  Synchronization for the relaxed MO access is guaranteed
-     by user code, see CONCURRENCY NOTES in _dl_update_slotinfo.  */
+     module, but the global generation counter is easier to check (which
+     must be synchronized up to the generation of the accessed module by
+     user code doing the TLS access so relaxed mo read is enough).  */
   size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
   if (__glibc_unlikely (dtv[0].counter != gen))
-    return update_get_addr (GET_ADDR_PARAM);
+    {
+      /* Update DTV up to the global generation, see CONCURRENCY NOTES
+         in _dl_update_slotinfo.  */
+      gen = atomic_load_acquire (&GL(dl_tls_generation));
+      return update_get_addr (GET_ADDR_PARAM, gen);
+    }
 
   void *p = dtv[GET_ADDR_MODULE].pointer.val;