Diffstat (limited to 'nptl')
-rw-r--r--   nptl/pthread_spin_init.c    |  3
-rw-r--r--   nptl/pthread_spin_lock.c    | 71
-rw-r--r--   nptl/pthread_spin_trylock.c | 54
-rw-r--r--   nptl/pthread_spin_unlock.c  |  6
4 files changed, 100 insertions(+), 34 deletions(-)
diff --git a/nptl/pthread_spin_init.c b/nptl/pthread_spin_init.c
index 01dec5eea4..fe3091377e 100644
--- a/nptl/pthread_spin_init.c
+++ b/nptl/pthread_spin_init.c
@@ -22,6 +22,7 @@
 int
 pthread_spin_init (pthread_spinlock_t *lock, int pshared)
 {
-  *lock = 0;
+  /* Relaxed MO is fine because this is an initializing store.  */
+  atomic_store_relaxed (lock, 0);
   return 0;
 }
diff --git a/nptl/pthread_spin_lock.c b/nptl/pthread_spin_lock.c
index 4d03b7893a..682af80240 100644
--- a/nptl/pthread_spin_lock.c
+++ b/nptl/pthread_spin_lock.c
@@ -19,27 +19,35 @@
 #include <atomic.h>
 #include "pthreadP.h"
 
-/* A machine-specific version can define SPIN_LOCK_READS_BETWEEN_CMPXCHG
-   to the number of plain reads that it's optimal to spin on between uses
-   of atomic_compare_and_exchange_val_acq.  If spinning forever is optimal
-   then use -1.  If no plain reads here would ever be optimal, use 0.  */
-#ifndef SPIN_LOCK_READS_BETWEEN_CMPXCHG
-# warning machine-dependent file should define SPIN_LOCK_READS_BETWEEN_CMPXCHG
-# define SPIN_LOCK_READS_BETWEEN_CMPXCHG 1000
-#endif
-
 int
 pthread_spin_lock (pthread_spinlock_t *lock)
 {
-  /* atomic_exchange usually takes less instructions than
-     atomic_compare_and_exchange.  On the other hand,
-     atomic_compare_and_exchange potentially generates less bus traffic
-     when the lock is locked.
-     We assume that the first try mostly will be successful, and we use
-     atomic_exchange.  For the subsequent tries we use
-     atomic_compare_and_exchange.  */
-  if (atomic_exchange_acq (lock, 1) == 0)
+  int val = 0;
+
+  /* We assume that the first try mostly will be successful, thus we use
+     atomic_exchange if it is not implemented by a CAS loop (we also assume
+     that atomic_exchange can be faster if it succeeds, see
+     ATOMIC_EXCHANGE_USES_CAS).  Otherwise, we use a weak CAS and not an
+     exchange so we bail out after the first failed attempt to change the
+     state.  For the subsequent attempts we use atomic_compare_and_exchange
+     after we observe that the lock is not acquired.
+     See also comment in pthread_spin_trylock.
+     We use acquire MO to synchronize-with the release MO store in
+     pthread_spin_unlock, and thus ensure that prior critical sections
+     happen-before this critical section.  */
+#if ! ATOMIC_EXCHANGE_USES_CAS
+  /* Try to acquire the lock with an exchange instruction as this architecture
+     has such an instruction and we assume it is faster than a CAS.
+     The acquisition succeeds if the lock is not in an acquired state.  */
+  if (__glibc_likely (atomic_exchange_acquire (lock, 1) == 0))
     return 0;
+#else
+  /* Try to acquire the lock with a CAS instruction as this architecture
+     has no exchange instruction.  The acquisition succeeds if the lock is not
+     acquired.  */
+  if (__glibc_likely (atomic_compare_exchange_weak_acquire (lock, &val, 1)))
+    return 0;
+#endif
 
   do
     {
@@ -47,23 +55,26 @@ pthread_spin_lock (pthread_spinlock_t *lock)
 	 to cmpxchg is not a good idea on many targets as that will force
 	 expensive memory synchronizations among processors and penalize other
 	 running threads.
-	 On the other hand, we do want to update memory state on the local core
-	 once in a while to avoid spinning indefinitely until some event that
-	 will happen to update local memory as a side-effect.  */
-      if (SPIN_LOCK_READS_BETWEEN_CMPXCHG >= 0)
+	 There is no technical reason for throwing in a CAS every now and then,
+	 and so far we have no evidence that it can improve performance.
+	 If that would be the case, we have to adjust other spin-waiting loops
+	 elsewhere, too!
+	 Thus we use relaxed MO reads until we observe the lock to not be
+	 acquired anymore.  */
+      do
 	{
-	  int wait = SPIN_LOCK_READS_BETWEEN_CMPXCHG;
+	  /* TODO Back-off.  */
 
-	  while (*lock != 0 && wait > 0)
-	    --wait;
-	}
-      else
-	{
-	  while (*lock != 0)
-	    ;
+	  atomic_spin_nop ();
+
+	  val = atomic_load_relaxed (lock);
 	}
+      while (val != 0);
+
+      /* We need acquire memory order here for the same reason as mentioned
+	 for the first try to lock the spinlock.  */
     }
-  while (atomic_compare_and_exchange_val_acq (lock, 1, 0) != 0);
+  while (!atomic_compare_exchange_weak_acquire (lock, &val, 1));
 
   return 0;
 }
diff --git a/nptl/pthread_spin_trylock.c b/nptl/pthread_spin_trylock.c
index 593bba3ed8..83921b06b8 100644
--- a/nptl/pthread_spin_trylock.c
+++ b/nptl/pthread_spin_trylock.c
@@ -23,5 +23,57 @@
 int
 pthread_spin_trylock (pthread_spinlock_t *lock)
 {
-  return atomic_exchange_acq (lock, 1) ? EBUSY : 0;
+  /* For the spin try lock, we have the following possibilities:
+
+     1) If we assume that trylock will most likely succeed in practice:
+     * We just do an exchange.
+
+     2) If we want to bias towards cases where trylock succeeds, but don't
+     rule out contention:
+     * If exchange is not implemented by a CAS loop, and exchange is faster
+     than CAS, do an exchange.
+     * If exchange is implemented by a CAS loop, use a weak CAS and not an
+     exchange so we bail out after the first failed attempt to change the state.
+
+     3) If we expect contention to be likely:
+     * If CAS always brings the cache line into an exclusive state even if the
+     spinlock is already acquired, then load the value first with
+     atomic_load_relaxed and test if lock is not acquired.  Then do 2).
+
+     We assume that 2) is the common case, and that this won't be slower than
+     1) in the common case.
+
+     We use acquire MO to synchronize-with the release MO store in
+     pthread_spin_unlock, and thus ensure that prior critical sections
+     happen-before this critical section.  */
+#if ! ATOMIC_EXCHANGE_USES_CAS
+  /* Try to acquire the lock with an exchange instruction as this architecture
+     has such an instruction and we assume it is faster than a CAS.
+     The acquisition succeeds if the lock is not in an acquired state.  */
+  if (atomic_exchange_acquire (lock, 1) == 0)
+    return 0;
+#else
+  /* Try to acquire the lock with a CAS instruction as this architecture
+     has no exchange instruction.  The acquisition succeeds if the lock is not
+     acquired.  */
+  do
+    {
+      int val = 0;
+      if (atomic_compare_exchange_weak_acquire (lock, &val, 1))
+	return 0;
+    }
+  /* atomic_compare_exchange_weak_acquire can fail spuriously.  Whereas
+     C++11 and C11 make it clear that trylock operations can fail spuriously,
+     POSIX does not explicitly specify this; it only specifies that failing
+     synchronization operations do not need to have synchronization effects
+     themselves, but a spurious failure is something that could contradict a
+     happens-before established earlier (e.g., that we need to observe that
+     the lock is acquired).  Therefore, we emulate a strong CAS by simply
+     checking with a relaxed MO load that the lock is really acquired before
+     returning EBUSY; the additional overhead this may cause is on the slow
+     path.  */
+  while (atomic_load_relaxed (lock) == 0);
+#endif
+
+  return EBUSY;
 }
diff --git a/nptl/pthread_spin_unlock.c b/nptl/pthread_spin_unlock.c
index 5fd73e578b..f83b69639a 100644
--- a/nptl/pthread_spin_unlock.c
+++ b/nptl/pthread_spin_unlock.c
@@ -23,7 +23,9 @@
 int
 pthread_spin_unlock (pthread_spinlock_t *lock)
 {
-  atomic_full_barrier ();
-  *lock = 0;
+  /* The atomic_store_release synchronizes-with the atomic_exchange_acquire
+     or atomic_compare_exchange_weak_acquire in pthread_spin_lock /
+     pthread_spin_trylock.  */
+  atomic_store_release (lock, 0);
   return 0;
 }
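The memory-ordering scheme this patch introduces maps directly onto ISO C11
atomics.  As a reading aid, here is a minimal sketch of the new lock/unlock
algorithm written against <stdatomic.h> instead of glibc's internal atomic
macros; the my_spin_* names are hypothetical and not part of the patch, and
the ATOMIC_EXCHANGE_USES_CAS dispatch and the atomic_spin_nop () pause hint
are omitted for brevity.

#include <stdatomic.h>

typedef atomic_int my_spinlock_t;

static void
my_spin_lock (my_spinlock_t *lock)
{
  /* First try: acquire-MO exchange, assuming it mostly succeeds.  */
  if (atomic_exchange_explicit (lock, 1, memory_order_acquire) == 0)
    return;

  for (;;)
    {
      /* Spin using relaxed-MO loads only; issuing writes or CASes while
	 the lock is held would force cache-line transfers between cores.  */
      while (atomic_load_explicit (lock, memory_order_relaxed) != 0)
	;

      /* The lock was observed to be free; try to grab it.  Acquire MO
	 synchronizes-with the release-MO store in my_spin_unlock, so the
	 previous critical section happens-before this one.  */
      int expected = 0;
      if (atomic_compare_exchange_weak_explicit (lock, &expected, 1,
						 memory_order_acquire,
						 memory_order_relaxed))
	return;
    }
}

static void
my_spin_unlock (my_spinlock_t *lock)
{
  /* Release-MO store; no full barrier is needed.  */
  atomic_store_explicit (lock, 0, memory_order_release);
}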
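The strong-CAS emulation in pthread_spin_trylock can be sketched the same
way.  Under the same assumptions as above (hypothetical my_* names, C11
atomics standing in for glibc's macros), the loop retries a weak CAS until
it either succeeds or a relaxed load confirms the lock really is held, so a
spurious CAS failure is never reported to the caller as EBUSY.

#include <errno.h>

static int
my_spin_trylock (my_spinlock_t *lock)
{
  /* A weak CAS may fail spuriously even when the lock is free, so keep
     retrying as long as a relaxed load still observes the lock as free.  */
  do
    {
      int expected = 0;
      if (atomic_compare_exchange_weak_explicit (lock, &expected, 1,
						 memory_order_acquire,
						 memory_order_relaxed))
	return 0;
    }
  while (atomic_load_explicit (lock, memory_order_relaxed) == 0);

  /* The lock was genuinely observed as acquired.  */
  return EBUSY;
}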
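From the caller's perspective nothing changes: the acquire/release pairing
still guarantees that writes made under the lock are visible to the next
owner.  A small, hypothetical test program using the public API (not part of
the patch; build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t lock;
static long counter;

static void *
worker (void *arg)
{
  for (int i = 0; i < 1000000; i++)
    {
      pthread_spin_lock (&lock);
      /* The increment is ordered by the acquire/release operations in
	 pthread_spin_lock and pthread_spin_unlock.  */
      counter++;
      pthread_spin_unlock (&lock);
    }
  return NULL;
}

int
main (void)
{
  pthread_t t1, t2;
  pthread_spin_init (&lock, PTHREAD_PROCESS_PRIVATE);
  pthread_create (&t1, NULL, worker, NULL);
  pthread_create (&t2, NULL, worker, NULL);
  pthread_join (t1, NULL);
  pthread_join (t2, NULL);
  /* Always prints 2000000 if the synchronization is correct.  */
  printf ("%ld\n", counter);
  pthread_spin_destroy (&lock);
  return 0;
}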