aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--manual/tunables.texi6
-rw-r--r--sysdeps/x86/cacheinfo.c16
2 files changed, 16 insertions, 6 deletions
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 23ef0d40e7..d72d7a5ec0 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -432,7 +432,11 @@ set shared cache size in bytes for use in memory and string routines.
@deftp Tunable glibc.cpu.x86_non_temporal_threshold
The @code{glibc.cpu.x86_non_temporal_threshold} tunable allows the user
-to set threshold in bytes for non temporal store.
+to set threshold in bytes for non temporal store. Non temporal stores
+give a hint to the hardware to move data directly to memory without
+displacing other data from the cache. This tunable is used by some
+platforms to determine when to use non temporal stores in operations
+like memmove and memcpy.
This tunable is specific to i386 and x86-64.
@end deftp
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 217c21c34f..dadec5d58f 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -854,14 +854,20 @@ init_cacheinfo (void)
__x86_shared_cache_size = shared;
}
- /* The large memcpy micro benchmark in glibc shows that 6 times of
- shared cache size is the approximate value above which non-temporal
- store becomes faster on a 8-core processor. This is the 3/4 of the
- total shared cache size. */
+ /* The default setting for the non_temporal threshold is 3/4 of one
+ thread's share of the chip's cache. For most Intel and AMD processors
+ with an initial release date between 2017 and 2020, a thread's typical
+ share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
+ threshold leaves 125 KBytes to 500 KBytes of the thread's data
+ in cache after a maximum temporal copy, which will maintain
+ in cache a reasonable portion of the thread's stack and other
+ active data. If the threshold is set higher than one thread's
+ share of the cache, it has a substantial risk of negatively
+ impacting the performance of other threads running on the chip. */
__x86_shared_non_temporal_threshold
= (cpu_features->non_temporal_threshold != 0
? cpu_features->non_temporal_threshold
- : __x86_shared_cache_size * threads * 3 / 4);
+ : __x86_shared_cache_size * 3 / 4);
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
unsigned int minimum_rep_movsb_threshold;