diff options
-rw-r--r-- | manual/tunables.texi | 6 | ||||
-rw-r--r-- | sysdeps/x86/cacheinfo.c | 16 |
2 files changed, 16 insertions, 6 deletions
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 23ef0d40e7..d72d7a5ec0 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -432,7 +432,11 @@ set shared cache size in bytes for use in memory and string routines.
 
 @deftp Tunable glibc.cpu.x86_non_temporal_threshold
 The @code{glibc.cpu.x86_non_temporal_threshold} tunable allows the user
-to set threshold in bytes for non temporal store.
+to set threshold in bytes for non temporal store. Non temporal stores
+give a hint to the hardware to move data directly to memory without
+displacing other data from the cache. This tunable is used by some
+platforms to determine when to use non temporal stores in operations
+like memmove and memcpy.
 
 This tunable is specific to i386 and x86-64.
 @end deftp
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 217c21c34f..dadec5d58f 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -854,14 +854,20 @@ init_cacheinfo (void)
       __x86_shared_cache_size = shared;
     }
 
-  /* The large memcpy micro benchmark in glibc shows that 6 times of
-     shared cache size is the approximate value above which non-temporal
-     store becomes faster on a 8-core processor. This is the 3/4 of the
-     total shared cache size. */
+  /* The default setting for the non_temporal threshold is 3/4 of one
+     thread's share of the chip's cache. For most Intel and AMD processors
+     with an initial release date between 2017 and 2020, a thread's typical
+     share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
+     threshold leaves 125 KBytes to 500 KBytes of the thread's data
+     in cache after a maximum temporal copy, which will maintain
+     in cache a reasonable portion of the thread's stack and other
+     active data. If the threshold is set higher than one thread's
+     share of the cache, it has a substantial risk of negatively
+     impacting the performance of other threads running on the chip. */
   __x86_shared_non_temporal_threshold
     = (cpu_features->non_temporal_threshold != 0
        ? cpu_features->non_temporal_threshold
-       : __x86_shared_cache_size * threads * 3 / 4);
+       : __x86_shared_cache_size * 3 / 4);
 
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
   unsigned int minimum_rep_movsb_threshold;