diff options
author | elliottk <elliottk@google.com> | 2018-12-19 13:35:30 -0800 |
---|---|---|
committer | elliottk <elliottk@google.com> | 2018-12-21 11:28:37 -0800 |
commit | a81768561ca1de2a947a45f224bf91d3913c1e2a (patch) | |
tree | 479c90314c6975e77897c49c51e4ac4033945b8e | |
parent | c17d6997cd6babd8dbd805deb1f1d104c9324a36 (diff) | |
download | libvpx-a81768561ca1de2a947a45f224bf91d3913c1e2a.tar libvpx-a81768561ca1de2a947a45f224bf91d3913c1e2a.tar.gz libvpx-a81768561ca1de2a947a45f224bf91d3913c1e2a.tar.bz2 libvpx-a81768561ca1de2a947a45f224bf91d3913c1e2a.zip |
Improve accuracy of benchmarking
For small code regions, readtsc can give inaccurate results because it does
not account for out-of-order execution. Add x86_tsc_start and x86_tsc_end
that account for this, according to the white paper at
https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
Using x86_tsc_start/end will also add in several more instructions; I imagine
this is negligible.
Change-Id: I54a1c8fa7977c34bf91b422369c96f036c93a08a
-rw-r--r-- | vpx_ports/x86.h | 60 |
1 file changed, 55 insertions, 5 deletions
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index f6aac1832..58eeb7b63 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -223,11 +223,26 @@ static INLINE int x86_simd_caps(void) { return flags & mask; } -// Note: -// 32-bit CPU cycle counter is light-weighted for most function performance -// measurement. For large function (CPU time > a couple of seconds), 64-bit -// counter should be used. -// 32-bit CPU cycle counter +// Fine-Grain Measurement Functions +// +// If you are timing a small region of code, access the timestamp counter +// (TSC) via: +// +// unsigned int start = x86_tsc_start(); +// ... +// unsigned int end = x86_tsc_end(); +// unsigned int diff = end - start; +// +// The start/end functions introduce a few more instructions than using +// x86_readtsc directly, but prevent the CPU's out-of-order execution from +// affecting the measurement (by having earlier/later instructions be evaluated +// in the time interval). See the white paper, "How to Benchmark Code +// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by +// Gabriele Paoloni for more information. +// +// If you are timing a large function (CPU time > a couple of seconds), use +// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The +// out-of-order leakage that can occur is minimal compared to total runtime. static INLINE unsigned int x86_readtsc(void) { #if defined(__GNUC__) && __GNUC__ unsigned int tsc; @@ -264,6 +279,41 @@ static INLINE uint64_t x86_readtsc64(void) { #endif } +// 32-bit CPU cycle counter with a partial fence against out-of-order execution. 
+static INLINE unsigned int x86_readtscp(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tscp; + __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tscp; + asm volatile("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(_MSC_VER) + unsigned int ui; + return (unsigned int)__rdtscp(&ui); +#else +#if ARCH_X86_64 + return (unsigned int)__rdtscp(); +#else + __asm rdtscp; +#endif +#endif +} + +static INLINE unsigned int x86_tsc_start(void) { + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return x86_readtsc(); +} + +static INLINE unsigned int x86_tsc_end(void) { + uint32_t v = x86_readtscp(); + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return v; +} + #if defined(__GNUC__) && __GNUC__ #define x86_pause_hint() __asm__ __volatile__("pause \n\t") #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) |