author    elliottk <elliottk@google.com>  2018-12-19 13:35:30 -0800
committer elliottk <elliottk@google.com>  2018-12-21 11:28:37 -0800
commit    a81768561ca1de2a947a45f224bf91d3913c1e2a (patch)
tree      479c90314c6975e77897c49c51e4ac4033945b8e
parent    c17d6997cd6babd8dbd805deb1f1d104c9324a36 (diff)
Improve accuracy of benchmarking
For small code regions, readtsc can give inaccurate results because it does
not account for out-of-order execution. Add x86_tsc_start and x86_tsc_end,
which account for this, following the white paper at
https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf

Using x86_tsc_start/end also adds several more instructions; I imagine this
is negligible.

Change-Id: I54a1c8fa7977c34bf91b422369c96f036c93a08a
-rw-r--r--  vpx_ports/x86.h | 60 +++++++++++++++++++++++++++++++++++-----
1 file changed, 55 insertions(+), 5 deletions(-)
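
For reference, the fencing technique the patch adopts comes straight from the
Paoloni white paper: execute cpuid before the first rdtsc so that earlier
instructions cannot slide into the timed region, and pair rdtscp with a
trailing cpuid for the second read. The sketch below is an illustrative
64-bit variant in GCC-style inline asm, not code from the patch; the names
fenced_tsc_start and fenced_tsc_end are made up for this example.

    #include <stdint.h>

    /* First read: cpuid drains the pipeline so earlier instructions cannot
     * be reordered into the timed region, then rdtsc samples the counter. */
    static inline uint64_t fenced_tsc_start(void) {
      uint32_t hi, lo;
      __asm__ __volatile__(
          "cpuid\n\t"
          "rdtsc\n\t"
          "mov %%edx, %0\n\t"
          "mov %%eax, %1\n\t"
          : "=r"(hi), "=r"(lo)
          :
          : "eax", "ebx", "ecx", "edx");
      return ((uint64_t)hi << 32) | lo;
    }

    /* Second read: rdtscp waits for the timed instructions to retire before
     * sampling, and the trailing cpuid keeps later instructions from
     * starting inside the measured interval. */
    static inline uint64_t fenced_tsc_end(void) {
      uint32_t hi, lo;
      __asm__ __volatile__(
          "rdtscp\n\t"
          "mov %%edx, %0\n\t"
          "mov %%eax, %1\n\t"
          "cpuid\n\t"
          : "=r"(hi), "=r"(lo)
          :
          : "eax", "ebx", "ecx", "edx");
      return ((uint64_t)hi << 32) | lo;
    }

The patch's x86_tsc_start/x86_tsc_end apply the same ordering with the
existing cpuid macro and 32-bit readtsc helpers, as shown in the diff below.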
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index f6aac1832..58eeb7b63 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -223,11 +223,26 @@ static INLINE int x86_simd_caps(void) {
return flags & mask;
}
-// Note:
-// 32-bit CPU cycle counter is light-weighted for most function performance
-// measurement. For large function (CPU time > a couple of seconds), 64-bit
-// counter should be used.
-// 32-bit CPU cycle counter
+// Fine-Grain Measurement Functions
+//
+// If you are timing a small region of code, access the timestamp counter
+// (TSC) via:
+//
+// unsigned int start = x86_tsc_start();
+// ...
+// unsigned int end = x86_tsc_end();
+// unsigned int diff = end - start;
+//
+// The start/end functions introduce a few more instructions than using
+// x86_readtsc directly, but they keep the CPU's out-of-order execution from
+// affecting the measurement (earlier/later instructions cannot be evaluated
+// inside the timed interval). See the white paper, "How to Benchmark Code
+// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures"
+// by Gabriele Paoloni for more information.
+//
+// If you are timing a large function (CPU time > a couple of seconds), use
+// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The
+// out-of-order leakage that can occur is minimal compared to total runtime.
static INLINE unsigned int x86_readtsc(void) {
#if defined(__GNUC__) && __GNUC__
unsigned int tsc;
@@ -264,6 +279,41 @@ static INLINE uint64_t x86_readtsc64(void) {
#endif
}
+// 32-bit CPU cycle counter with a partial fence against out-of-order execution.
+static INLINE unsigned int x86_readtscp(void) {
+#if defined(__GNUC__) && __GNUC__
+ unsigned int tscp;
+ __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) : : "ecx", "edx");
+ return tscp;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+ unsigned int tscp;
+ asm volatile("rdtscp\n\t" : "=a"(tscp) : : "ecx", "edx");
+ return tscp;
+#elif defined(_MSC_VER)
+ unsigned int ui;
+ return (unsigned int)__rdtscp(&ui);
+#else
+#if ARCH_X86_64
+ unsigned int aux; return (unsigned int)__rdtscp(&aux);
+#else
+ __asm rdtscp;
+#endif
+#endif
+}
+
+static INLINE unsigned int x86_tsc_start(void) {
+ unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+ cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ return x86_readtsc();
+}
+
+static INLINE unsigned int x86_tsc_end(void) {
+ uint32_t v = x86_readtscp();
+ unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+ cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ return v;
+}
+
#if defined(__GNUC__) && __GNUC__
#define x86_pause_hint() __asm__ __volatile__("pause \n\t")
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
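
A note on using the new helpers: the comment in the patch shows a single
start/end pair, but TSC samples are noisy (interrupts, timer ticks, frequency
ramping), so a small region is usually sampled many times and the minimum
kept. A minimal sketch, assuming vpx_ports/x86.h from this patch is on the
include path; the region under test (do_small_region) and the iteration count
are hypothetical:

    #include <stdio.h>
    #include "vpx_ports/x86.h"

    /* Hypothetical small region being benchmarked. */
    static void do_small_region(void) { /* ... code under test ... */ }

    int main(void) {
      unsigned int best = ~0u;
      int i;
      /* Repeat the measurement and keep the minimum; the smallest sample is
       * the one least disturbed by interrupts and other system noise. */
      for (i = 0; i < 1000; ++i) {
        unsigned int start, end;
        start = x86_tsc_start();
        do_small_region();
        end = x86_tsc_end();
        if (end - start < best) best = end - start;
      }
      printf("~%u cycles (minimum of 1000 samples)\n", best);
      return 0;
    }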