forked from bkase/CUDA-grep
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cycleTimer.h
177 lines (161 loc) · 5.4 KB
/
cycleTimer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#ifndef _SYRAH_CYCLE_TIMER_H_
#define _SYRAH_CYCLE_TIMER_H_
#if defined(__APPLE__)
#if defined(__x86_64__)
#include <sys/sysctl.h>
#else
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif // __x86_64__ or not
#include <stdio.h> // fprintf
#include <stdlib.h> // exit
#elif _WIN32
# include <windows.h>
# include <time.h>
#else
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <sys/time.h>
#endif
// This uses the cycle counter of the processor. Different
// processors in the system will have different values for this. If
// your process moves across processors, then the delta time you
// measure will likely be incorrect. This is mostly for fine-
// grained measurements where the process is likely to stay on the
// same processor. For more global things you should use the
// Time interface.
// Also note that if your processors' speeds change (i.e. frequency
// scaling) or if you are in a heterogeneous environment, you will
// likely get spurious results.
// Static-only helper for fine-grained timing.
//
// currentTicks() returns a raw tick count whose unit depends on the
// platform (see tickUnits()); secondsPerTick() returns the factor that
// converts those ticks to seconds. The class cannot be instantiated:
// its only constructor is private and never defined.
class CycleTimer {
public:
    typedef unsigned long long SysClock;

    //////////
    // Return the current CPU time, in terms of clock ticks.
    // Time zero is at some arbitrary point in the past.
    //
    // Source of the value, by platform:
    //   Apple, non-x86_64 : mach_absolute_time()
    //   Windows           : QueryPerformanceCounter()
    //   x86_64            : the rdtsc instruction
    //   anything else     : clock_gettime(CLOCK_THREAD_CPUTIME_ID), in ns
    static SysClock currentTicks() {
#if defined(__APPLE__) && !defined(__x86_64__)
        return mach_absolute_time();
#elif defined(_WIN32)
        LARGE_INTEGER qwTime;
        QueryPerformanceCounter(&qwTime);
        return qwTime.QuadPart;
#elif defined(__x86_64__)
        unsigned int a, d;
        asm volatile("rdtsc" : "=a" (a), "=d" (d));
        return static_cast<unsigned long long>(a) |
               (static_cast<unsigned long long>(d) << 32);
#elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
        unsigned int val;
        asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
        return val;
#else
        // Zero-initialized so a failed clock_gettime() yields 0 rather
        // than reading indeterminate memory.
        timespec spec = timespec();
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
        // Integer arithmetic on purpose: going through float (24-bit
        // mantissa) would throw away the nanosecond resolution.
        return static_cast<SysClock>(spec.tv_sec) * 1000000000ULL +
               static_cast<SysClock>(spec.tv_nsec);
#endif
    }

    //////////
    // Return the current CPU time, in terms of seconds.
    // This is slower than currentTicks(). Time zero is at
    // some arbitrary point in the past.
    static double currentSeconds() {
        return currentTicks() * secondsPerTick();
    }

    //////////
    // Return the conversion from seconds to ticks.
    static double ticksPerSecond() {
        return 1.0 / secondsPerTick();
    }

    //////////
    // Return a short label for the unit of currentTicks():
    // "cycles" on Windows and on x86_64, "ns" everywhere else.
    static const char* tickUnits() {
#if defined(__APPLE__) && !defined(__x86_64__)
        return "ns";
#elif defined(_WIN32) || defined(__WIN32__) || defined(__x86_64__)
        // _WIN32 is the macro currentTicks()/secondsPerTick() test;
        // __WIN32__ is kept for toolchains that only define that one.
        return "cycles";
#else
        return "ns"; // clock_gettime
#endif
    }

    //////////
    // Return the conversion from ticks to seconds.
    //
    // The factor is computed on the first call and cached in a
    // function-local static; later calls return the cached value.
    // On Apple x86_64 a failing sysctl, and on x86_64 elsewhere a
    // missing /proc/cpuinfo, prints a message to stderr and calls
    // exit(-1).
    static double secondsPerTick() {
        static bool initialized = false;
        static double secondsPerTick_val;
        if (initialized) return secondsPerTick_val;
#if defined(__APPLE__)
#ifdef __x86_64__
        int args[] = {CTL_HW, HW_CPU_FREQ};
        unsigned int Hz = 0;
        size_t len = sizeof(Hz);
        // A reported frequency of 0 is treated as failure too, since it
        // would otherwise produce a division by zero below.
        if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0 || Hz == 0) {
            fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
            exit(-1);
        }
        secondsPerTick_val = 1.0 / (double) Hz;
#else
        mach_timebase_info_data_t time_info;
        mach_timebase_info(&time_info);
        // numer/denom scales ticks to nanoseconds; 1e-9 takes that to seconds.
        secondsPerTick_val = (1e-9 * static_cast<double>(time_info.numer)) /
                             static_cast<double>(time_info.denom);
#endif // x86_64 or not
#elif defined(_WIN32)
        LARGE_INTEGER qwTicksPerSec;
        QueryPerformanceFrequency(&qwTicksPerSec);
        secondsPerTick_val = 1.0 / static_cast<double>(qwTicksPerSec.QuadPart);
#elif defined(__x86_64__)
        // Ticks come from rdtsc, so the scale is the CPU frequency,
        // which is read from /proc/cpuinfo.
        FILE *fp = fopen("/proc/cpuinfo", "r");
        char input[1024];
        if (!fp) {
            fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
            exit(-1);
        }
        // Fallback used when no frequency can be parsed from the file.
        secondsPerTick_val = 1e-9;
        while (fgets(input, sizeof(input), fp)) {
            // NOTE(boulos): Because reading cpuinfo depends on dynamic
            // frequency scaling it's better to read the @ sign first
            double GHz = 0.0, MHz = 0.0;
            if (strstr(input, "model name")) {
                char* at_sign = strstr(input, "@");
                if (at_sign) {
                    char* after_at = at_sign + 1;
                    char* GHz_str = strstr(after_at, "GHz");
                    char* MHz_str = strstr(after_at, "MHz");
                    if (GHz_str) {
                        // Cut the line at the unit so only the number is parsed.
                        *GHz_str = '\0';
                        if (1 == sscanf(after_at, "%lf", &GHz) && GHz > 0.0) {
                            secondsPerTick_val = 1e-9 / GHz;
                            break;
                        }
                    } else if (MHz_str) {
                        *MHz_str = '\0';
                        if (1 == sscanf(after_at, "%lf", &MHz) && MHz > 0.0) {
                            // Divide by the parsed MHz value (the GHz
                            // variable is never set on this path).
                            secondsPerTick_val = 1e-6 / MHz;
                            break;
                        }
                    }
                }
            } else if (1 == sscanf(input, "cpu MHz : %lf", &MHz) && MHz > 0.0) {
                secondsPerTick_val = 1e-6 / MHz;
                break;
            }
        }
        fclose(fp);
#else
        // currentTicks() reports nanoseconds from clock_gettime() on
        // these targets, so the scale is fixed and independent of the
        // CPU frequency (e.g. on the N900).
        secondsPerTick_val = 1e-9;
#endif
        initialized = true;
        return secondsPerTick_val;
    }

    //////////
    // Return the conversion from ticks to milliseconds.
    static double msPerTick() {
        return secondsPerTick() * 1000.0;
    }

private:
    // Declared but never defined: the class is a pure static utility.
    CycleTimer();
};
#endif // #ifndef _SYRAH_CYCLE_TIMER_H_