-rw-r--r-- 3691 libcpucycles-20230115/cpucycles/default-perfevent.c
// version 20230106
// public domain
// djb
// adapted from supercop/cpucycles/perfevent.c
// 20230106 djb: read() into int64_t instead of long long
// 20230106 djb: add comment on RUNNING/ENABLED

/*
This code intentionally avoids dividing by the
PERF_FORMAT_TOTAL_TIME_RUNNING/ENABLED ratio.
The motivation for that ratio is as follows:

 * A typical CPU has a limited number of performance-monitoring
   counters active at once. For example, there are 8 "programmable"
   counters on Intel Skylake.

 * "perf stat" allows the user to enable more counters. The OS kernel
   periodically (e.g., every millisecond) changes the limited number
   of active hardware counters to a new subset of the enabled
   counters, and "perf stat" reports
   PERF_FORMAT_TOTAL_TIME_RUNNING/ENABLED for each counter, the
   fraction of time spent with that counter running.

For long-running programs, dividing the hardware counter by
RUNNING/ENABLED usually produces a reasonable estimate of what the
count would have been without competition from other counters.

A fixable problem with this multiplexing of counters is that the
kernel appears to simply cycle through counters, so unlucky programs
can trigger moiré effects. The fix is to select random subsets of
counters.

A more fundamental problem is that cpucycles() has to be usable for
timing short subroutines, including subroutines so short that the OS
has no opportunity to change from one selection of counters to
another. Say RUNNING is 0; should cpucycles() then divide by 0?

If a caller runs cpucycles(), X(), cpucycles(), X(), etc., and the
cycle counter happens to be enabled for only 80% of the runs of X(),
then simply computing the median difference of adjacent cycle counts,
with no scaling, will filter out the zeros and correctly compute the
cost of X. Averages won't (without scaling), but averages have other
problems, such as being heavily influenced by interrupts. (Omitting
kernel time from perf results does not remove the influence of
interrupts on caches.)

Given the importance of cycle counting, it is better to have cycle
counters always running. For example, on Skylake, Intel provides the
8 "programmable" counters on top of a separate cycle counter ("fixed
counter 1"), so there is no good reason for the kernel to waste a
"programmable" counter on a cycle counter, there is no good reason to
turn the cycle counter off, and there is no good reason for RUNNING
to be below ENABLED for the cycle counter.

Of course, applications that use just one performance counter at a
time don't have to worry about kernels getting this wrong, and don't
have to worry about the possibility of getting noisy or invalid
results on CPUs that have heavier constraints on the number of
simultaneous counters.
*/
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include "cpucycles_internal.h"

static int fddev = -1;

long long ticks(void)
{
  int64_t result;
  /* compare with != rather than <: read() returns ssize_t, and a -1
     error return converts to a huge unsigned value that "< sizeof
     result" would never catch, returning uninitialized stack data
     instead of 0 */
  if (read(fddev,&result,sizeof result) != sizeof result) return 0;
  return result;
}

long long ticks_setup(void)
{
  if (fddev == -1) {
    static struct perf_event_attr attr;
    memset(&attr,0,sizeof attr);
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(struct perf_event_attr);
    attr.config = PERF_COUNT_HW_CPU_CYCLES;
    attr.disabled = 1;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;
    /* count cycles for this process on whatever CPU it runs on */
    fddev = syscall(__NR_perf_event_open,&attr,0,-1,-1,0);
    if (fddev == -1) return cpucycles_SKIP;
    ioctl(fddev,PERF_EVENT_IOC_RESET,0);
    ioctl(fddev,PERF_EVENT_IOC_ENABLE,0);
  }
  return cpucycles_MAYBECYCLECOUNTER;
}
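
The median-filtering argument in the file comment is straightforward to try out against the library's public API. The following sketch is not part of default-perfevent.c; it assumes libcpucycles is installed (so that <cpucycles.h> and -lcpucycles are available), and X(), TIMINGS, and cmp are hypothetical names introduced here for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <cpucycles.h>

#define TIMINGS 63 /* hypothetical sample count; odd, so the median is one element */

static void X(void) /* hypothetical function being timed */
{
  /* ... put the code being measured here ... */
}

static int cmp(const void *a,const void *b)
{
  long long x = *(const long long *) a;
  long long y = *(const long long *) b;
  return (x > y) - (x < y);
}

int main(void)
{
  long long t[TIMINGS + 1];
  long long d[TIMINGS];
  int i;

  /* cpucycles(), X(), cpucycles(), X(), ...: each adjacent difference
     covers one run of X() plus the timing overhead */
  for (i = 0;i < TIMINGS;++i) {
    t[i] = cpucycles();
    X();
  }
  t[TIMINGS] = cpucycles();

  for (i = 0;i < TIMINGS;++i) d[i] = t[i + 1] - t[i];

  /* differences taken while the counter was switched out stay at 0,
     sort to the front, and never reach the median as long as the
     counter is running most of the time */
  qsort(d,TIMINGS,sizeof d[0],cmp);
  printf("median cycles for X(): %lld\n",d[TIMINGS / 2]);
  return 0;
}

Compile with something like "cc demo.c -lcpucycles". This is exactly the scenario the comment describes: even if the counter is off for 20% of the runs, the zero differences land in the bottom fifth of the sorted array, so the middle element is still a genuine timing with no RUNNING/ENABLED scaling needed.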
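
For contrast, here is a minimal sketch of how the RUNNING/ENABLED pair discussed in the comment becomes visible to a perf_event user: requesting PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING in read_format makes each read() return those two times alongside the count, per perf_event_open(2). This is standard kernel-API usage, not libcpucycles code; the busy loop is a stand-in workload.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
  struct perf_event_attr attr;
  /* read() layout for this read_format, per perf_event_open(2) */
  struct { uint64_t value, time_enabled, time_running; } r;
  volatile uint64_t x = 0;
  uint64_t i;
  int fd;

  memset(&attr,0,sizeof attr);
  attr.type = PERF_TYPE_HARDWARE;
  attr.size = sizeof attr;
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED
                   | PERF_FORMAT_TOTAL_TIME_RUNNING;
  attr.exclude_kernel = 1;
  attr.exclude_hv = 1;

  fd = syscall(__NR_perf_event_open,&attr,0,-1,-1,0);
  if (fd == -1) return 1;

  for (i = 0;i < 100000000;++i) x += i; /* stand-in workload */

  if (read(fd,&r,sizeof r) != sizeof r) return 1;
  printf("cycles %llu, running/enabled %llu/%llu\n",
    (unsigned long long) r.value,
    (unsigned long long) r.time_running,
    (unsigned long long) r.time_enabled);
  return 0;
}

On a lightly loaded system with only this one counter open, time_running typically equals time_enabled, which is the easy case the comment's final paragraph describes; a gap between the two appears once enabled counters outnumber the hardware's simultaneous-counter budget.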