Here's a utility that I've been using to examine frequency
and idle residency on Nehalem systems. If others find it
useful, I can add it to the pmutils package.
See comments at top of source for usage and examples.
$ cc turbostat.c -o turbostat
cheers,
-Len Brown, Intel Open Source Technology Center
---
/*
* turbostat -- show CPU frequency and C-state residency
* on modern Intel turbo-capable processors.
*
* Copyright (c) 2010, Intel Corporation.
* Len Brown <len.brown@xxxxxxxxx>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
/*
* Works properly on Nehalem and newer processors, since
* Nehalem features an always-running TSC, plus hardware
* C-state residency MSRs.
*
* Works poorly on systems before Nehalem with
* a TSC that stops in deep C-states.
*
* Works properly on Linux-2.6.30 and later.
* Works poorly on Linux-2.6.29 and earlier, as acpi-cpufreq
* used to clear APERF/MPERF counters on access.
*
* APERF, MPERF count non-halted cycles.
* Although it is not guaranteed by the architecture, we assume
* here that they count at TSC rate, which is true for Nehalem.
*
* References:
* "Intel® Turbo Boost Technology
* in Intel® Core™ Microarchitecture (Nehalem) Based Processors"
* http://download.intel.com/design/processor/applnots/320354.pdf
*
* "Intel® 64 and IA-32 Architectures Software Developer's Manual
* Volume 3B: System Programming Guide"
* http://www.intel.com/products/processor/manuals/
*/
/*
* usage:
* # turbostat [-v] [-d] [-i interval_sec] [command [arg]...]
*
* examples:
*
* [root@nehalem lenb]# ./turbostat
* CPU GHz TSC %c0 %c1 %c3 %c6 %pc3 %pc6 %pc7
* 0 1.60 2.93 0.03 0.10 40.82 59.04 0.00 0.00 0.00
* 1 1.60 2.93 0.01 0.03 99.96 0.00 0.00 0.00 0.00
* 2 1.60 2.93 0.06 0.07 26.51 73.36 0.00 0.00 0.00
* 3 1.60 2.93 0.15 0.06 99.17 0.62 0.00 0.00 0.00
* 4 1.60 2.93 0.03 0.10 40.82 59.04 0.00 0.00 0.00
* 5 1.59 2.93 0.01 0.03 99.97 0.00 0.00 0.00 0.00
* 6 1.60 2.93 0.03 0.10 26.51 73.36 0.00 0.00 0.00
* 7 1.60 2.93 0.00 0.20 99.17 0.62 0.00 0.00 0.00
*
* Without any parameters, turbostat prints out counters every 5 seconds.
* (override interval with "-i sec" option).
*
* %c0 is the percent of the interval that the core retired instructions.
*
* GHz is the average clock rate while the core was in c0 state.
*
* TSC is the average GHz that the TSC ran during the entire interval.
*
* %c1, %c3, %c6 show the residency in hardware CPU core idle states.
* Note that these may not equal the software states requested by Linux.
*
* %pc3, %pc6, %pc7 show the residency in package idle C-states, which happen
* to be disabled by the BIOS for the stepping in this example.
*
* The "-v" option adds verbosity to the output: eg.
*
* CPUID GenuineIntel 11 levels family:model:stepping 6:26:4
* Nehalem multiplier 22, TSC frequency 2933 MHz
* Nehalem 4 cores active: 23 mult, max turbo frequency = 3067 MHz
* Nehalem 3 cores active: 23 mult, max turbo frequency = 3067 MHz
* Nehalem 2 cores active: 23 mult, max turbo frequency = 3067 MHz
* Nehalem 1 core active: 24 mult, max turbo frequency = 3200 MHz
*
* If a command is handed to turbostat, it will invoke that
* command and output the statistics gathered while that
* command was running. eg. Here a cycle soaker is run
* on 1 CPU (until ^C) while the others are mostly idle:
*
*[root@nehalem lenb]# ./turbostat cat /dev/zero > /dev/null
*^C CPU GHz TSC %c0 %c1 %c3 %c6 %pc3 %pc6 %pc7
* 0 3.04 2.93 1.67 5.65 8.54 84.14 0.00 0.00 0.00
* 1 2.26 2.93 0.03 0.22 99.75 0.00 0.00 0.00 0.00
* 2 2.86 2.93 0.06 99.91 0.01 0.02 0.00 0.00 0.00
* 3 3.03 2.93 6.06 5.37 88.55 0.03 0.00 0.00 0.00
* 4 2.94 2.93 0.67 6.65 8.54 84.14 0.00 0.00 0.00
* 5 2.59 2.93 0.08 0.17 99.74 0.01 0.00 0.00 0.00
* 6 3.16 2.93 99.46 0.51 0.00 0.03 0.00 0.00 0.00
* 7 1.60 2.93 0.01 11.42 88.55 0.03 0.00 0.00 0.00
*
* Above, the cycle soaker drives 1 CPU up to almost 3.2 GHz
* while the other processors are generally in various states of idle.
*
* Note that turbostat reads hardware counters, but doesn't write them.
* So it will not interfere with the OS or other programs, including
* multiple invocations of itself.
*
* turbostat depends on the Linux msr driver for /dev/cpu/.../msr
*/
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
/*
 * Model-specific register (MSR) addresses.
 * Each value is used as the file offset handed to get_msr(), which
 * pread()s it from /dev/cpu/N/msr.
 */
#define MSR_TSC				0x10	/* time-stamp counter */
#define MSR_NEHALEM_PLATFORM_INFO	0xCE	/* bits 15:8 printed as the TSC multiplier */
#define MSR_NEHALEM_TURBO_RATIO_LIMIT	0x1AD	/* one turbo ratio byte per active-core count */
#define MSR_APERF			0xE8	/* APERF, MPERF count non-halted cycles */
#define MSR_MPERF			0xE7
#define MSR_PKG_C3_RESIDENCY		0x3F8	/* package idle-state residency counters */
#define MSR_PKG_C6_RESIDENCY		0x3F9
#define MSR_PKG_C7_RESIDENCY		0x3FA
#define MSR_CORE_C3_RESIDENCY		0x3FC	/* core idle-state residency counters */
#define MSR_CORE_C6_RESIDENCY		0x3FD
/* Command-line options, filled in by cmdline(). */
unsigned int interval_sec = 5;	/* set with -i interval_sec */
unsigned int verbose;		/* set with -v */
unsigned int debug;		/* set with -d */

/*
 * Feature switches.  turbostat_init() sets do_c0/do_c1/do_c3/do_c6/do_pkg
 * to the value of do_nehalem, which do_cpuid() derives from is_nehalem().
 * skip_c0/skip_c1 are recomputed by every compute_delta() call and make
 * print_counters() show "****" instead of an unreliable percentage.
 */
unsigned int do_nehalem;
unsigned int do_c0, skip_c0;
unsigned int do_c1, skip_c1;
unsigned int do_c3;
unsigned int do_c6;
unsigned int do_pkg;
unsigned int do_aperf = 1;	/* TBD set with CPUID */

unsigned int iterations;	/* loop counter of turbostat_loop() */

/*
 * Divisor that turns cycles-per-second into the printed unit (GHz).
 * Written as an integer constant: the object is an unsigned int, so a
 * floating-point initializer only added an implicit conversion.
 */
unsigned int units = 1000000000;	/* Ghz etc */

int aperf_mperf_unstable;	/* latched by compute_delta() once APERF/MPERF went backwards */
int backwards_count;		/* not referenced by the rest of this file */
char *progname;			/* argv[0], used as prefix in diagnostics */

#define MAX_CPUS 16		/* TBD: make dynamic */
int num_cpus;			/* number of CPUs whose msr device was opened */
int fd_msr[MAX_CPUS];		/* open /dev/cpu/N/msr descriptor, indexed by CPU */
/*
 * One snapshot (or one delta) of the counters tracked for a single CPU.
 * Every field is a raw cycle count; print_counters() converts them to
 * GHz and percentages.
 */
struct per_core_counters {
	unsigned long long tsc;		/* MSR_TSC */
	unsigned long long c1;		/* not read from an MSR; derived in compute_delta() */
	unsigned long long c3;		/* MSR_CORE_C3_RESIDENCY */
	unsigned long long c6;		/* MSR_CORE_C6_RESIDENCY */
	unsigned long long aperf;	/* MSR_APERF */
	unsigned long long mperf;	/* MSR_MPERF */
	unsigned long long pc3;		/* MSR_PKG_C3_RESIDENCY */
	unsigned long long pc6;		/* MSR_PKG_C6_RESIDENCY */
	unsigned long long pc7;		/* MSR_PKG_C7_RESIDENCY */
};

typedef struct per_core_counters PCC;
/*
 * Counter snapshots, indexed by CPU number.  The even and odd arrays are
 * filled alternately by get_counters(); compute_delta() writes the
 * difference between the two most recent snapshots into pcc_delta.
 */
PCC pcc_even[MAX_CPUS];
PCC pcc_odd[MAX_CPUS];
PCC pcc_delta[MAX_CPUS];
/*
 * Wall-clock time taken right after the matching snapshot, and the
 * elapsed time between the two (computed with timersub()).
 */
struct timeval tv_even;
struct timeval tv_odd;
struct timeval tv_delta;
/*
 * get_msr -- read one 64-bit MSR of a CPU.
 *
 * cpu:    index into fd_msr[], i.e. which /dev/cpu/N/msr descriptor to use
 * offset: MSR address, used as the pread() file offset
 *
 * Returns the value read.  A short or failed read is fatal: a diagnostic
 * is written to stderr and the process terminates with _exit(-2).
 */
unsigned long long get_msr(int cpu, off_t offset)
{
	unsigned long long value;
	ssize_t nread;

	nread = pread(fd_msr[cpu], &value, sizeof value, offset);
	if (nread != (ssize_t)sizeof value) {
		/*
		 * off_t and ssize_t are not int: convert/format them
		 * explicitly so the arguments match the conversions.
		 */
		fprintf(stderr, "pread cpu%d 0x%llx = %zd\n",
			cpu, (unsigned long long)offset, nread);
		_exit(-2);
	}
	return value;
}
/*
 * print_header -- write the column-title line to stderr.
 *
 * The CPU, GHz and TSC titles are always printed; each remaining title
 * is printed only when its do_* switch is set, so the header lines up
 * with the columns emitted by print_counters().
 */
void print_header()
{
	/* unconditional columns */
	fprintf(stderr, " CPU");
	fprintf(stderr, " GHz ");
	fprintf(stderr, " TSC ");

	/* core C-state columns */
	if (do_c0) {
		fprintf(stderr, " %%c0 ");
	}
	if (do_c1) {
		fprintf(stderr, " %%c1 ");
	}
	if (do_c3) {
		fprintf(stderr, " %%c3 ");
	}
	if (do_c6) {
		fprintf(stderr, " %%c6 ");
	}

	/* package C-state columns share a single switch */
	if (do_pkg) {
		fprintf(stderr, " %%pc3 ");
		fprintf(stderr, " %%pc6 ");
		fprintf(stderr, " %%pc7 ");
	}

	putc('\n', stderr);
}
/*
 * dump_counters -- debugging aid: print the raw counter values in hex.
 *
 * c: array of num_cpus counter records
 *
 * One line is written to stderr per counter (tsc, c3, c6, aperf, mperf,
 * pc3, pc6, pc7), each holding that counter's value for every CPU.
 * The derived c1 field is not dumped.
 */
void dump_counters(PCC *c)
{
	int cpu;

/* Emit "<label>" followed by the given field of every CPU, then newline. */
#define DUMP_ROW(label, field)					\
	do {							\
		fprintf(stderr, label);				\
		for (cpu = 0; cpu < num_cpus; ++cpu) {		\
			fprintf(stderr, "%llX ", c[cpu].field);	\
		}						\
		putc('\n', stderr);				\
	} while (0)

	DUMP_ROW("TSC: ", tsc);
	DUMP_ROW("c3: ", c3);
	DUMP_ROW("c6: ", c6);
	DUMP_ROW("aperf: ", aperf);
	DUMP_ROW("mperf: ", mperf);
	DUMP_ROW("pc3: ", pc3);
	DUMP_ROW("pc6: ", pc6);
	DUMP_ROW("pc7: ", pc7);

#undef DUMP_ROW
}
/*
 * print_counters -- print one table of per-CPU results to stderr.
 *
 * c: array of num_cpus counter deltas (as produced by compute_delta())
 *
 * The elapsed wall-clock time is taken from the global tv_delta.
 * For every CPU the row holds: CPU number, average GHz while not halted
 * (only when do_aperf), average TSC GHz, and then the percentage columns
 * enabled by the do_* switches.  Columns that cannot be trusted are
 * shown as "****".
 */
void print_counters(PCC *c)
{
	double interval_float = tv_delta.tv_sec + tv_delta.tv_usec/1000000.0;
	int cpu;

	if (debug)
		fprintf(stderr, "%.6f sec\n", interval_float);

	print_header();

	for (cpu = 0; cpu < num_cpus; ++cpu) {
		const PCC *p = &c[cpu];

		fprintf(stderr, "%4d", cpu);

		if (do_aperf) {
			/*
			 * Once APERF/MPERF have been seen going backwards,
			 * a delta larger than the TSC delta is bogus.
			 */
			if (aperf_mperf_unstable &&
			    (p->aperf > p->tsc || p->mperf > p->tsc)) {
				fprintf(stderr, " ****");
			} else {
				double ghz = 1.0 * p->tsc / units * p->aperf /
					p->mperf / interval_float;

				if (aperf_mperf_unstable)
					fprintf(stderr, "%6.1f*", ghz);
				else
					fprintf(stderr, "%7.2f", ghz);
			}
		}

		fprintf(stderr, "%7.2f", 1.0 * p->tsc/units/interval_float);

		if (do_c0) {
			if (skip_c0)
				fprintf(stderr, " ****");
			else
				fprintf(stderr, "%7.2f", 100.0 * p->mperf/p->tsc);
		}
		if (do_c1) {
			if (skip_c1)
				fprintf(stderr, " ****");
			else
				fprintf(stderr, "%7.2f", 100.0 * p->c1/p->tsc);
		}
		if (do_c3)
			fprintf(stderr, "%7.2f", 100.0 * p->c3/p->tsc);
		if (do_c6)
			fprintf(stderr, "%7.2f", 100.0 * p->c6/p->tsc);
		if (do_pkg) {
			fprintf(stderr, "%7.2f", 100.0 * p->pc3/p->tsc);
			fprintf(stderr, "%7.2f", 100.0 * p->pc6/p->tsc);
			fprintf(stderr, "%7.2f", 100.0 * p->pc7/p->tsc);
		}

		putc('\n', stderr);
	}
}
/*
 * SUBTRACT_COUNTER -- store (after - before) in delta; the expression
 * evaluates to non-zero when the counter went backwards (before > after).
 * Arguments are evaluated more than once: pass plain lvalues only.
 */
#define SUBTRACT_COUNTER(after, before, delta) \
	((delta) = ((after) - (before)), ((before) > (after)))

/* Return a - b, or 0 when the unsigned subtraction would wrap. */
static unsigned long long clamped_sub(unsigned long long a, unsigned long long b)
{
	return (a > b) ? a - b : 0;
}

/*
 * compute_delta -- fill pcc_delta[] with (after - before) for every CPU.
 *
 * after, before: arrays of num_cpus counter snapshots, newest first
 *
 * Side effects:
 *  - skip_c0/skip_c1 are cleared, then set if APERF or MPERF went
 *    backwards on any CPU (their delta is then unusable for c0/c1);
 *    the first such event also latches aperf_mperf_unstable and prints
 *    a warning.
 *  - pcc_delta[i].c1 is derived as tsc - mperf - c3 - c6, clamped at 0.
 *  - pcc_delta[i].mperf is forced to at least 1 so callers can divide by it.
 *
 * Fatal conditions (diagnostic on stderr, then _exit):
 *  - TSC advanced by less than 1M cycles: _exit(-3)
 *  - a core or package residency counter went backwards: _exit(-1)
 *
 * Always returns 0; the int return type is kept for compatibility with
 * the original implicit-int definition.
 */
int compute_delta(PCC *after, PCC *before)
{
	int i;
	int error, error1, error2;

	skip_c0 = skip_c1 = 0;

	for (i = 0; i < num_cpus; ++i) {
		PCC *d = &pcc_delta[i];

		error = SUBTRACT_COUNTER(after[i].tsc, before[i].tsc, d->tsc);
		if (error) {
			/* report in chronological order: old value, then new */
			fprintf(stderr, "TSC went backwards %llX to %llX\n",
				before[i].tsc, after[i].tsc);
		}
		if (d->tsc < (1000 * 1000)) {	/* check for TSC < 1 Mcycles over interval */
			fprintf(stderr, "Insanely slow TSC rate, TSC stops in idle?\n");
			fprintf(stderr, "You can disable all c-states by booting with \"idle=poll\"\n");
			fprintf(stderr, "or just the deep ones with \"processor.max_cstate=1\"\n");
			_exit(-3);
		}

		error1 = SUBTRACT_COUNTER(after[i].c3, before[i].c3, d->c3);
		error2 = SUBTRACT_COUNTER(after[i].c6, before[i].c6, d->c6);
		if (error1 || error2) {
			fprintf(stderr, "c3 or c6 residency counter went backwards\n");
			_exit(-1);
		}

		error = SUBTRACT_COUNTER(after[i].pc3, before[i].pc3, d->pc3);
		error1 = SUBTRACT_COUNTER(after[i].pc6, before[i].pc6, d->pc6);
		error2 = SUBTRACT_COUNTER(after[i].pc7, before[i].pc7, d->pc7);
		if (error || error1 || error2) {
			fprintf(stderr, "package residency counter went backwards\n");
			_exit(-1);
		}

		error1 = SUBTRACT_COUNTER(after[i].aperf, before[i].aperf, d->aperf);
		error2 = SUBTRACT_COUNTER(after[i].mperf, before[i].mperf, d->mperf);
		if (error1 || error2) {
			if (!aperf_mperf_unstable) {
				fprintf(stderr, "%s: APERF or MPERF went backwards *\n", progname);
				fprintf(stderr, "* Frequency results do not cover entire interval *\n");
				fprintf(stderr, "* fix this by running Linux-2.6.30 or later *\n");
				aperf_mperf_unstable = 1;
			}
			/*
			 * mperf delta is likely a huge "positive" number
			 * can not use it for calculating c0 time
			 */
			skip_c0 = 1;
			skip_c1 = 1;
		}

		/*
		 * The counters are not sampled atomically, so mperf alone, or
		 * mperf + c3 + c6 together, may slightly exceed the TSC delta.
		 * Clamp at every step so c1 reads 0% instead of wrapping around
		 * to an enormous unsigned value.
		 */
		d->c1 = clamped_sub(clamped_sub(clamped_sub(d->tsc, d->mperf),
						d->c3), d->c6);

		if (d->mperf == 0)
			d->mperf = 1;	/* divide by 0 protection */
	}
	return 0;
}
/*
 * get_counters -- take a snapshot of the hardware counters of every CPU.
 *
 * c: array with room for num_cpus records; record i receives CPU i's values
 *
 * The TSC is always read.  The other MSRs are read only when the
 * matching do_* switch is set; otherwise the field is left untouched.
 * Read failures are fatal inside get_msr().
 */
void get_counters(PCC *c)
{
	int cpu;

	for (cpu = 0; cpu < num_cpus; ++cpu) {
		PCC *snap = c + cpu;

		snap->tsc = get_msr(cpu, MSR_TSC);

		if (do_c3)
			snap->c3 = get_msr(cpu, MSR_CORE_C3_RESIDENCY);
		if (do_c6)
			snap->c6 = get_msr(cpu, MSR_CORE_C6_RESIDENCY);

		if (do_aperf) {
			snap->aperf = get_msr(cpu, MSR_APERF);
			snap->mperf = get_msr(cpu, MSR_MPERF);
		}

		if (do_pkg) {
			snap->pc3 = get_msr(cpu, MSR_PKG_C3_RESIDENCY);
			snap->pc6 = get_msr(cpu, MSR_PKG_C6_RESIDENCY);
			snap->pc7 = get_msr(cpu, MSR_PKG_C7_RESIDENCY);
		}
	}
}
/*
 * print_nehalem_info -- print the TSC multiplier and the turbo ratio
 * limits of CPU 0 to stderr.
 *
 * Does nothing unless do_nehalem is set.  Each ratio is an 8-bit field
 * of the MSR; the frequency shown is ratio * 133.33 MHz.
 *
 * Always returns 0; the int return type is kept for compatibility with
 * the original implicit-int definition.
 */
int print_nehalem_info()
{
	const double mhz_per_ratio = 133.33;
	unsigned long long msr;
	unsigned int ratio;

	if (!do_nehalem)
		return 0;

	msr = get_msr(0, MSR_NEHALEM_PLATFORM_INFO);
	ratio = (msr >> 8) & 0xFF;
	fprintf(stderr, "Nehalem multiplier %u, TSC frequency %.0f MHz\n", ratio, ratio * mhz_per_ratio);

	/* one byte per active-core count, 4 cores in the top byte read here */
	msr = get_msr(0, MSR_NEHALEM_TURBO_RATIO_LIMIT);
	ratio = (msr >> 24) & 0xFF;
	fprintf(stderr, "Nehalem 4 cores active: %u mult, max turbo frequency = %.0f MHz\n", ratio, ratio * mhz_per_ratio);
	ratio = (msr >> 16) & 0xFF;
	fprintf(stderr, "Nehalem 3 cores active: %u mult, max turbo frequency = %.0f MHz\n", ratio, ratio * mhz_per_ratio);
	ratio = (msr >> 8) & 0xFF;
	fprintf(stderr, "Nehalem 2 cores active: %u mult, max turbo frequency = %.0f MHz\n", ratio, ratio * mhz_per_ratio);
	ratio = (msr >> 0) & 0xFF;
	fprintf(stderr, "Nehalem 1 core active: %u mult, max turbo frequency = %.0f MHz\n", ratio, ratio * mhz_per_ratio);

	return 0;
}
void turbostat_loop()
{
get_counters(pcc_even);
gettimeofday(&tv_even, (struct timezone *)NULL);
for (iterations = 1; ; iterations++) {
sleep(interval_sec);
get_counters(pcc_odd);
gettimeofday(&tv_odd, (struct timezone *)NULL);
compute_delta(pcc_odd, pcc_even);
timersub(&tv_odd, &tv_even, &tv_delta);
print_counters(pcc_delta);
sleep(interval_sec);
get_counters(pcc_even);
gettimeofday(&tv_even, (struct timezone *)NULL);
compute_delta(pcc_even, pcc_odd);
timersub(&tv_even, &tv_odd, &tv_delta);
print_counters(pcc_delta);
}
}
/*
 * check_dev_msr -- verify that the msr device of CPU 0 exists.
 *
 * If stat("/dev/cpu/0/msr") fails, a hint to load the msr driver is
 * printed to stderr and the process terminates with _exit(-5).
 *
 * Always returns 0; the int return type is kept for compatibility with
 * the original implicit-int definition.
 */
int check_dev_msr()
{
	struct stat sb;

	if (stat("/dev/cpu/0/msr", &sb)) {
		fprintf(stderr, "no /dev/cpu/0/msr\n");
		fprintf(stderr, "Please load the msr driver\n");
		_exit(-5);
	}
	return 0;
}
/*
 * is_nehalem -- tell whether a CPUID family/model pair is one of the
 * models this program treats as Nehalem (or Westmere).
 *
 * family, model: values decoded from CPUID leaf 1 (model already
 *                combined with the extended model bits)
 *
 * Returns 1 for a known model of family 6, 0 otherwise.
 */
int is_nehalem(unsigned int family, unsigned int model)
{
	static const unsigned int known_models[] = {
		0x1A,	/* Core i7, Xeon 5500 series */
		0x1E,	/* Core i7 and i5 Processor */
		0x1F,	/* Core i7 and i5 Processor */
		0x2E,	/* Nehalem Xeon */
		0x25,	/* Westmere */
		0x2C,	/* Westmere */
	};
	unsigned int k;

	if (family == 6) {
		for (k = 0; k < sizeof known_models / sizeof known_models[0]; ++k) {
			if (known_models[k] == model)
				return 1;
		}
	}
	return 0;
}
/*
 * do_cpuid -- identify the processor with the CPUID instruction.
 *
 * Leaf 0 supplies the maximum leaf number and the 12-byte vendor string
 * (in EBX, EDX, ECX order); anything other than "GenuineIntel" is fatal.
 * Leaf 1 supplies family/model/stepping in EAX and the feature flags in
 * EDX; bit 5 of EDX (MSR support) must be set.
 *
 * On success do_nehalem is set from is_nehalem(); with -v the decoded
 * identification is printed to stderr.  Failures terminate the process
 * with _exit(-1).
 */
void do_cpuid()
{
	unsigned int ebx, ecx, edx, max_level;
	unsigned int fms, family, model, stepping;
	char brand[16];

	ebx = ecx = edx = 0;

	/* __asm__ rather than asm so the file also builds in ISO C modes */
	__asm__("cpuid" : "=a" (max_level), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (0));

	/*
	 * Assemble the vendor string byte-wise: handing the address of an
	 * unsigned int to a printf "%s" conversion is a type mismatch.
	 */
	memcpy(brand, &ebx, 4);
	memcpy(brand + 4, &edx, 4);
	memcpy(brand + 8, &ecx, 4);
	brand[12] = '\0';

	if (strncmp(brand, "GenuineIntel", 12)) {
		fprintf(stderr, "CPUID: %s != GenuineIntel\n", brand);
		_exit(-1);
	}

	__asm__("cpuid" : "=a" (fms), "=c" (ecx), "=d" (edx) : "a" (1) : "ebx");
	family = (fms >> 8) & 0xf;
	model = (fms >> 4) & 0xf;
	stepping = fms & 0xf;
	/* families 6 and 15 extend the model number with bits 19:16 */
	if (family == 6 || family == 0xf)
		model += ((fms >> 16) & 0xf) << 4;

	if (!(edx & (1 << 5))) {
		fprintf(stderr, "CPUID: no MSR\n");
		_exit(-1);
	}

	if (verbose)
		fprintf(stderr, "CPUID %s %u levels family:model:stepping %u:%u:%u\n",
			brand, max_level, family, model, stepping);

	do_nehalem = is_nehalem(family, model);
}
/*
 * usage -- print the command-line synopsis to stderr.
 *
 * The synopsis mirrors the option string given to getopt() in cmdline():
 * -v and -d are flags without an argument, -i takes a number of seconds.
 *
 * Always returns 0; the int return type is kept for compatibility with
 * the original implicit-int definition.
 */
int usage()
{
	fprintf(stderr, "%s: [-v] [-d] [-i interval_sec] [command [arg]...]\n",
		progname);
	return 0;
}
void turbostat_init()
{
int i;
num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
if (num_cpus > MAX_CPUS) {
fprintf(stderr, "%s: limited to %d CPUs, rebuild with MAX_CPUS %d\n",
progname, MAX_CPUS, num_cpus);
num_cpus = MAX_CPUS;
}
do_cpuid();
do_c0 = do_c1 = do_c3 = do_c6 = do_pkg = do_nehalem;
check_dev_msr();
for (i = 0; i < num_cpus; ++i) {
char msr_path[32];
sprintf(msr_path, "/dev/cpu/%d/msr", i);
fd_msr[i] = open(msr_path, O_RDONLY);
if (fd_msr[i] < 0) {
perror(msr_path);
break;
}
}
num_cpus = i;
if (num_cpus == 0)
_exit(-1);
if (debug)
fprintf(stderr, "num_cpus %d\n", num_cpus);
if (verbose)
print_nehalem_info();
}
/*
 * fork_it -- command mode: run a command and report the counters
 * accumulated while it ran.
 *
 * argv: NULL-terminated argument vector; argv[0] is looked up with execvp()
 *
 * A snapshot is taken, the command is started in a child process, and
 * the parent waits for it with SIGINT and SIGQUIT ignored, so that ^C
 * ends the command but not the measurement.  A second snapshot is then
 * taken and the delta printed.
 *
 * Returns 0.  A failing fork() or waitpid() is fatal (_exit(-1)).
 * If the command cannot be executed the child reports the error and
 * terminates with status 127 instead of falling through into the
 * parent's reporting code.
 */
int fork_it(char **argv)
{
	pid_t child_pid;
	int status;

	get_counters(pcc_even);
	gettimeofday(&tv_even, (struct timezone *)NULL);

	child_pid = fork();
	if (child_pid == -1) {
		perror("fork");
		_exit(-1);
	}

	if (child_pid == 0) {
		/* child */
		execvp(argv[0], argv);
		/* execvp() only returns on failure */
		perror(argv[0]);
		_exit(127);
	}

	/* parent */
	signal(SIGINT, SIG_IGN);
	signal(SIGQUIT, SIG_IGN);
	if (waitpid(child_pid, &status, 0) == -1) {
		perror("wait");
		_exit(-1);
	}

	get_counters(pcc_odd);
	gettimeofday(&tv_odd, (struct timezone *)NULL);
	compute_delta(pcc_odd, pcc_even);
	timersub(&tv_odd, &tv_even, &tv_delta);
	print_counters(pcc_delta);

	return 0;
}
/*
 * cmdline -- parse the command-line options.
 *
 *   -v        sets verbose
 *   -d        sets debug and prints a version banner
 *   -i secs   sets interval_sec; secs must be a positive decimal integer
 *
 * The leading '+' in the option string makes getopt() stop at the first
 * non-option argument, so options of a command to be run are left alone;
 * optind then indexes that command.  progname is set to argv[0].
 *
 * An unknown option or an invalid interval prints the usage line and
 * terminates with _exit(-1).
 *
 * Always returns 0; the int return type is kept for compatibility with
 * the original implicit-int definition.
 */
int cmdline(int argc, char **argv)
{
	int opt;

	progname = argv[0];

	while ((opt = getopt(argc, argv, "+vdi:")) != -1) {
		switch (opt) {
		case 'v':
			verbose = 1;
			break;
		case 'i': {
			char *end;
			long secs;

			/*
			 * strtol() instead of atoi(): rejects garbage, and
			 * zero/negative values that would turn the sampling
			 * loop into a busy loop or an enormous sleep.
			 */
			errno = 0;
			secs = strtol(optarg, &end, 10);
			if (errno != 0 || end == optarg || *end != '\0' ||
			    secs <= 0 || secs > INT_MAX) {
				fprintf(stderr, "%s: invalid interval '%s'\n",
					progname, optarg);
				usage();
				_exit(-1);
			}
			interval_sec = (unsigned int)secs;
			break;
		}
		case 'd':
			debug = 1;
			fprintf(stderr, "turbostat Jan-23, 2010 - Len Brown <lenb@xxxxxxxxxx>\n");
			break;
		default:
			usage();
			_exit(-1);
		}
	}
	return 0;
}
/*
 * main -- parse options, initialise, then pick the operating mode:
 * command mode when arguments remain after the options, periodic mode
 * otherwise.
 */
int main(int argc, char **argv)
{
	cmdline(argc, argv);
	turbostat_init();

	/* anything left after the options is a command to fork */
	if (optind != argc)
		return fork_it(&argv[optind]);

	turbostat_loop();
	return 0;
}
_______________________________________________
linux-pm mailing list
linux-pm@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linux-foundation.org/mailman/listinfo/linux-pm