2 * kerneltop.c: show top kernel functions - performance counters showcase
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
14 weight RIP kernel function
15 ______ ________________ _______________
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
32 * perfstat: a /usr/bin/time-like performance counter statistics utility
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
43 Performance counter stats for 'ls':
45 163516953 instructions
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
53 * Improvements and fixes by:
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
61 * Released under the GPL v2. (and only v2, not any later version)
65 #include <sys/types.h>
83 #include <sys/syscall.h>
84 #include <sys/ioctl.h>
86 #include <sys/prctl.h>
91 #include <linux/unistd.h>
92 #include <linux/types.h>
94 #include "../../include/linux/perf_counter.h"
98 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
99 * counters in the current task.
101 #define PR_TASK_PERF_COUNTERS_DISABLE 31
102 #define PR_TASK_PERF_COUNTERS_ENABLE 32
/* Element count of a true array — invalid on pointers/decayed parameters. */
104 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
108 struct timespec ts; \
110 clock_gettime(CLOCK_MONOTONIC, &ts); \
111 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
115 * Pick up some kernel type conventions:
/*
 * NOTE(review): the enclosing arch #if/#elif lines are not visible in this
 * excerpt; by syscall number these three groups presumably select the two
 * x86 variants and powerpc — confirm against the original preprocessor
 * guards before relying on this.
 */
121 #define __NR_perf_counter_open 295
122 #define rmb() asm volatile("lfence" ::: "memory")
123 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
127 #define __NR_perf_counter_open 333
128 #define rmb() asm volatile("lfence" ::: "memory")
129 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
/* "sync" is the full memory barrier on powerpc; cpu_relax is a plain
 * compiler barrier there. */
133 #define __NR_perf_counter_open 319
134 #define rmb() asm volatile ("sync" ::: "memory")
135 #define cpu_relax() asm volatile ("" ::: "memory");
138 #define unlikely(x) __builtin_expect(!!(x), 0)
/* Kernel-style type-safe min(): each argument is evaluated exactly once,
 * and the (void)(&_min1 == &_min2) comparison provokes a compile-time
 * warning when x and y have different pointer-incompatible types. */
139 #define min(x, y) ({ \
140 typeof(x) _min1 = (x); \
141 typeof(y) _min2 = (y); \
142 (void) (&_min1 == &_min2); \
143 _min1 < _min2 ? _min1 : _min2; })
145 asmlinkage int sys_perf_counter_open(
146 struct perf_counter_hw_event *hw_event_uptr __user,
153 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
156 #define MAX_COUNTERS 64
157 #define MAX_NR_CPUS 256
159 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
161 static int run_perfstat = 0;
162 static int system_wide = 0;
164 static int nr_counters = 0;
165 static __u64 event_id[MAX_COUNTERS] = {
166 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
167 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
168 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
169 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
171 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
172 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
173 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
174 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
176 static int default_interval = 100000;
177 static int event_count[MAX_COUNTERS];
178 static int fd[MAX_NR_CPUS][MAX_COUNTERS];
180 static __u64 count_filter = 100;
183 static int profile_cpu = -1;
184 static int nr_cpus = 0;
186 static unsigned int realtime_prio = 0;
187 static int group = 0;
188 static unsigned int page_size;
189 static unsigned int mmap_pages = 16;
190 static int use_mmap = 0;
191 static int use_munmap = 0;
193 static char *vmlinux;
195 static char *sym_filter;
196 static unsigned long filter_start;
197 static unsigned long filter_end;
199 static int delay_secs = 2;
201 static int dump_symtab;
209 struct source_line *next;
212 static struct source_line *lines;
213 static struct source_line **lines_tail;
215 const unsigned int default_count[] = {
224 static char *hw_event_names[] = {
234 static char *sw_event_names[] = {
/*
 * Symbolic names accepted by -e.  One event id may have several aliases
 * ("cycles" == "cpu-cycles", "cs" == "context-switches");
 * match_event_symbols() prefix-matches the user string against this table.
 */
244 struct event_symbol {
249 static struct event_symbol event_symbols[] = {
250 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
251 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
252 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
253 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
254 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
255 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
256 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
257 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
258 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
/* software events: */
260 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
261 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
262 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
263 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
264 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
265 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
266 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
267 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
268 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
269 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
/* Extract a named bit-field from a counter config word using the
 * PERF_COUNTER_<name>_MASK / _SHIFT constants from perf_counter.h. */
272 #define __PERF_COUNTER_FIELD(config, name) \
273 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
275 #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
276 #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
277 #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
278 #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
280 static void display_events_help(void)
286 " -e EVENT --event=EVENT # symbolic-name abbreviations");
288 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
291 e = event_symbols[i].event;
292 type = PERF_COUNTER_TYPE(e);
293 id = PERF_COUNTER_ID(e);
295 printf("\n %d:%d: %-20s",
296 type, id, event_symbols[i].symbol);
300 " rNNN: raw PMU events (eventsel+umask)\n\n");
303 static void display_perfstat_help(void)
306 "Usage: perfstat [<events...>] <cmd...>\n\n"
307 "PerfStat Options (up to %d event types can be specified):\n\n",
310 display_events_help();
313 " -l # scale counter values\n"
314 " -a # system-wide collection\n");
318 static void display_help(void)
321 return display_perfstat_help();
324 "Usage: kerneltop [<options>]\n"
325 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
326 "KernelTop Options (up to %d event types can be specified at once):\n\n",
329 display_events_help();
332 " -S --stat # perfstat COMMAND\n"
333 " -a # system-wide collection (for perfstat)\n\n"
334 " -c CNT --count=CNT # event period to sample\n\n"
335 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
336 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
337 " -l # show scale factor for RR events\n"
338 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
339 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
340 " -r prio --realtime=<prio> # event acquisition runs with SCHED_FIFO policy\n"
341 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
342 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
343 " -z --zero # zero counts after display\n"
344 " -D --dump_symtab # dump symbol table to stderr on startup\n"
345 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
346 " -M --mmap_info # print mmap info stream\n"
347 " -U --munmap_info # print munmap info stream\n"
353 static char *event_name(int ctr)
355 __u64 config = event_id[ctr];
356 int type = PERF_COUNTER_TYPE(config);
357 int id = PERF_COUNTER_ID(config);
360 if (PERF_COUNTER_RAW(config)) {
361 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
366 case PERF_TYPE_HARDWARE:
367 if (id < PERF_HW_EVENTS_MAX)
368 return hw_event_names[id];
369 return "unknown-hardware";
371 case PERF_TYPE_SOFTWARE:
372 if (id < PERF_SW_EVENTS_MAX)
373 return sw_event_names[id];
374 return "unknown-software";
384 * Each event can have multiple symbolic names.
385 * Symbolic names are (almost) exactly matched.
387 static __u64 match_event_symbols(char *str)
393 if (sscanf(str, "r%llx", &config) == 1)
394 return config | PERF_COUNTER_RAW_MASK;
396 if (sscanf(str, "%d:%llu", &type, &id) == 2)
397 return EID(type, id);
399 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
400 if (!strncmp(str, event_symbols[i].symbol,
401 strlen(event_symbols[i].symbol)))
402 return event_symbols[i].event;
408 static int parse_events(char *str)
413 if (nr_counters == MAX_COUNTERS)
416 config = match_event_symbols(str);
420 event_id[nr_counters] = config;
423 str = strstr(str, ",");
437 char fault_here[1000000];
439 static void create_perfstat_counter(int counter)
441 struct perf_counter_hw_event hw_event;
443 memset(&hw_event, 0, sizeof(hw_event));
444 hw_event.config = event_id[counter];
445 hw_event.record_type = 0;
448 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
449 PERF_FORMAT_TOTAL_TIME_RUNNING;
453 for (cpu = 0; cpu < nr_cpus; cpu ++) {
454 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
455 if (fd[cpu][counter] < 0) {
456 printf("perfstat error: syscall returned with %d (%s)\n",
457 fd[cpu][counter], strerror(errno));
462 hw_event.inherit = 1;
463 hw_event.disabled = 1;
465 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
466 if (fd[0][counter] < 0) {
467 printf("perfstat error: syscall returned with %d (%s)\n",
468 fd[0][counter], strerror(errno));
474 int do_perfstat(int argc, char *argv[])
476 unsigned long long t0, t1;
485 for (counter = 0; counter < nr_counters; counter++)
486 create_perfstat_counter(counter);
495 * Enable counters and exec the command:
498 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
500 if ((pid = fork()) < 0)
501 perror("failed to fork");
503 if (execvp(argv[0], argv)) {
508 while (wait(&status) >= 0)
510 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
515 fprintf(stderr, "\n");
516 fprintf(stderr, " Performance counter stats for \'%s\':\n",
518 fprintf(stderr, "\n");
520 for (counter = 0; counter < nr_counters; counter++) {
522 __u64 count[3], single_count[3];
525 count[0] = count[1] = count[2] = 0;
527 for (cpu = 0; cpu < nr_cpus; cpu ++) {
528 res = read(fd[cpu][counter],
529 single_count, nv * sizeof(__u64));
530 assert(res == nv * sizeof(__u64));
532 count[0] += single_count[0];
534 count[1] += single_count[1];
535 count[2] += single_count[2];
542 fprintf(stderr, " %14s %-20s\n",
543 "<not counted>", event_name(counter));
546 if (count[2] < count[1]) {
548 count[0] = (unsigned long long)
549 ((double)count[0] * count[1] / count[2] + 0.5);
553 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
554 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
556 double msecs = (double)count[0] / 1000000;
558 fprintf(stderr, " %14.6f %-20s (msecs)",
559 msecs, event_name(counter));
561 fprintf(stderr, " %14Ld %-20s (events)",
562 count[0], event_name(counter));
565 fprintf(stderr, " (scaled from %.2f%%)",
566 (double) count[2] / count[1] * 100);
567 fprintf(stderr, "\n");
569 fprintf(stderr, "\n");
570 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
571 (double)(t1-t0)/1e6);
572 fprintf(stderr, "\n");
581 static uint64_t min_ip;
582 static uint64_t max_ip = -1ll;
585 unsigned long long addr;
587 unsigned long count[MAX_COUNTERS];
589 struct source_line *source;
592 #define MAX_SYMS 100000
594 static int sym_table_count;
596 struct sym_entry *sym_filter_entry;
598 static struct sym_entry sym_table[MAX_SYMS];
600 static void show_details(struct sym_entry *sym);
603 * Ordering weight: count-1 * count-2 * ... / count-n
605 static double sym_weight(const struct sym_entry *sym)
610 weight = sym->count[0];
612 for (counter = 1; counter < nr_counters-1; counter++)
613 weight *= sym->count[counter];
615 weight /= (sym->count[counter] + 1);
620 static int compare(const void *__sym1, const void *__sym2)
622 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
624 return sym_weight(sym1) < sym_weight(sym2);
628 static long userspace_events;
629 static const char CONSOLE_CLEAR[] = "
\e[H
\e[2J";
631 static struct sym_entry tmp[MAX_SYMS];
633 static void print_sym_table(void)
637 float events_per_sec = events/delay_secs;
638 float kevents_per_sec = (events-userspace_events)/delay_secs;
640 events = userspace_events = 0;
641 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
642 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
644 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
647 "------------------------------------------------------------------------------\n");
648 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
650 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
651 nmi ? "NMI" : "IRQ");
653 if (nr_counters == 1)
654 printf("%d ", event_count[0]);
656 for (counter = 0; counter < nr_counters; counter++) {
660 printf("%s", event_name(counter));
666 printf(" (tid: %d", tid);
670 if (profile_cpu != -1)
671 printf(", cpu: %d)\n", profile_cpu);
676 printf(", %d CPUs)\n", nr_cpus);
679 printf("------------------------------------------------------------------------------\n\n");
681 if (nr_counters == 1)
684 printf(" weight events");
686 printf(" RIP kernel function\n"
687 " ______ ______ ________________ _______________\n\n"
691 for (i = 0; i < sym_table_count; i++) {
694 if (nr_counters == 1) {
696 tmp[i].count[0] >= count_filter) {
697 printf("%19.2f - %016llx : %s\n",
698 sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
703 tmp[i].count[0] >= count_filter) {
704 printf("%8.1f %10ld - %016llx : %s\n",
707 tmp[i].addr, tmp[i].sym);
712 * Add decay to the counts:
714 for (count = 0; count < nr_counters; count++)
715 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
718 if (sym_filter_entry)
719 show_details(sym_filter_entry);
722 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
724 if (poll(&stdin_poll, 1, 0) == 1) {
725 printf("key pressed - exiting.\n");
731 static void *display_thread(void *arg)
733 printf("KernelTop refresh period: %d seconds\n", delay_secs);
735 while (!sleep(delay_secs))
/* Parse one /proc/kallsyms line into *s.
 * NOTE(review): this is a sampled excerpt — intervening lines (braces,
 * early returns) are missing from this view. */
741 static int read_symbol(FILE *in, struct sym_entry *s)
743 static int filter_match = 0;
748 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
754 /* skip until end of line: */
758 if (rc == '\n' || rc == EOF || pos >= 499)
767 /* Filter out known duplicates and non-text symbols. */
768 if (!strcmp(sym, "_text"))
770 if (!min_ip && !strcmp(sym, "_stext"))
772 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
774 if (stype != 'T' && stype != 't')
776 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
778 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
/* BUG(review): malloc(strlen(str)) is one byte short — the strcpy() below
 * also writes the terminating NUL, so this is a heap overflow.  Should be
 * malloc(strlen(str) + 1) (or strdup); the result is also not checked
 * for NULL before use. */
781 s->sym = malloc(strlen(str));
784 strcpy((char *)s->sym, str);
787 /* Tag events to be skipped. */
788 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
790 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
792 else if (!strcmp("mwait_idle", s->sym))
/* filter_match state machine: 0 = still looking for sym_filter, 1 = start
 * address recorded, the next symbol read marks the end of the range. */
795 if (filter_match == 1) {
796 filter_end = s->addr;
/* Guard against absurdly large symbol ranges before handing them to
 * objdump in parse_vmlinux(). */
798 if (filter_end - filter_start > 10000) {
799 printf("hm, too large filter symbol <%s> - skipping.\n",
801 printf("symbol filter start: %016lx\n", filter_start);
802 printf(" end: %016lx\n", filter_end);
803 filter_end = filter_start = 0;
808 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
810 filter_start = s->addr;
816 int compare_addr(const void *__sym1, const void *__sym2)
818 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
820 return sym1->addr > sym2->addr;
/* Sort sym_table by address and drop duplicate-address entries.
 * NOTE(review): sampled excerpt — some lines of the body are missing. */
823 static void sort_symbol_table(void)
828 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
/* BUG(review): the comparison below reads sym_table[i+1] on the final
 * iteration (i == sym_table_count-1), one element past the populated
 * range — and past the array itself when sym_table_count == MAX_SYMS.
 * The loop bound should be sym_table_count - 1. */
829 for (i = 0, dups = 0; i < sym_table_count; i++) {
830 if (sym_table[i].addr == sym_table[i+1].addr) {
831 sym_table[i+1].addr = -1ll;
835 sym_table_count -= dups;
839 static void parse_symbols(void)
841 struct sym_entry *last;
843 FILE *kallsyms = fopen("/proc/kallsyms", "r");
846 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
850 while (!feof(kallsyms)) {
851 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
853 assert(sym_table_count <= MAX_SYMS);
858 min_ip = sym_table[0].addr;
859 max_ip = sym_table[sym_table_count-1].addr;
860 last = sym_table + sym_table_count++;
867 for (count=0; count < sym_table_count; count ++) {
868 if (!strcmp(sym_table[count].sym, sym_filter)) {
869 sym_filter_entry = &sym_table[count];
877 for (i = 0; i < sym_table_count; i++)
878 fprintf(stderr, "%llx %s\n",
879 sym_table[i].addr, sym_table[i].sym);
/* Run objdump over the [filter_start, filter_end) range of the given
 * vmlinux and collect the annotated source/asm lines into the global
 * `lines` list.  NOTE(review): sampled excerpt — braces and error paths
 * are missing from this view. */
887 static void parse_vmlinux(char *filename)
890 char command[PATH_MAX*2];
/* NOTE(review): filename is interpolated unquoted into a shell command
 * for popen() — a path containing spaces or shell metacharacters will
 * break the command (or inject into the shell). */
894 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
896 file = popen(command, "r");
901 while (!feof(file)) {
902 struct source_line *src;
906 src = malloc(sizeof(struct source_line));
908 memset(src, 0, sizeof(struct source_line));
910 if (getline(&src->line, &dummy, file) < 0)
/* Strip the trailing newline that getline() keeps. */
915 c = strchr(src->line, '\n');
921 lines_tail = &src->next;
/* Pick up the instruction address objdump prints at column 8 or 16. */
923 if (strlen(src->line)>8 && src->line[8] == ':')
924 src->EIP = strtoull(src->line, NULL, 16);
/* BUG(review): length guard checks > 8 but indexes [16] — should be
 * strlen(src->line) > 16 to avoid reading past the terminator on
 * short lines. */
925 if (strlen(src->line)>8 && src->line[16] == ':')
926 src->EIP = strtoull(src->line, NULL, 16);
931 static void record_precise_ip(uint64_t ip)
933 struct source_line *line;
935 for (line = lines; line; line = line->next) {
943 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
945 struct source_line *line;
946 char pattern[PATH_MAX];
947 sprintf(pattern, "<%s>:", sym->sym);
949 for (line = lines; line; line = line->next) {
950 if (strstr(line->line, pattern)) {
957 static void show_lines(struct source_line *line_queue, int line_queue_count)
960 struct source_line *line;
963 for (i = 0; i < line_queue_count; i++) {
964 printf("%8li\t%s\n", line->count, line->line);
969 #define TRACE_COUNT 3
971 static void show_details(struct sym_entry *sym)
973 struct source_line *line;
974 struct source_line *line_queue = NULL;
976 int line_queue_count = 0;
979 lookup_sym_in_vmlinux(sym);
983 printf("Showing details for %s\n", sym->sym);
987 if (displayed && strstr(line->line, ">:"))
990 if (!line_queue_count)
994 if (line->count >= count_filter) {
995 show_lines(line_queue, line_queue_count);
996 line_queue_count = 0;
998 } else if (line_queue_count > TRACE_COUNT) {
999 line_queue = line_queue->next;
1000 line_queue_count --;
1005 if (displayed > 300)
1012 * Binary search in the histogram table and record the hit:
1014 static void record_ip(uint64_t ip, int counter)
1016 int left_idx, middle_idx, right_idx, idx;
1017 unsigned long left, middle, right;
1019 record_precise_ip(ip);
1022 right_idx = sym_table_count-1;
1023 assert(ip <= max_ip && ip >= min_ip);
1025 while (left_idx + 1 < right_idx) {
1026 middle_idx = (left_idx + right_idx) / 2;
1028 left = sym_table[ left_idx].addr;
1029 middle = sym_table[middle_idx].addr;
1030 right = sym_table[ right_idx].addr;
1032 if (!(left <= middle && middle <= right)) {
1033 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
1034 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
1036 assert(left <= middle && middle <= right);
1037 if (!(left <= ip && ip <= right)) {
1038 printf(" left: %016lx\n", left);
1039 printf(" ip: %016lx\n", (unsigned long)ip);
1040 printf("right: %016lx\n", right);
1042 assert(left <= ip && ip <= right);
1044 * [ left .... target .... middle .... right ]
1045 * => right := middle
1048 right_idx = middle_idx;
1052 * [ left .... middle ... target ... right ]
1055 left_idx = middle_idx;
1060 if (!sym_table[idx].skip)
1061 sym_table[idx].count[counter]++;
1065 static void process_event(uint64_t ip, int counter)
1069 if (ip < min_ip || ip > max_ip) {
1074 record_ip(ip, counter);
1077 static void process_options(int argc, char *argv[])
1079 int error = 0, counter;
1081 if (strstr(argv[0], "perfstat"))
1085 int option_index = 0;
1086 /** Options for getopt */
1087 static struct option long_options[] = {
1088 {"count", required_argument, NULL, 'c'},
1089 {"cpu", required_argument, NULL, 'C'},
1090 {"delay", required_argument, NULL, 'd'},
1091 {"dump_symtab", no_argument, NULL, 'D'},
1092 {"event", required_argument, NULL, 'e'},
1093 {"filter", required_argument, NULL, 'f'},
1094 {"group", required_argument, NULL, 'g'},
1095 {"help", no_argument, NULL, 'h'},
1096 {"nmi", required_argument, NULL, 'n'},
1097 {"mmap_info", no_argument, NULL, 'M'},
1098 {"mmap_pages", required_argument, NULL, 'm'},
1099 {"munmap_info", no_argument, NULL, 'U'},
1100 {"pid", required_argument, NULL, 'p'},
1101 {"realtime", required_argument, NULL, 'r'},
1102 {"scale", no_argument, NULL, 'l'},
1103 {"symbol", required_argument, NULL, 's'},
1104 {"stat", no_argument, NULL, 'S'},
1105 {"vmlinux", required_argument, NULL, 'x'},
1106 {"zero", no_argument, NULL, 'z'},
1109 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
1110 long_options, &option_index);
1115 case 'a': system_wide = 1; break;
1116 case 'c': default_interval = atoi(optarg); break;
1118 /* CPU and PID are mutually exclusive */
1120 printf("WARNING: CPU switch overriding PID\n");
1124 profile_cpu = atoi(optarg); break;
1125 case 'd': delay_secs = atoi(optarg); break;
1126 case 'D': dump_symtab = 1; break;
1128 case 'e': error = parse_events(optarg); break;
1130 case 'f': count_filter = atoi(optarg); break;
1131 case 'g': group = atoi(optarg); break;
1132 case 'h': display_help(); break;
1133 case 'l': scale = 1; break;
1134 case 'n': nmi = atoi(optarg); break;
1136 /* CPU and PID are mutually exclusive */
1137 if (profile_cpu != -1) {
1138 printf("WARNING: PID switch overriding CPU\n");
1142 tid = atoi(optarg); break;
1143 case 'r': realtime_prio = atoi(optarg); break;
1144 case 's': sym_filter = strdup(optarg); break;
1145 case 'S': run_perfstat = 1; break;
1146 case 'x': vmlinux = strdup(optarg); break;
1147 case 'z': zero = 1; break;
1148 case 'm': mmap_pages = atoi(optarg); break;
1149 case 'M': use_mmap = 1; break;
1150 case 'U': use_munmap = 1; break;
1151 default: error = 1; break;
1166 for (counter = 0; counter < nr_counters; counter++) {
1167 if (event_count[counter])
1170 event_count[counter] = default_interval;
1181 static unsigned int mmap_read_head(struct mmap_data *md)
1183 struct perf_counter_mmap_page *pc = md->base;
1186 head = pc->data_head;
1192 struct timeval last_read, this_read;
1194 static void mmap_read(struct mmap_data *md)
1196 unsigned int head = mmap_read_head(md);
1197 unsigned int old = md->prev;
1198 unsigned char *data = md->base + page_size;
1201 gettimeofday(&this_read, NULL);
1204 * If we're further behind than half the buffer, there's a chance
1205 * the writer will bite our tail and screw up the events under us.
1207 * If we somehow ended up ahead of the head, we got messed up.
1209 * In either case, truncate and restart at head.
1212 if (diff > md->mask / 2 || diff < 0) {
1214 unsigned long msecs;
1216 timersub(&this_read, &last_read, &iv);
1217 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
1219 fprintf(stderr, "WARNING: failed to keep up with mmap data."
1220 " Last read %lu msecs ago.\n", msecs);
1223 * head points to a known good entry, start there.
1228 last_read = this_read;
1230 for (; old != head;) {
1232 struct perf_event_header header;
1237 struct perf_event_header header;
1242 char filename[PATH_MAX];
1245 typedef union event_union {
1246 struct perf_event_header header;
1248 struct mmap_event mmap;
1251 event_t *event = (event_t *)&data[old & md->mask];
1255 unsigned int size = event->header.size;
1258 * Event straddles the mmap boundary -- header should always
1259 * be inside due to u64 alignment of output.
1261 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1262 unsigned int offset = old;
1263 unsigned int len = min(sizeof(*event), size), cpy;
1264 void *dst = &event_copy;
1267 cpy = min(md->mask + 1 - (offset & md->mask), len);
1268 memcpy(dst, &data[offset & md->mask], cpy);
1274 event = &event_copy;
1279 switch (event->header.type) {
1280 case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP:
1281 case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
1282 process_event(event->ip.ip, md->counter);
1285 case PERF_EVENT_MMAP:
1286 case PERF_EVENT_MUNMAP:
1287 printf("%s: %Lu %Lu %Lu %s\n",
1288 event->header.type == PERF_EVENT_MMAP
1289 ? "mmap" : "munmap",
1293 event->mmap.filename);
1301 int main(int argc, char *argv[])
1303 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1304 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1305 struct perf_counter_hw_event hw_event;
1307 int i, counter, group_fd, nr_poll = 0;
1311 page_size = sysconf(_SC_PAGE_SIZE);
1313 process_options(argc, argv);
1315 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1316 assert(nr_cpus <= MAX_NR_CPUS);
1317 assert(nr_cpus >= 0);
1320 return do_perfstat(argc, argv);
1322 if (tid != -1 || profile_cpu != -1)
1326 if (vmlinux && sym_filter_entry)
1327 parse_vmlinux(vmlinux);
1329 for (i = 0; i < nr_cpus; i++) {
1331 for (counter = 0; counter < nr_counters; counter++) {
1334 if (tid == -1 && profile_cpu == -1)
1337 memset(&hw_event, 0, sizeof(hw_event));
1338 hw_event.config = event_id[counter];
1339 hw_event.irq_period = event_count[counter];
1340 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
1342 hw_event.mmap = use_mmap;
1343 hw_event.munmap = use_munmap;
1345 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1346 if (fd[i][counter] < 0) {
1348 printf("kerneltop error: syscall returned with %d (%s)\n",
1349 fd[i][counter], strerror(err));
1351 printf("Are you root?\n");
1354 assert(fd[i][counter] >= 0);
1355 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1358 * First counter acts as the group leader:
1360 if (group && group_fd == -1)
1361 group_fd = fd[i][counter];
1363 event_array[nr_poll].fd = fd[i][counter];
1364 event_array[nr_poll].events = POLLIN;
1367 mmap_array[i][counter].counter = counter;
1368 mmap_array[i][counter].prev = 0;
1369 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1370 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1371 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1372 if (mmap_array[i][counter].base == MAP_FAILED) {
1373 printf("kerneltop error: failed to mmap with %d (%s)\n",
1374 errno, strerror(errno));
1380 if (pthread_create(&thread, NULL, display_thread, NULL)) {
1381 printf("Could not create display thread.\n");
1385 if (realtime_prio) {
1386 struct sched_param param;
1388 param.sched_priority = realtime_prio;
1389 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
1390 printf("Could not set realtime priority.\n");
1398 for (i = 0; i < nr_cpus; i++) {
1399 for (counter = 0; counter < nr_counters; counter++)
1400 mmap_read(&mmap_array[i][counter]);
1404 ret = poll(event_array, nr_poll, 100);