]> git.karo-electronics.de Git - karo-tx-linux.git/blob - Documentation/perf_counter/kerneltop.c
4f8d7917aba128ec2564d4e174bf1f3e90942bab
[karo-tx-linux.git] / Documentation / perf_counter / kerneltop.c
1 /*
2  * kerneltop.c: show top kernel functions - performance counters showcase
3
4    Build with:
5
6      cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8    Sample output:
9
10 ------------------------------------------------------------------------------
11  KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
12 ------------------------------------------------------------------------------
13
14              weight         RIP          kernel function
15              ______   ________________   _______________
16
17               35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18               33.00 - ffffffff804cb740 : sock_alloc_send_skb
19               31.26 - ffffffff804ce808 : skb_push
20               22.43 - ffffffff80510004 : tcp_established_options
21               19.00 - ffffffff8027d250 : find_get_page
22               15.76 - ffffffff804e4fc9 : eth_type_trans
23               15.20 - ffffffff804d8baa : dst_release
24               14.86 - ffffffff804cf5d8 : skb_release_head_state
25               14.00 - ffffffff802217d5 : read_hpet
26               12.00 - ffffffff804ffb7f : __ip_local_out
27               11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28                8.54 - ffffffff805001a3 : ip_queue_xmit
29  */
30
31 /*
32  * perfstat:  /usr/bin/time -alike performance counter statistics utility
33
34           It summarizes the counter events of all tasks (and child tasks),
35           covering all CPUs that the command (or workload) executes on.
36           It only counts the per-task events of the workload started,
37           independent of how many other tasks run on those CPUs.
38
39    Sample output:
40
41    $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43    Performance counter stats for 'ls':
44
45            163516953 instructions
46                 2295 cache-misses
47              2855182 branch-misses
48  */
49
50  /*
51   * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52   *
53   * Improvements and fixes by:
54   *
55   *   Arjan van de Ven <arjan@linux.intel.com>
56   *   Yanmin Zhang <yanmin.zhang@intel.com>
57   *   Wu Fengguang <fengguang.wu@intel.com>
58   *   Mike Galbraith <efault@gmx.de>
59   *   Paul Mackerras <paulus@samba.org>
60   *
61   * Released under the GPL v2. (and only v2, not any later version)
62   */
63
64 #define _GNU_SOURCE
65 #include <sys/types.h>
66 #include <sys/stat.h>
67 #include <sys/time.h>
68 #include <unistd.h>
69 #include <stdint.h>
70 #include <stdlib.h>
71 #include <string.h>
72 #include <limits.h>
73 #include <getopt.h>
74 #include <assert.h>
75 #include <fcntl.h>
76 #include <stdio.h>
77 #include <errno.h>
78 #include <ctype.h>
79 #include <time.h>
80 #include <sched.h>
81 #include <pthread.h>
82
83 #include <sys/syscall.h>
84 #include <sys/ioctl.h>
85 #include <sys/poll.h>
86 #include <sys/prctl.h>
87 #include <sys/wait.h>
88 #include <sys/uio.h>
89 #include <sys/mman.h>
90
91 #include <linux/unistd.h>
92 #include <linux/types.h>
93
94 #include "../../include/linux/perf_counter.h"
95
96
97 /*
98  * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
99  * counters in the current task.
100  */
101 #define PR_TASK_PERF_COUNTERS_DISABLE   31
102 #define PR_TASK_PERF_COUNTERS_ENABLE    32
103
104 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
105
106 #define rdclock()                                       \
107 ({                                                      \
108         struct timespec ts;                             \
109                                                         \
110         clock_gettime(CLOCK_MONOTONIC, &ts);            \
111         ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
112 })
113
114 /*
115  * Pick up some kernel type conventions:
116  */
117 #define __user
118 #define asmlinkage
119
/*
 * Per-architecture glue: the perf_counter_open syscall number, a read
 * memory barrier (rmb) and a busy-wait hint (cpu_relax).
 *
 * The __asm__/__volatile__ spellings are used so that the macros also build
 * in strict ISO modes (-std=c99, -std=c11), where the plain "asm" keyword is
 * disabled.  None of the macros carries a trailing semicolon, so they behave
 * like ordinary statements: "if (x) cpu_relax(); else ..." parses as written.
 */
#ifdef __x86_64__
#define __NR_perf_counter_open 295
#define rmb()           __asm__ __volatile__("lfence" ::: "memory")
#define cpu_relax()     __asm__ __volatile__("rep; nop" ::: "memory")
#endif

#ifdef __i386__
#define __NR_perf_counter_open 333
#define rmb()           __asm__ __volatile__("lfence" ::: "memory")
#define cpu_relax()     __asm__ __volatile__("rep; nop" ::: "memory")
#endif

#ifdef __powerpc__
#define __NR_perf_counter_open 319
#define rmb()           __asm__ __volatile__("sync" ::: "memory")
#define cpu_relax()     __asm__ __volatile__("" ::: "memory")
#endif
137
/* Branch-prediction hint: the condition x is expected to be false. */
#define unlikely(x)     __builtin_expect(!!(x), 0)

/*
 * Type-checked minimum that evaluates each argument exactly once.
 *
 * The pointer comparison makes the compiler warn when x and y have different
 * types.  __typeof__ is used instead of typeof so the macro also works in
 * strict ISO modes (-std=c99, -std=c11), where the plain keyword is disabled.
 */
#define min(x, y) ({                            \
        __typeof__(x) _min1 = (x);              \
        __typeof__(y) _min2 = (y);              \
        (void) (&_min1 == &_min2);              \
        _min1 < _min2 ? _min1 : _min2; })
144
145 asmlinkage int sys_perf_counter_open(
146         struct perf_counter_hw_event    *hw_event_uptr          __user,
147         pid_t                           pid,
148         int                             cpu,
149         int                             group_fd,
150         unsigned long                   flags)
151 {
152         return syscall(
153                 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
154 }
155
156 #define MAX_COUNTERS                    64
157 #define MAX_NR_CPUS                     256
158
159 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
160
161 static int                      run_perfstat                    =  0;
162 static int                      system_wide                     =  0;
163
164 static int                      nr_counters                     =  0;
165 static __u64                    event_id[MAX_COUNTERS]          = {
166         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
167         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
168         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
169         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
170
171         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
172         EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
173         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
174         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
175 };
176 static int                      default_interval = 100000;
177 static int                      event_count[MAX_COUNTERS];
178 static int                      fd[MAX_NR_CPUS][MAX_COUNTERS];
179
180 static __u64                    count_filter                   = 100;
181
182 static int                      tid                             = -1;
183 static int                      profile_cpu                     = -1;
184 static int                      nr_cpus                         =  0;
185 static int                      nmi                             =  1;
186 static unsigned int             realtime_prio                   =  0;
187 static int                      group                           =  0;
188 static unsigned int             page_size;
189 static unsigned int             mmap_pages                      =  16;
190 static int                      use_mmap                        = 0;
191 static int                      use_munmap                      = 0;
192
193 static char                     *vmlinux;
194
195 static char                     *sym_filter;
196 static unsigned long            filter_start;
197 static unsigned long            filter_end;
198
199 static int                      delay_secs                      =  2;
200 static int                      zero;
201 static int                      dump_symtab;
202
203 static int                      scale;
204
205 struct source_line {
206         uint64_t                EIP;
207         unsigned long           count;
208         char                    *line;
209         struct source_line      *next;
210 };
211
212 static struct source_line       *lines;
213 static struct source_line       **lines_tail;
214
215 const unsigned int default_count[] = {
216         1000000,
217         1000000,
218           10000,
219           10000,
220         1000000,
221           10000,
222 };
223
224 static char *hw_event_names[] = {
225         "CPU cycles",
226         "instructions",
227         "cache references",
228         "cache misses",
229         "branches",
230         "branch misses",
231         "bus cycles",
232 };
233
234 static char *sw_event_names[] = {
235         "cpu clock ticks",
236         "task clock ticks",
237         "pagefaults",
238         "context switches",
239         "CPU migrations",
240         "minor faults",
241         "major faults",
242 };
243
244 struct event_symbol {
245         __u64 event;
246         char *symbol;
247 };
248
249 static struct event_symbol event_symbols[] = {
250         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cpu-cycles",           },
251         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cycles",               },
252         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),              "instructions",         },
253         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),          "cache-references",     },
254         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),              "cache-misses",         },
255         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branch-instructions",  },
256         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branches",             },
257         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),             "branch-misses",        },
258         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),                "bus-cycles",           },
259
260         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),                 "cpu-clock",            },
261         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),                "task-clock",           },
262         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "page-faults",          },
263         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "faults",               },
264         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),           "minor-faults",         },
265         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),           "major-faults",         },
266         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "context-switches",     },
267         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "cs",                   },
268         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "cpu-migrations",       },
269         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "migrations",           },
270 };
271
272 #define __PERF_COUNTER_FIELD(config, name) \
273         ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
274
275 #define PERF_COUNTER_RAW(config)        __PERF_COUNTER_FIELD(config, RAW)
276 #define PERF_COUNTER_CONFIG(config)     __PERF_COUNTER_FIELD(config, CONFIG)
277 #define PERF_COUNTER_TYPE(config)       __PERF_COUNTER_FIELD(config, TYPE)
278 #define PERF_COUNTER_ID(config)         __PERF_COUNTER_FIELD(config, EVENT)
279
280 static void display_events_help(void)
281 {
282         unsigned int i;
283         __u64 e;
284
285         printf(
286         " -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
287
288         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
289                 int type, id;
290
291                 e = event_symbols[i].event;
292                 type = PERF_COUNTER_TYPE(e);
293                 id = PERF_COUNTER_ID(e);
294
295                 printf("\n                             %d:%d: %-20s",
296                                 type, id, event_symbols[i].symbol);
297         }
298
299         printf("\n"
300         "                           rNNN: raw PMU events (eventsel+umask)\n\n");
301 }
302
303 static void display_perfstat_help(void)
304 {
305         printf(
306         "Usage: perfstat [<events...>] <cmd...>\n\n"
307         "PerfStat Options (up to %d event types can be specified):\n\n",
308                  MAX_COUNTERS);
309
310         display_events_help();
311
312         printf(
313         " -l                           # scale counter values\n"
314         " -a                           # system-wide collection\n");
315         exit(0);
316 }
317
318 static void display_help(void)
319 {
320         if (run_perfstat)
321                 return display_perfstat_help();
322
323         printf(
324         "Usage: kerneltop [<options>]\n"
325         "   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
326         "KernelTop Options (up to %d event types can be specified at once):\n\n",
327                  MAX_COUNTERS);
328
329         display_events_help();
330
331         printf(
332         " -S        --stat             # perfstat COMMAND\n"
333         " -a                           # system-wide collection (for perfstat)\n\n"
334         " -c CNT    --count=CNT        # event period to sample\n\n"
335         " -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
336         " -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
337         " -l                           # show scale factor for RR events\n"
338         " -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
339         " -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
340         " -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
341         " -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
342         " -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
343         " -z        --zero             # zero counts after display\n"
344         " -D        --dump_symtab      # dump symbol table to stderr on startup\n"
345         " -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
346         " -M        --mmap_info        # print mmap info stream\n"
347         " -U        --munmap_info      # print munmap info stream\n"
348         );
349
350         exit(0);
351 }
352
353 static char *event_name(int ctr)
354 {
355         __u64 config = event_id[ctr];
356         int type = PERF_COUNTER_TYPE(config);
357         int id = PERF_COUNTER_ID(config);
358         static char buf[32];
359
360         if (PERF_COUNTER_RAW(config)) {
361                 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
362                 return buf;
363         }
364
365         switch (type) {
366         case PERF_TYPE_HARDWARE:
367                 if (id < PERF_HW_EVENTS_MAX)
368                         return hw_event_names[id];
369                 return "unknown-hardware";
370
371         case PERF_TYPE_SOFTWARE:
372                 if (id < PERF_SW_EVENTS_MAX)
373                         return sw_event_names[id];
374                 return "unknown-software";
375
376         default:
377                 break;
378         }
379
380         return "unknown";
381 }
382
383 /*
384  * Each event can have multiple symbolic names.
385  * Symbolic names are (almost) exactly matched.
386  */
387 static __u64 match_event_symbols(char *str)
388 {
389         __u64 config, id;
390         int type;
391         unsigned int i;
392
393         if (sscanf(str, "r%llx", &config) == 1)
394                 return config | PERF_COUNTER_RAW_MASK;
395
396         if (sscanf(str, "%d:%llu", &type, &id) == 2)
397                 return EID(type, id);
398
399         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
400                 if (!strncmp(str, event_symbols[i].symbol,
401                              strlen(event_symbols[i].symbol)))
402                         return event_symbols[i].event;
403         }
404
405         return ~0ULL;
406 }
407
408 static int parse_events(char *str)
409 {
410         __u64 config;
411
412 again:
413         if (nr_counters == MAX_COUNTERS)
414                 return -1;
415
416         config = match_event_symbols(str);
417         if (config == ~0ULL)
418                 return -1;
419
420         event_id[nr_counters] = config;
421         nr_counters++;
422
423         str = strstr(str, ",");
424         if (str) {
425                 str++;
426                 goto again;
427         }
428
429         return 0;
430 }
431
432
433 /*
434  * perfstat
435  */
436
437 char fault_here[1000000];
438
439 static void create_perfstat_counter(int counter)
440 {
441         struct perf_counter_hw_event hw_event;
442
443         memset(&hw_event, 0, sizeof(hw_event));
444         hw_event.config         = event_id[counter];
445         hw_event.record_type    = 0;
446         hw_event.nmi            = 0;
447         if (scale)
448                 hw_event.read_format    = PERF_FORMAT_TOTAL_TIME_ENABLED |
449                                           PERF_FORMAT_TOTAL_TIME_RUNNING;
450
451         if (system_wide) {
452                 int cpu;
453                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
454                         fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
455                         if (fd[cpu][counter] < 0) {
456                                 printf("perfstat error: syscall returned with %d (%s)\n",
457                                                 fd[cpu][counter], strerror(errno));
458                                 exit(-1);
459                         }
460                 }
461         } else {
462                 hw_event.inherit        = 1;
463                 hw_event.disabled       = 1;
464
465                 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
466                 if (fd[0][counter] < 0) {
467                         printf("perfstat error: syscall returned with %d (%s)\n",
468                                         fd[0][counter], strerror(errno));
469                         exit(-1);
470                 }
471         }
472 }
473
474 int do_perfstat(int argc, char *argv[])
475 {
476         unsigned long long t0, t1;
477         int counter;
478         ssize_t res;
479         int status;
480         int pid;
481
482         if (!system_wide)
483                 nr_cpus = 1;
484
485         for (counter = 0; counter < nr_counters; counter++)
486                 create_perfstat_counter(counter);
487
488         argc -= optind;
489         argv += optind;
490
491         if (!argc)
492                 display_help();
493
494         /*
495          * Enable counters and exec the command:
496          */
497         t0 = rdclock();
498         prctl(PR_TASK_PERF_COUNTERS_ENABLE);
499
500         if ((pid = fork()) < 0)
501                 perror("failed to fork");
502         if (!pid) {
503                 if (execvp(argv[0], argv)) {
504                         perror(argv[0]);
505                         exit(-1);
506                 }
507         }
508         while (wait(&status) >= 0)
509                 ;
510         prctl(PR_TASK_PERF_COUNTERS_DISABLE);
511         t1 = rdclock();
512
513         fflush(stdout);
514
515         fprintf(stderr, "\n");
516         fprintf(stderr, " Performance counter stats for \'%s\':\n",
517                 argv[0]);
518         fprintf(stderr, "\n");
519
520         for (counter = 0; counter < nr_counters; counter++) {
521                 int cpu, nv;
522                 __u64 count[3], single_count[3];
523                 int scaled;
524
525                 count[0] = count[1] = count[2] = 0;
526                 nv = scale ? 3 : 1;
527                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
528                         res = read(fd[cpu][counter],
529                                    single_count, nv * sizeof(__u64));
530                         assert(res == nv * sizeof(__u64));
531
532                         count[0] += single_count[0];
533                         if (scale) {
534                                 count[1] += single_count[1];
535                                 count[2] += single_count[2];
536                         }
537                 }
538
539                 scaled = 0;
540                 if (scale) {
541                         if (count[2] == 0) {
542                                 fprintf(stderr, " %14s  %-20s\n",
543                                         "<not counted>", event_name(counter));
544                                 continue;
545                         }
546                         if (count[2] < count[1]) {
547                                 scaled = 1;
548                                 count[0] = (unsigned long long)
549                                         ((double)count[0] * count[1] / count[2] + 0.5);
550                         }
551                 }
552
553                 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
554                     event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
555
556                         double msecs = (double)count[0] / 1000000;
557
558                         fprintf(stderr, " %14.6f  %-20s (msecs)",
559                                 msecs, event_name(counter));
560                 } else {
561                         fprintf(stderr, " %14Ld  %-20s (events)",
562                                 count[0], event_name(counter));
563                 }
564                 if (scaled)
565                         fprintf(stderr, "  (scaled from %.2f%%)",
566                                 (double) count[2] / count[1] * 100);
567                 fprintf(stderr, "\n");
568         }
569         fprintf(stderr, "\n");
570         fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
571                         (double)(t1-t0)/1e6);
572         fprintf(stderr, "\n");
573
574         return 0;
575 }
576
577 /*
578  * Symbols
579  */
580
581 static uint64_t                 min_ip;
582 static uint64_t                 max_ip = -1ll;
583
584 struct sym_entry {
585         unsigned long long      addr;
586         char                    *sym;
587         unsigned long           count[MAX_COUNTERS];
588         int                     skip;
589         struct source_line      *source;
590 };
591
592 #define MAX_SYMS                100000
593
594 static int sym_table_count;
595
596 struct sym_entry                *sym_filter_entry;
597
598 static struct sym_entry         sym_table[MAX_SYMS];
599
600 static void show_details(struct sym_entry *sym);
601
602 /*
603  * Ordering weight: count-1 * count-2 * ... / count-n
604  */
605 static double sym_weight(const struct sym_entry *sym)
606 {
607         double weight;
608         int counter;
609
610         weight = sym->count[0];
611
612         for (counter = 1; counter < nr_counters-1; counter++)
613                 weight *= sym->count[counter];
614
615         weight /= (sym->count[counter] + 1);
616
617         return weight;
618 }
619
/*
 * qsort() comparator ordering sym_entry records by descending weight.
 *
 * Returns a negative value when the first entry is heavier, a positive value
 * when it is lighter and 0 on a tie, as qsort() requires; a comparator that
 * only ever answers 0 or 1 does not define a consistent ordering.
 */
static int compare(const void *__sym1, const void *__sym2)
{
        const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
        double weight1 = sym_weight(sym1);
        double weight2 = sym_weight(sym2);

        if (weight1 > weight2)
                return -1;
        if (weight1 < weight2)
                return 1;
        return 0;
}
626
627 static long                     events;
628 static long                     userspace_events;
629 static const char               CONSOLE_CLEAR[] = "\e[H\e[2J";
630
631 static struct sym_entry         tmp[MAX_SYMS];
632
633 static void print_sym_table(void)
634 {
635         int i, printed;
636         int counter;
637         float events_per_sec = events/delay_secs;
638         float kevents_per_sec = (events-userspace_events)/delay_secs;
639
640         events = userspace_events = 0;
641         memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
642         qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
643
644         write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
645
646         printf(
647 "------------------------------------------------------------------------------\n");
648         printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
649                 events_per_sec,
650                 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
651                 nmi ? "NMI" : "IRQ");
652
653         if (nr_counters == 1)
654                 printf("%d ", event_count[0]);
655
656         for (counter = 0; counter < nr_counters; counter++) {
657                 if (counter)
658                         printf("/");
659
660                 printf("%s", event_name(counter));
661         }
662
663         printf( "], ");
664
665         if (tid != -1)
666                 printf(" (tid: %d", tid);
667         else
668                 printf(" (all");
669
670         if (profile_cpu != -1)
671                 printf(", cpu: %d)\n", profile_cpu);
672         else {
673                 if (tid != -1)
674                         printf(")\n");
675                 else
676                         printf(", %d CPUs)\n", nr_cpus);
677         }
678
679         printf("------------------------------------------------------------------------------\n\n");
680
681         if (nr_counters == 1)
682                 printf("             events");
683         else
684                 printf("  weight     events");
685
686         printf("         RIP          kernel function\n"
687                        "  ______     ______   ________________   _______________\n\n"
688         );
689
690         printed = 0;
691         for (i = 0; i < sym_table_count; i++) {
692                 int count;
693
694                 if (nr_counters == 1) {
695                         if (printed <= 18 &&
696                                         tmp[i].count[0] >= count_filter) {
697                                 printf("%19.2f - %016llx : %s\n",
698                                   sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
699                                 printed++;
700                         }
701                 } else {
702                         if (printed <= 18 &&
703                                         tmp[i].count[0] >= count_filter) {
704                                 printf("%8.1f %10ld - %016llx : %s\n",
705                                   sym_weight(tmp + i),
706                                   tmp[i].count[0],
707                                   tmp[i].addr, tmp[i].sym);
708                                 printed++;
709                         }
710                 }
711                 /*
712                  * Add decay to the counts:
713                  */
714                 for (count = 0; count < nr_counters; count++)
715                         sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
716         }
717
718         if (sym_filter_entry)
719                 show_details(sym_filter_entry);
720
721         {
722                 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
723
724                 if (poll(&stdin_poll, 1, 0) == 1) {
725                         printf("key pressed - exiting.\n");
726                         exit(0);
727                 }
728         }
729 }
730
731 static void *display_thread(void *arg)
732 {
733         printf("KernelTop refresh period: %d seconds\n", delay_secs);
734
735         while (!sleep(delay_secs))
736                 print_sym_table();
737
738         return NULL;
739 }
740
741 static int read_symbol(FILE *in, struct sym_entry *s)
742 {
743         static int filter_match = 0;
744         char *sym, stype;
745         char str[500];
746         int rc, pos;
747
748         rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
749         if (rc == EOF)
750                 return -1;
751
752         assert(rc == 3);
753
754         /* skip until end of line: */
755         pos = strlen(str);
756         do {
757                 rc = fgetc(in);
758                 if (rc == '\n' || rc == EOF || pos >= 499)
759                         break;
760                 str[pos] = rc;
761                 pos++;
762         } while (1);
763         str[pos] = 0;
764
765         sym = str;
766
767         /* Filter out known duplicates and non-text symbols. */
768         if (!strcmp(sym, "_text"))
769                 return 1;
770         if (!min_ip && !strcmp(sym, "_stext"))
771                 return 1;
772         if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
773                 return 1;
774         if (stype != 'T' && stype != 't')
775                 return 1;
776         if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
777                 return 1;
778         if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
779                 return 1;
780
781         s->sym = malloc(strlen(str));
782         assert(s->sym);
783
784         strcpy((char *)s->sym, str);
785         s->skip = 0;
786
787         /* Tag events to be skipped. */
788         if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
789                 s->skip = 1;
790         else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
791                 s->skip = 1;
792         else if (!strcmp("mwait_idle", s->sym))
793                 s->skip = 1;
794
795         if (filter_match == 1) {
796                 filter_end = s->addr;
797                 filter_match = -1;
798                 if (filter_end - filter_start > 10000) {
799                         printf("hm, too large filter symbol <%s> - skipping.\n",
800                                 sym_filter);
801                         printf("symbol filter start: %016lx\n", filter_start);
802                         printf("                end: %016lx\n", filter_end);
803                         filter_end = filter_start = 0;
804                         sym_filter = NULL;
805                         sleep(1);
806                 }
807         }
808         if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
809                 filter_match = 1;
810                 filter_start = s->addr;
811         }
812
813         return 0;
814 }
815
816 int compare_addr(const void *__sym1, const void *__sym2)
817 {
818         const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
819
820         return sym1->addr > sym2->addr;
821 }
822
823 static void sort_symbol_table(void)
824 {
825         int i, dups;
826
827         do {
828                 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
829                 for (i = 0, dups = 0; i < sym_table_count; i++) {
830                         if (sym_table[i].addr == sym_table[i+1].addr) {
831                                 sym_table[i+1].addr = -1ll;
832                                 dups++;
833                         }
834                 }
835                 sym_table_count -= dups;
836         } while(dups);
837 }
838
839 static void parse_symbols(void)
840 {
841         struct sym_entry *last;
842
843         FILE *kallsyms = fopen("/proc/kallsyms", "r");
844
845         if (!kallsyms) {
846                 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
847                 exit(-1);
848         }
849
850         while (!feof(kallsyms)) {
851                 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
852                         sym_table_count++;
853                         assert(sym_table_count <= MAX_SYMS);
854                 }
855         }
856
857         sort_symbol_table();
858         min_ip = sym_table[0].addr;
859         max_ip = sym_table[sym_table_count-1].addr;
860         last = sym_table + sym_table_count++;
861
862         last->addr = -1ll;
863         last->sym = "<end>";
864
865         if (filter_end) {
866                 int count;
867                 for (count=0; count < sym_table_count; count ++) {
868                         if (!strcmp(sym_table[count].sym, sym_filter)) {
869                                 sym_filter_entry = &sym_table[count];
870                                 break;
871                         }
872                 }
873         }
874         if (dump_symtab) {
875                 int i;
876
877                 for (i = 0; i < sym_table_count; i++)
878                         fprintf(stderr, "%llx %s\n",
879                                 sym_table[i].addr, sym_table[i].sym);
880         }
881 }
882
883 /*
884  * Source lines
885  */
886
887 static void parse_vmlinux(char *filename)
888 {
889         FILE *file;
890         char command[PATH_MAX*2];
891         if (!filename)
892                 return;
893
894         sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
895
896         file = popen(command, "r");
897         if (!file)
898                 return;
899
900         lines_tail = &lines;
901         while (!feof(file)) {
902                 struct source_line *src;
903                 size_t dummy = 0;
904                 char *c;
905
906                 src = malloc(sizeof(struct source_line));
907                 assert(src != NULL);
908                 memset(src, 0, sizeof(struct source_line));
909
910                 if (getline(&src->line, &dummy, file) < 0)
911                         break;
912                 if (!src->line)
913                         break;
914
915                 c = strchr(src->line, '\n');
916                 if (c)
917                         *c = 0;
918
919                 src->next = NULL;
920                 *lines_tail = src;
921                 lines_tail = &src->next;
922
923                 if (strlen(src->line)>8 && src->line[8] == ':')
924                         src->EIP = strtoull(src->line, NULL, 16);
925                 if (strlen(src->line)>8 && src->line[16] == ':')
926                         src->EIP = strtoull(src->line, NULL, 16);
927         }
928         pclose(file);
929 }
930
931 static void record_precise_ip(uint64_t ip)
932 {
933         struct source_line *line;
934
935         for (line = lines; line; line = line->next) {
936                 if (line->EIP == ip)
937                         line->count++;
938                 if (line->EIP > ip)
939                         break;
940         }
941 }
942
943 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
944 {
945         struct source_line *line;
946         char pattern[PATH_MAX];
947         sprintf(pattern, "<%s>:", sym->sym);
948
949         for (line = lines; line; line = line->next) {
950                 if (strstr(line->line, pattern)) {
951                         sym->source = line;
952                         break;
953                 }
954         }
955 }
956
957 static void show_lines(struct source_line *line_queue, int line_queue_count)
958 {
959         int i;
960         struct source_line *line;
961
962         line = line_queue;
963         for (i = 0; i < line_queue_count; i++) {
964                 printf("%8li\t%s\n", line->count, line->line);
965                 line = line->next;
966         }
967 }
968
969 #define TRACE_COUNT     3
970
971 static void show_details(struct sym_entry *sym)
972 {
973         struct source_line *line;
974         struct source_line *line_queue = NULL;
975         int displayed = 0;
976         int line_queue_count = 0;
977
978         if (!sym->source)
979                 lookup_sym_in_vmlinux(sym);
980         if (!sym->source)
981                 return;
982
983         printf("Showing details for %s\n", sym->sym);
984
985         line = sym->source;
986         while (line) {
987                 if (displayed && strstr(line->line, ">:"))
988                         break;
989
990                 if (!line_queue_count)
991                         line_queue = line;
992                 line_queue_count ++;
993
994                 if (line->count >= count_filter) {
995                         show_lines(line_queue, line_queue_count);
996                         line_queue_count = 0;
997                         line_queue = NULL;
998                 } else if (line_queue_count > TRACE_COUNT) {
999                         line_queue = line_queue->next;
1000                         line_queue_count --;
1001                 }
1002
1003                 line->count = 0;
1004                 displayed++;
1005                 if (displayed > 300)
1006                         break;
1007                 line = line->next;
1008         }
1009 }
1010
1011 /*
1012  * Binary search in the histogram table and record the hit:
1013  */
1014 static void record_ip(uint64_t ip, int counter)
1015 {
1016         int left_idx, middle_idx, right_idx, idx;
1017         unsigned long left, middle, right;
1018
1019         record_precise_ip(ip);
1020
1021         left_idx = 0;
1022         right_idx = sym_table_count-1;
1023         assert(ip <= max_ip && ip >= min_ip);
1024
1025         while (left_idx + 1 < right_idx) {
1026                 middle_idx = (left_idx + right_idx) / 2;
1027
1028                 left   = sym_table[  left_idx].addr;
1029                 middle = sym_table[middle_idx].addr;
1030                 right  = sym_table[ right_idx].addr;
1031
1032                 if (!(left <= middle && middle <= right)) {
1033                         printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
1034                         printf("%d %d %d\n", left_idx, middle_idx, right_idx);
1035                 }
1036                 assert(left <= middle && middle <= right);
1037                 if (!(left <= ip && ip <= right)) {
1038                         printf(" left: %016lx\n", left);
1039                         printf("   ip: %016lx\n", (unsigned long)ip);
1040                         printf("right: %016lx\n", right);
1041                 }
1042                 assert(left <= ip && ip <= right);
1043                 /*
1044                  * [ left .... target .... middle .... right ]
1045                  *   => right := middle
1046                  */
1047                 if (ip < middle) {
1048                         right_idx = middle_idx;
1049                         continue;
1050                 }
1051                 /*
1052                  * [ left .... middle ... target ... right ]
1053                  *   => left := middle
1054                  */
1055                 left_idx = middle_idx;
1056         }
1057
1058         idx = left_idx;
1059
1060         if (!sym_table[idx].skip)
1061                 sym_table[idx].count[counter]++;
1062         else events--;
1063 }
1064
1065 static void process_event(uint64_t ip, int counter)
1066 {
1067         events++;
1068
1069         if (ip < min_ip || ip > max_ip) {
1070                 userspace_events++;
1071                 return;
1072         }
1073
1074         record_ip(ip, counter);
1075 }
1076
1077 static void process_options(int argc, char *argv[])
1078 {
1079         int error = 0, counter;
1080
1081         if (strstr(argv[0], "perfstat"))
1082                 run_perfstat = 1;
1083
1084         for (;;) {
1085                 int option_index = 0;
1086                 /** Options for getopt */
1087                 static struct option long_options[] = {
1088                         {"count",       required_argument,      NULL, 'c'},
1089                         {"cpu",         required_argument,      NULL, 'C'},
1090                         {"delay",       required_argument,      NULL, 'd'},
1091                         {"dump_symtab", no_argument,            NULL, 'D'},
1092                         {"event",       required_argument,      NULL, 'e'},
1093                         {"filter",      required_argument,      NULL, 'f'},
1094                         {"group",       required_argument,      NULL, 'g'},
1095                         {"help",        no_argument,            NULL, 'h'},
1096                         {"nmi",         required_argument,      NULL, 'n'},
1097                         {"mmap_info",   no_argument,            NULL, 'M'},
1098                         {"mmap_pages",  required_argument,      NULL, 'm'},
1099                         {"munmap_info", no_argument,            NULL, 'U'},
1100                         {"pid",         required_argument,      NULL, 'p'},
1101                         {"realtime",    required_argument,      NULL, 'r'},
1102                         {"scale",       no_argument,            NULL, 'l'},
1103                         {"symbol",      required_argument,      NULL, 's'},
1104                         {"stat",        no_argument,            NULL, 'S'},
1105                         {"vmlinux",     required_argument,      NULL, 'x'},
1106                         {"zero",        no_argument,            NULL, 'z'},
1107                         {NULL,          0,                      NULL,  0 }
1108                 };
1109                 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
1110                                     long_options, &option_index);
1111                 if (c == -1)
1112                         break;
1113
1114                 switch (c) {
1115                 case 'a': system_wide                   =              1; break;
1116                 case 'c': default_interval              =   atoi(optarg); break;
1117                 case 'C':
1118                         /* CPU and PID are mutually exclusive */
1119                         if (tid != -1) {
1120                                 printf("WARNING: CPU switch overriding PID\n");
1121                                 sleep(1);
1122                                 tid = -1;
1123                         }
1124                         profile_cpu                     =   atoi(optarg); break;
1125                 case 'd': delay_secs                    =   atoi(optarg); break;
1126                 case 'D': dump_symtab                   =              1; break;
1127
1128                 case 'e': error                         = parse_events(optarg); break;
1129
1130                 case 'f': count_filter                  =   atoi(optarg); break;
1131                 case 'g': group                         =   atoi(optarg); break;
1132                 case 'h':                                 display_help(); break;
1133                 case 'l': scale                         =              1; break;
1134                 case 'n': nmi                           =   atoi(optarg); break;
1135                 case 'p':
1136                         /* CPU and PID are mutually exclusive */
1137                         if (profile_cpu != -1) {
1138                                 printf("WARNING: PID switch overriding CPU\n");
1139                                 sleep(1);
1140                                 profile_cpu = -1;
1141                         }
1142                         tid                             =   atoi(optarg); break;
1143                 case 'r': realtime_prio                 =   atoi(optarg); break;
1144                 case 's': sym_filter                    = strdup(optarg); break;
1145                 case 'S': run_perfstat                  =              1; break;
1146                 case 'x': vmlinux                       = strdup(optarg); break;
1147                 case 'z': zero                          =              1; break;
1148                 case 'm': mmap_pages                    =   atoi(optarg); break;
1149                 case 'M': use_mmap                      =              1; break;
1150                 case 'U': use_munmap                    =              1; break;
1151                 default: error = 1; break;
1152                 }
1153         }
1154         if (error)
1155                 display_help();
1156
1157         if (!nr_counters) {
1158                 if (run_perfstat)
1159                         nr_counters = 8;
1160                 else {
1161                         nr_counters = 1;
1162                         event_id[0] = 0;
1163                 }
1164         }
1165
1166         for (counter = 0; counter < nr_counters; counter++) {
1167                 if (event_count[counter])
1168                         continue;
1169
1170                 event_count[counter] = default_interval;
1171         }
1172 }
1173
/*
 * Reader-side state for one mmap()ed counter buffer.
 */
struct mmap_data {
        /* Index of the counter this buffer belongs to; passed on with each sample. */
        int counter;
        /* Start of the mapping: one control page followed by the data area. */
        void *base;
        /* Size of the data area minus one; used to wrap offsets into it. */
        unsigned int mask;
        /* Position up to which the data area has already been consumed. */
        unsigned int prev;
};
1180
1181 static unsigned int mmap_read_head(struct mmap_data *md)
1182 {
1183         struct perf_counter_mmap_page *pc = md->base;
1184         int head;
1185
1186         head = pc->data_head;
1187         rmb();
1188
1189         return head;
1190 }
1191
1192 struct timeval last_read, this_read;
1193
1194 static void mmap_read(struct mmap_data *md)
1195 {
1196         unsigned int head = mmap_read_head(md);
1197         unsigned int old = md->prev;
1198         unsigned char *data = md->base + page_size;
1199         int diff;
1200
1201         gettimeofday(&this_read, NULL);
1202
1203         /*
1204          * If we're further behind than half the buffer, there's a chance
1205          * the writer will bite our tail and screw up the events under us.
1206          *
1207          * If we somehow ended up ahead of the head, we got messed up.
1208          *
1209          * In either case, truncate and restart at head.
1210          */
1211         diff = head - old;
1212         if (diff > md->mask / 2 || diff < 0) {
1213                 struct timeval iv;
1214                 unsigned long msecs;
1215
1216                 timersub(&this_read, &last_read, &iv);
1217                 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
1218
1219                 fprintf(stderr, "WARNING: failed to keep up with mmap data."
1220                                 "  Last read %lu msecs ago.\n", msecs);
1221
1222                 /*
1223                  * head points to a known good entry, start there.
1224                  */
1225                 old = head;
1226         }
1227
1228         last_read = this_read;
1229
1230         for (; old != head;) {
1231                 struct ip_event {
1232                         struct perf_event_header header;
1233                         __u64 ip;
1234                         __u32 pid, tid;
1235                 };
1236                 struct mmap_event {
1237                         struct perf_event_header header;
1238                         __u32 pid, tid;
1239                         __u64 start;
1240                         __u64 len;
1241                         __u64 pgoff;
1242                         char filename[PATH_MAX];
1243                 };
1244
1245                 typedef union event_union {
1246                         struct perf_event_header header;
1247                         struct ip_event ip;
1248                         struct mmap_event mmap;
1249                 } event_t;
1250
1251                 event_t *event = (event_t *)&data[old & md->mask];
1252
1253                 event_t event_copy;
1254
1255                 unsigned int size = event->header.size;
1256
1257                 /*
1258                  * Event straddles the mmap boundary -- header should always
1259                  * be inside due to u64 alignment of output.
1260                  */
1261                 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1262                         unsigned int offset = old;
1263                         unsigned int len = min(sizeof(*event), size), cpy;
1264                         void *dst = &event_copy;
1265
1266                         do {
1267                                 cpy = min(md->mask + 1 - (offset & md->mask), len);
1268                                 memcpy(dst, &data[offset & md->mask], cpy);
1269                                 offset += cpy;
1270                                 dst += cpy;
1271                                 len -= cpy;
1272                         } while (len);
1273
1274                         event = &event_copy;
1275                 }
1276
1277                 old += size;
1278
1279                 switch (event->header.type) {
1280                 case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP:
1281                 case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
1282                         process_event(event->ip.ip, md->counter);
1283                         break;
1284
1285                 case PERF_EVENT_MMAP:
1286                 case PERF_EVENT_MUNMAP:
1287                         printf("%s: %Lu %Lu %Lu %s\n",
1288                                         event->header.type == PERF_EVENT_MMAP
1289                                           ? "mmap" : "munmap",
1290                                         event->mmap.start,
1291                                         event->mmap.len,
1292                                         event->mmap.pgoff,
1293                                         event->mmap.filename);
1294                         break;
1295                 }
1296         }
1297
1298         md->prev = old;
1299 }
1300
1301 int main(int argc, char *argv[])
1302 {
1303         struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1304         struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1305         struct perf_counter_hw_event hw_event;
1306         pthread_t thread;
1307         int i, counter, group_fd, nr_poll = 0;
1308         unsigned int cpu;
1309         int ret;
1310
1311         page_size = sysconf(_SC_PAGE_SIZE);
1312
1313         process_options(argc, argv);
1314
1315         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1316         assert(nr_cpus <= MAX_NR_CPUS);
1317         assert(nr_cpus >= 0);
1318
1319         if (run_perfstat)
1320                 return do_perfstat(argc, argv);
1321
1322         if (tid != -1 || profile_cpu != -1)
1323                 nr_cpus = 1;
1324
1325         parse_symbols();
1326         if (vmlinux && sym_filter_entry)
1327                 parse_vmlinux(vmlinux);
1328
1329         for (i = 0; i < nr_cpus; i++) {
1330                 group_fd = -1;
1331                 for (counter = 0; counter < nr_counters; counter++) {
1332
1333                         cpu     = profile_cpu;
1334                         if (tid == -1 && profile_cpu == -1)
1335                                 cpu = i;
1336
1337                         memset(&hw_event, 0, sizeof(hw_event));
1338                         hw_event.config         = event_id[counter];
1339                         hw_event.irq_period     = event_count[counter];
1340                         hw_event.record_type    = PERF_RECORD_IP | PERF_RECORD_TID;
1341                         hw_event.nmi            = nmi;
1342                         hw_event.mmap           = use_mmap;
1343                         hw_event.munmap         = use_munmap;
1344
1345                         fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1346                         if (fd[i][counter] < 0) {
1347                                 int err = errno;
1348                                 printf("kerneltop error: syscall returned with %d (%s)\n",
1349                                         fd[i][counter], strerror(err));
1350                                 if (err == EPERM)
1351                                         printf("Are you root?\n");
1352                                 exit(-1);
1353                         }
1354                         assert(fd[i][counter] >= 0);
1355                         fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1356
1357                         /*
1358                          * First counter acts as the group leader:
1359                          */
1360                         if (group && group_fd == -1)
1361                                 group_fd = fd[i][counter];
1362
1363                         event_array[nr_poll].fd = fd[i][counter];
1364                         event_array[nr_poll].events = POLLIN;
1365                         nr_poll++;
1366
1367                         mmap_array[i][counter].counter = counter;
1368                         mmap_array[i][counter].prev = 0;
1369                         mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1370                         mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1371                                         PROT_READ, MAP_SHARED, fd[i][counter], 0);
1372                         if (mmap_array[i][counter].base == MAP_FAILED) {
1373                                 printf("kerneltop error: failed to mmap with %d (%s)\n",
1374                                                 errno, strerror(errno));
1375                                 exit(-1);
1376                         }
1377                 }
1378         }
1379
1380         if (pthread_create(&thread, NULL, display_thread, NULL)) {
1381                 printf("Could not create display thread.\n");
1382                 exit(-1);
1383         }
1384
1385         if (realtime_prio) {
1386                 struct sched_param param;
1387
1388                 param.sched_priority = realtime_prio;
1389                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1390                         printf("Could not set realtime priority.\n");
1391                         exit(-1);
1392                 }
1393         }
1394
1395         while (1) {
1396                 int hits = events;
1397
1398                 for (i = 0; i < nr_cpus; i++) {
1399                         for (counter = 0; counter < nr_counters; counter++)
1400                                 mmap_read(&mmap_array[i][counter]);
1401                 }
1402
1403                 if (hits == events)
1404                         ret = poll(event_array, nr_poll, 100);
1405         }
1406
1407         return 0;
1408 }