]> git.karo-electronics.de Git - karo-tx-linux.git/blob - Documentation/perf_counter/kerneltop.c
2ab29b5e32e8d8258eb97ee1341ddffe32e41c99
[karo-tx-linux.git] / Documentation / perf_counter / kerneltop.c
1 /*
2  * kerneltop.c: show top kernel functions - performance counters showcase
3
4    Build with:
5
6      cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
7
8    Sample output:
9
10 ------------------------------------------------------------------------------
11  KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
12 ------------------------------------------------------------------------------
13
14              weight         RIP          kernel function
15              ______   ________________   _______________
16
17               35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18               33.00 - ffffffff804cb740 : sock_alloc_send_skb
19               31.26 - ffffffff804ce808 : skb_push
20               22.43 - ffffffff80510004 : tcp_established_options
21               19.00 - ffffffff8027d250 : find_get_page
22               15.76 - ffffffff804e4fc9 : eth_type_trans
23               15.20 - ffffffff804d8baa : dst_release
24               14.86 - ffffffff804cf5d8 : skb_release_head_state
25               14.00 - ffffffff802217d5 : read_hpet
26               12.00 - ffffffff804ffb7f : __ip_local_out
27               11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28                8.54 - ffffffff805001a3 : ip_queue_xmit
29  */
30
31 /*
32  * perfstat:  /usr/bin/time -alike performance counter statistics utility
33
34           It summarizes the counter events of all tasks (and child tasks),
35           covering all CPUs that the command (or workload) executes on.
36           It only counts the per-task events of the workload started,
37           independent of how many other tasks run on those CPUs.
38
39    Sample output:
40
41    $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43    Performance counter stats for 'ls':
44
45            163516953 instructions
46                 2295 cache-misses
47              2855182 branch-misses
48  */
49
50  /*
51   * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52   *
53   * Improvements and fixes by:
54   *
55   *   Arjan van de Ven <arjan@linux.intel.com>
56   *   Yanmin Zhang <yanmin.zhang@intel.com>
57   *   Wu Fengguang <fengguang.wu@intel.com>
58   *   Mike Galbraith <efault@gmx.de>
59   *
60   * Released under the GPL v2. (and only v2, not any later version)
61   */
62
63 #define _GNU_SOURCE
64 #include <sys/types.h>
65 #include <sys/stat.h>
66 #include <sys/time.h>
67 #include <unistd.h>
68 #include <stdint.h>
69 #include <stdlib.h>
70 #include <string.h>
71 #include <getopt.h>
72 #include <assert.h>
73 #include <fcntl.h>
74 #include <stdio.h>
75 #include <errno.h>
76 #include <ctype.h>
77 #include <time.h>
78
79 #include <glib.h>
80
81 #include <sys/syscall.h>
82 #include <sys/ioctl.h>
83 #include <sys/poll.h>
84 #include <sys/prctl.h>
85 #include <sys/wait.h>
86 #include <sys/uio.h>
87 #include <sys/mman.h>
88
89 #include <linux/unistd.h>
90
91 #include "../../include/linux/perf_counter.h"
92
93
94 /*
95  * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
96  * counters in the current task.
97  */
98 #define PR_TASK_PERF_COUNTERS_DISABLE   31
99 #define PR_TASK_PERF_COUNTERS_ENABLE    32
100
101 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
102
103 #define rdclock()                                       \
104 ({                                                      \
105         struct timespec ts;                             \
106                                                         \
107         clock_gettime(CLOCK_MONOTONIC, &ts);            \
108         ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
109 })
110
111 /*
112  * Pick up some kernel type conventions:
113  */
114 #define __user
115 #define asmlinkage
116
117 typedef unsigned int            __u32;
118 typedef unsigned long long      __u64;
119 typedef long long               __s64;
120
121
/*
 * Per-architecture glue: the syscall number of perf_counter_open plus a
 * read memory barrier and a busy-wait hint.
 *
 * The macros deliberately do NOT end in a semicolon, so that
 * "if (x) cpu_relax(); else ..." parses as a single statement.
 */
#ifdef __x86_64__
#define __NR_perf_counter_open 295
#define rmb()           asm volatile("lfence" ::: "memory")
#define cpu_relax()     asm volatile("rep; nop" ::: "memory")
#endif

#ifdef __i386__
#define __NR_perf_counter_open 333
#define rmb()           asm volatile("lfence" ::: "memory")
#define cpu_relax()     asm volatile("rep; nop" ::: "memory")
#endif

#ifdef __powerpc__
#define __NR_perf_counter_open 319
#define rmb()           asm volatile ("sync" ::: "memory")
#define cpu_relax()     asm volatile ("" ::: "memory")
#endif
139
140 #define unlikely(x)     __builtin_expect(!!(x), 0)
141
142 asmlinkage int sys_perf_counter_open(
143         struct perf_counter_hw_event    *hw_event_uptr          __user,
144         pid_t                           pid,
145         int                             cpu,
146         int                             group_fd,
147         unsigned long                   flags)
148 {
149         int ret;
150
151         ret = syscall(
152                 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
153 #if defined(__x86_64__) || defined(__i386__)
154         if (ret < 0 && ret > -4096) {
155                 errno = -ret;
156                 ret = -1;
157         }
158 #endif
159         return ret;
160 }
161
162 #define MAX_COUNTERS                    64
163 #define MAX_NR_CPUS                     256
164
165 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
166
167 static int                      run_perfstat                    =  0;
168 static int                      system_wide                     =  0;
169
170 static int                      nr_counters                     =  0;
171 static __u64                    event_id[MAX_COUNTERS]          = {
172         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
173         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
174         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
175         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
176
177         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
178         EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
179         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
180         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
181 };
182 static int                      default_interval = 100000;
183 static int                      event_count[MAX_COUNTERS];
184 static int                      fd[MAX_NR_CPUS][MAX_COUNTERS];
185
186 static __u64                    count_filter                   = 100;
187
188 static int                      tid                             = -1;
189 static int                      profile_cpu                     = -1;
190 static int                      nr_cpus                         =  0;
191 static int                      nmi                             =  1;
192 static int                      group                           =  0;
193 static unsigned int             page_size;
194
195 static char                     *vmlinux;
196
197 static char                     *sym_filter;
198 static unsigned long            filter_start;
199 static unsigned long            filter_end;
200
201 static int                      delay_secs                      =  2;
202 static int                      zero;
203 static int                      dump_symtab;
204
205 static GList                    *lines;
206
/*
 * One line of objdump output kept for the annotated (-s) view.
 */
struct source_line {
        uint64_t                EIP;    /* address parsed from the line, 0 if it has none */
        unsigned long           count;  /* samples that hit exactly this address */
        char                    *line;  /* text of the line, newline stripped */
};
212
213
214 const unsigned int default_count[] = {
215         1000000,
216         1000000,
217           10000,
218           10000,
219         1000000,
220           10000,
221 };
222
223 static char *hw_event_names[] = {
224         "CPU cycles",
225         "instructions",
226         "cache references",
227         "cache misses",
228         "branches",
229         "branch misses",
230         "bus cycles",
231 };
232
233 static char *sw_event_names[] = {
234         "cpu clock ticks",
235         "task clock ticks",
236         "pagefaults",
237         "context switches",
238         "CPU migrations",
239         "minor faults",
240         "major faults",
241 };
242
243 struct event_symbol {
244         __u64 event;
245         char *symbol;
246 };
247
248 static struct event_symbol event_symbols[] = {
249         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cpu-cycles",           },
250         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cycles",               },
251         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),              "instructions",         },
252         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),          "cache-references",     },
253         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),              "cache-misses",         },
254         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branch-instructions",  },
255         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branches",             },
256         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),             "branch-misses",        },
257         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),                "bus-cycles",           },
258
259         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),                 "cpu-clock",            },
260         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),                "task-clock",           },
261         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "page-faults",          },
262         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "faults",               },
263         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),           "minor-faults",         },
264         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),           "major-faults",         },
265         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "context-switches",     },
266         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "cs",                   },
267         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "cpu-migrations",       },
268         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "migrations",           },
269 };
270
/*
 * Extract one bit-field from a counter config word using the matching
 * PERF_COUNTER_<name>_MASK / PERF_COUNTER_<name>_SHIFT pair.
 *
 * The argument is parenthesized so that expressions such as "a | b" are
 * masked as a whole rather than being split by operator precedence.
 */
#define __PERF_COUNTER_FIELD(config, name) \
        (((config) & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

#define PERF_COUNTER_RAW(config)        __PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)     __PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)       __PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)         __PERF_COUNTER_FIELD(config, EVENT)
278
279 static void display_events_help(void)
280 {
281         unsigned int i;
282         __u64 e;
283
284         printf(
285         " -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
286
287         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
288                 int type, id;
289
290                 e = event_symbols[i].event;
291                 type = PERF_COUNTER_TYPE(e);
292                 id = PERF_COUNTER_ID(e);
293
294                 printf("\n                             %d:%d: %-20s",
295                                 type, id, event_symbols[i].symbol);
296         }
297
298         printf("\n"
299         "                           rNNN: raw PMU events (eventsel+umask)\n\n");
300 }
301
302 static void display_perfstat_help(void)
303 {
304         printf(
305         "Usage: perfstat [<events...>] <cmd...>\n\n"
306         "PerfStat Options (up to %d event types can be specified):\n\n",
307                  MAX_COUNTERS);
308
309         display_events_help();
310
311         printf(
312         " -a                           # system-wide collection\n");
313         exit(0);
314 }
315
316 static void display_help(void)
317 {
318         if (run_perfstat)
319                 return display_perfstat_help();
320
321         printf(
322         "Usage: kerneltop [<options>]\n"
323         "   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
324         "KernelTop Options (up to %d event types can be specified at once):\n\n",
325                  MAX_COUNTERS);
326
327         display_events_help();
328
329         printf(
330         " -S        --stat             # perfstat COMMAND\n"
331         " -a                           # system-wide collection (for perfstat)\n\n"
332         " -c CNT    --count=CNT        # event period to sample\n\n"
333         " -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
334         " -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
335         " -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
336         " -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
337         " -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
338         " -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
339         " -z        --zero             # zero counts after display\n"
340         " -D        --dump_symtab      # dump symbol table to stderr on startup\n"
341         );
342
343         exit(0);
344 }
345
346 static char *event_name(int ctr)
347 {
348         __u64 config = event_id[ctr];
349         int type = PERF_COUNTER_TYPE(config);
350         int id = PERF_COUNTER_ID(config);
351         static char buf[32];
352
353         if (PERF_COUNTER_RAW(config)) {
354                 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
355                 return buf;
356         }
357
358         switch (type) {
359         case PERF_TYPE_HARDWARE:
360                 if (id < PERF_HW_EVENTS_MAX)
361                         return hw_event_names[id];
362                 return "unknown-hardware";
363
364         case PERF_TYPE_SOFTWARE:
365                 if (id < PERF_SW_EVENTS_MAX)
366                         return sw_event_names[id];
367                 return "unknown-software";
368
369         default:
370                 break;
371         }
372
373         return "unknown";
374 }
375
376 /*
377  * Each event can have multiple symbolic names.
378  * Symbolic names are (almost) exactly matched.
379  */
380 static __u64 match_event_symbols(char *str)
381 {
382         __u64 config, id;
383         int type;
384         unsigned int i;
385
386         if (sscanf(str, "r%llx", &config) == 1)
387                 return config | PERF_COUNTER_RAW_MASK;
388
389         if (sscanf(str, "%d:%llu", &type, &id) == 2)
390                 return EID(type, id);
391
392         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
393                 if (!strncmp(str, event_symbols[i].symbol,
394                              strlen(event_symbols[i].symbol)))
395                         return event_symbols[i].event;
396         }
397
398         return ~0ULL;
399 }
400
401 static int parse_events(char *str)
402 {
403         __u64 config;
404
405 again:
406         if (nr_counters == MAX_COUNTERS)
407                 return -1;
408
409         config = match_event_symbols(str);
410         if (config == ~0ULL)
411                 return -1;
412
413         event_id[nr_counters] = config;
414         nr_counters++;
415
416         str = strstr(str, ",");
417         if (str) {
418                 str++;
419                 goto again;
420         }
421
422         return 0;
423 }
424
425
426 /*
427  * perfstat
428  */
429
430 char fault_here[1000000];
431
432 static void create_perfstat_counter(int counter)
433 {
434         struct perf_counter_hw_event hw_event;
435
436         memset(&hw_event, 0, sizeof(hw_event));
437         hw_event.config         = event_id[counter];
438         hw_event.record_type    = PERF_RECORD_SIMPLE;
439         hw_event.nmi            = 0;
440
441         if (system_wide) {
442                 int cpu;
443                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
444                         fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
445                         if (fd[cpu][counter] < 0) {
446                                 printf("perfstat error: syscall returned with %d (%s)\n",
447                                                 fd[cpu][counter], strerror(errno));
448                                 exit(-1);
449                         }
450                 }
451         } else {
452                 hw_event.inherit        = 1;
453                 hw_event.disabled       = 1;
454
455                 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
456                 if (fd[0][counter] < 0) {
457                         printf("perfstat error: syscall returned with %d (%s)\n",
458                                         fd[0][counter], strerror(errno));
459                         exit(-1);
460                 }
461         }
462 }
463
464 int do_perfstat(int argc, char *argv[])
465 {
466         unsigned long long t0, t1;
467         int counter;
468         ssize_t res;
469         int status;
470         int pid;
471
472         if (!system_wide)
473                 nr_cpus = 1;
474
475         for (counter = 0; counter < nr_counters; counter++)
476                 create_perfstat_counter(counter);
477
478         argc -= optind;
479         argv += optind;
480
481         if (!argc)
482                 display_help();
483
484         /*
485          * Enable counters and exec the command:
486          */
487         t0 = rdclock();
488         prctl(PR_TASK_PERF_COUNTERS_ENABLE);
489
490         if ((pid = fork()) < 0)
491                 perror("failed to fork");
492         if (!pid) {
493                 if (execvp(argv[0], argv)) {
494                         perror(argv[0]);
495                         exit(-1);
496                 }
497         }
498         while (wait(&status) >= 0)
499                 ;
500         prctl(PR_TASK_PERF_COUNTERS_DISABLE);
501         t1 = rdclock();
502
503         fflush(stdout);
504
505         fprintf(stderr, "\n");
506         fprintf(stderr, " Performance counter stats for \'%s\':\n",
507                 argv[0]);
508         fprintf(stderr, "\n");
509
510         for (counter = 0; counter < nr_counters; counter++) {
511                 int cpu;
512                 __u64 count, single_count;
513
514                 count = 0;
515                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
516                         res = read(fd[cpu][counter],
517                                         (char *) &single_count, sizeof(single_count));
518                         assert(res == sizeof(single_count));
519                         count += single_count;
520                 }
521
522                 if (!PERF_COUNTER_RAW(event_id[counter]) &&
523                     (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
524                      event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
525
526                         double msecs = (double)count / 1000000;
527
528                         fprintf(stderr, " %14.6f  %-20s (msecs)\n",
529                                 msecs, event_name(counter));
530                 } else {
531                         fprintf(stderr, " %14Ld  %-20s (events)\n",
532                                 count, event_name(counter));
533                 }
534                 if (!counter)
535                         fprintf(stderr, "\n");
536         }
537         fprintf(stderr, "\n");
538         fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
539                         (double)(t1-t0)/1e6);
540         fprintf(stderr, "\n");
541
542         return 0;
543 }
544
545 /*
546  * Symbols
547  */
548
549 static uint64_t                 min_ip;
550 static uint64_t                 max_ip = -1ll;
551
552 struct sym_entry {
553         unsigned long long      addr;
554         char                    *sym;
555         unsigned long           count[MAX_COUNTERS];
556         int                     skip;
557         GList                   *source;
558 };
559
560 #define MAX_SYMS                100000
561
562 static int sym_table_count;
563
564 struct sym_entry                *sym_filter_entry;
565
566 static struct sym_entry         sym_table[MAX_SYMS];
567
568 static void show_details(struct sym_entry *sym);
569
570 /*
571  * Ordering weight: count-1 * count-2 * ... / count-n
572  */
573 static double sym_weight(const struct sym_entry *sym)
574 {
575         double weight;
576         int counter;
577
578         weight = sym->count[0];
579
580         for (counter = 1; counter < nr_counters-1; counter++)
581                 weight *= sym->count[counter];
582
583         weight /= (sym->count[counter] + 1);
584
585         return weight;
586 }
587
/*
 * qsort() comparator ordering sym_entry elements by descending weight.
 *
 * Returns a negative, zero or positive value as qsort() requires; a
 * comparator that only ever returns 0 or 1 reports "a equals b" and
 * "b is greater than a" for the same pair, which is not a consistent order.
 */
static int compare(const void *__sym1, const void *__sym2)
{
        const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
        double w1 = sym_weight(sym1);
        double w2 = sym_weight(sym2);

        /* heavier symbols sort first */
        return (w1 < w2) - (w1 > w2);
}
594
595 static time_t                   last_refresh;
596 static long                     events;
597 static long                     userspace_events;
598 static const char               CONSOLE_CLEAR[] = "\e[H\e[2J";
599
600 static struct sym_entry         tmp[MAX_SYMS];
601
602 static void print_sym_table(void)
603 {
604         int i, printed;
605         int counter;
606         float events_per_sec = events/delay_secs;
607         float kevents_per_sec = (events-userspace_events)/delay_secs;
608
609         memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
610         qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
611
612         write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
613
614         printf(
615 "------------------------------------------------------------------------------\n");
616         printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
617                 events_per_sec,
618                 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
619                 nmi ? "NMI" : "IRQ");
620
621         if (nr_counters == 1)
622                 printf("%d ", event_count[0]);
623
624         for (counter = 0; counter < nr_counters; counter++) {
625                 if (counter)
626                         printf("/");
627
628                 printf("%s", event_name(counter));
629         }
630
631         printf( "], ");
632
633         if (tid != -1)
634                 printf(" (tid: %d", tid);
635         else
636                 printf(" (all");
637
638         if (profile_cpu != -1)
639                 printf(", cpu: %d)\n", profile_cpu);
640         else {
641                 if (tid != -1)
642                         printf(")\n");
643                 else
644                         printf(", %d CPUs)\n", nr_cpus);
645         }
646
647         printf("------------------------------------------------------------------------------\n\n");
648
649         if (nr_counters == 1)
650                 printf("             events");
651         else
652                 printf("  weight     events");
653
654         printf("         RIP          kernel function\n"
655                        "  ______     ______   ________________   _______________\n\n"
656         );
657
658         printed = 0;
659         for (i = 0; i < sym_table_count; i++) {
660                 int count;
661
662                 if (nr_counters == 1) {
663                         if (printed <= 18 &&
664                                         tmp[i].count[0] >= count_filter) {
665                                 printf("%19.2f - %016llx : %s\n",
666                                   sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
667                                 printed++;
668                         }
669                 } else {
670                         if (printed <= 18 &&
671                                         tmp[i].count[0] >= count_filter) {
672                                 printf("%8.1f %10ld - %016llx : %s\n",
673                                   sym_weight(tmp + i),
674                                   tmp[i].count[0],
675                                   tmp[i].addr, tmp[i].sym);
676                                 printed++;
677                         }
678                 }
679                 /*
680                  * Add decay to the counts:
681                  */
682                 for (count = 0; count < nr_counters; count++)
683                         sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
684         }
685
686         if (sym_filter_entry)
687                 show_details(sym_filter_entry);
688
689         last_refresh = time(NULL);
690
691         {
692                 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
693
694                 if (poll(&stdin_poll, 1, 0) == 1) {
695                         printf("key pressed - exiting.\n");
696                         exit(0);
697                 }
698         }
699 }
700
701 static int read_symbol(FILE *in, struct sym_entry *s)
702 {
703         static int filter_match = 0;
704         char *sym, stype;
705         char str[500];
706         int rc, pos;
707
708         rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
709         if (rc == EOF)
710                 return -1;
711
712         assert(rc == 3);
713
714         /* skip until end of line: */
715         pos = strlen(str);
716         do {
717                 rc = fgetc(in);
718                 if (rc == '\n' || rc == EOF || pos >= 499)
719                         break;
720                 str[pos] = rc;
721                 pos++;
722         } while (1);
723         str[pos] = 0;
724
725         sym = str;
726
727         /* Filter out known duplicates and non-text symbols. */
728         if (!strcmp(sym, "_text"))
729                 return 1;
730         if (!min_ip && !strcmp(sym, "_stext"))
731                 return 1;
732         if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
733                 return 1;
734         if (stype != 'T' && stype != 't')
735                 return 1;
736         if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
737                 return 1;
738         if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
739                 return 1;
740
741         s->sym = malloc(strlen(str));
742         assert(s->sym);
743
744         strcpy((char *)s->sym, str);
745         s->skip = 0;
746
747         /* Tag events to be skipped. */
748         if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
749                 s->skip = 1;
750         if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
751                 s->skip = 1;
752
753         if (filter_match == 1) {
754                 filter_end = s->addr;
755                 filter_match = -1;
756                 if (filter_end - filter_start > 10000) {
757                         printf("hm, too large filter symbol <%s> - skipping.\n",
758                                 sym_filter);
759                         printf("symbol filter start: %016lx\n", filter_start);
760                         printf("                end: %016lx\n", filter_end);
761                         filter_end = filter_start = 0;
762                         sym_filter = NULL;
763                         sleep(1);
764                 }
765         }
766         if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
767                 filter_match = 1;
768                 filter_start = s->addr;
769         }
770
771         return 0;
772 }
773
774 int compare_addr(const void *__sym1, const void *__sym2)
775 {
776         const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
777
778         return sym1->addr > sym2->addr;
779 }
780
781 static void sort_symbol_table(void)
782 {
783         int i, dups;
784
785         do {
786                 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
787                 for (i = 0, dups = 0; i < sym_table_count; i++) {
788                         if (sym_table[i].addr == sym_table[i+1].addr) {
789                                 sym_table[i+1].addr = -1ll;
790                                 dups++;
791                         }
792                 }
793                 sym_table_count -= dups;
794         } while(dups);
795 }
796
797 static void parse_symbols(void)
798 {
799         struct sym_entry *last;
800
801         FILE *kallsyms = fopen("/proc/kallsyms", "r");
802
803         if (!kallsyms) {
804                 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
805                 exit(-1);
806         }
807
808         while (!feof(kallsyms)) {
809                 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
810                         sym_table_count++;
811                         assert(sym_table_count <= MAX_SYMS);
812                 }
813         }
814
815         sort_symbol_table();
816         min_ip = sym_table[0].addr;
817         max_ip = sym_table[sym_table_count-1].addr;
818         last = sym_table + sym_table_count++;
819
820         last->addr = -1ll;
821         last->sym = "<end>";
822
823         if (filter_end) {
824                 int count;
825                 for (count=0; count < sym_table_count; count ++) {
826                         if (!strcmp(sym_table[count].sym, sym_filter)) {
827                                 sym_filter_entry = &sym_table[count];
828                                 break;
829                         }
830                 }
831         }
832         if (dump_symtab) {
833                 int i;
834
835                 for (i = 0; i < sym_table_count; i++)
836                         fprintf(stderr, "%llx %s\n",
837                                 sym_table[i].addr, sym_table[i].sym);
838         }
839 }
840
841 /*
842  * Source lines
843  */
844
845 static void parse_vmlinux(char *filename)
846 {
847         FILE *file;
848         char command[PATH_MAX*2];
849         if (!filename)
850                 return;
851
852         sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
853
854         file = popen(command, "r");
855         if (!file)
856                 return;
857
858         while (!feof(file)) {
859                 struct source_line *src;
860                 size_t dummy = 0;
861                 char *c;
862
863                 src = malloc(sizeof(struct source_line));
864                 assert(src != NULL);
865                 memset(src, 0, sizeof(struct source_line));
866
867                 if (getline(&src->line, &dummy, file) < 0)
868                         break;
869                 if (!src->line)
870                         break;
871
872                 c = strchr(src->line, '\n');
873                 if (c)
874                         *c = 0;
875
876                 lines = g_list_prepend(lines, src);
877
878                 if (strlen(src->line)>8 && src->line[8] == ':')
879                         src->EIP = strtoull(src->line, NULL, 16);
880                 if (strlen(src->line)>8 && src->line[16] == ':')
881                         src->EIP = strtoull(src->line, NULL, 16);
882         }
883         pclose(file);
884         lines = g_list_reverse(lines);
885 }
886
887 static void record_precise_ip(uint64_t ip)
888 {
889         struct source_line *line;
890         GList *item;
891
892         item = g_list_first(lines);
893         while (item) {
894                 line = item->data;
895                 if (line->EIP == ip)
896                         line->count++;
897                 if (line->EIP > ip)
898                         break;
899                 item = g_list_next(item);
900         }
901 }
902
903 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
904 {
905         struct source_line *line;
906         GList *item;
907         char pattern[PATH_MAX];
908         sprintf(pattern, "<%s>:", sym->sym);
909
910         item = g_list_first(lines);
911         while (item) {
912                 line = item->data;
913                 if (strstr(line->line, pattern)) {
914                         sym->source = item;
915                         break;
916                 }
917                 item = g_list_next(item);
918         }
919 }
920
921 void show_lines(GList *item_queue, int item_queue_count)
922 {
923         int i;
924         struct source_line *line;
925
926         for (i = 0; i < item_queue_count; i++) {
927                 line = item_queue->data;
928                 printf("%8li\t%s\n", line->count, line->line);
929                 item_queue = g_list_next(item_queue);
930         }
931 }
932
933 #define TRACE_COUNT     3
934
935 static void show_details(struct sym_entry *sym)
936 {
937         struct source_line *line;
938         GList *item;
939         int displayed = 0;
940         GList *item_queue = NULL;
941         int item_queue_count = 0;
942
943         if (!sym->source)
944                 lookup_sym_in_vmlinux(sym);
945         if (!sym->source)
946                 return;
947
948         printf("Showing details for %s\n", sym->sym);
949
950         item = sym->source;
951         while (item) {
952                 line = item->data;
953                 if (displayed && strstr(line->line, ">:"))
954                         break;
955
956                 if (!item_queue_count)
957                         item_queue = item;
958                 item_queue_count ++;
959
960                 if (line->count >= count_filter) {
961                         show_lines(item_queue, item_queue_count);
962                         item_queue_count = 0;
963                         item_queue = NULL;
964                 } else if (item_queue_count > TRACE_COUNT) {
965                         item_queue = g_list_next(item_queue);
966                         item_queue_count --;
967                 }
968
969                 line->count = 0;
970                 displayed++;
971                 if (displayed > 300)
972                         break;
973                 item = g_list_next(item);
974         }
975 }
976
977 /*
978  * Binary search in the histogram table and record the hit:
979  */
980 static void record_ip(uint64_t ip, int counter)
981 {
982         int left_idx, middle_idx, right_idx, idx;
983         unsigned long left, middle, right;
984
985         record_precise_ip(ip);
986
987         left_idx = 0;
988         right_idx = sym_table_count-1;
989         assert(ip <= max_ip && ip >= min_ip);
990
991         while (left_idx + 1 < right_idx) {
992                 middle_idx = (left_idx + right_idx) / 2;
993
994                 left   = sym_table[  left_idx].addr;
995                 middle = sym_table[middle_idx].addr;
996                 right  = sym_table[ right_idx].addr;
997
998                 if (!(left <= middle && middle <= right)) {
999                         printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
1000                         printf("%d %d %d\n", left_idx, middle_idx, right_idx);
1001                 }
1002                 assert(left <= middle && middle <= right);
1003                 if (!(left <= ip && ip <= right)) {
1004                         printf(" left: %016lx\n", left);
1005                         printf("   ip: %016lx\n", (unsigned long)ip);
1006                         printf("right: %016lx\n", right);
1007                 }
1008                 assert(left <= ip && ip <= right);
1009                 /*
1010                  * [ left .... target .... middle .... right ]
1011                  *   => right := middle
1012                  */
1013                 if (ip < middle) {
1014                         right_idx = middle_idx;
1015                         continue;
1016                 }
1017                 /*
1018                  * [ left .... middle ... target ... right ]
1019                  *   => left := middle
1020                  */
1021                 left_idx = middle_idx;
1022         }
1023
1024         idx = left_idx;
1025
1026         if (!sym_table[idx].skip)
1027                 sym_table[idx].count[counter]++;
1028         else events--;
1029 }
1030
1031 static void process_event(uint64_t ip, int counter)
1032 {
1033         events++;
1034
1035         if (ip < min_ip || ip > max_ip) {
1036                 userspace_events++;
1037                 return;
1038         }
1039
1040         record_ip(ip, counter);
1041 }
1042
1043 static void process_options(int argc, char *argv[])
1044 {
1045         int error = 0, counter;
1046
1047         if (strstr(argv[0], "perfstat"))
1048                 run_perfstat = 1;
1049
1050         for (;;) {
1051                 int option_index = 0;
1052                 /** Options for getopt */
1053                 static struct option long_options[] = {
1054                         {"count",       required_argument,      NULL, 'c'},
1055                         {"cpu",         required_argument,      NULL, 'C'},
1056                         {"delay",       required_argument,      NULL, 'd'},
1057                         {"dump_symtab", no_argument,            NULL, 'D'},
1058                         {"event",       required_argument,      NULL, 'e'},
1059                         {"filter",      required_argument,      NULL, 'f'},
1060                         {"group",       required_argument,      NULL, 'g'},
1061                         {"help",        no_argument,            NULL, 'h'},
1062                         {"nmi",         required_argument,      NULL, 'n'},
1063                         {"pid",         required_argument,      NULL, 'p'},
1064                         {"vmlinux",     required_argument,      NULL, 'x'},
1065                         {"symbol",      required_argument,      NULL, 's'},
1066                         {"stat",        no_argument,            NULL, 'S'},
1067                         {"zero",        no_argument,            NULL, 'z'},
1068                         {NULL,          0,                      NULL,  0 }
1069                 };
1070                 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
1071                                     long_options, &option_index);
1072                 if (c == -1)
1073                         break;
1074
1075                 switch (c) {
1076                 case 'a': system_wide                   =              1; break;
1077                 case 'c': default_interval              =   atoi(optarg); break;
1078                 case 'C':
1079                         /* CPU and PID are mutually exclusive */
1080                         if (tid != -1) {
1081                                 printf("WARNING: CPU switch overriding PID\n");
1082                                 sleep(1);
1083                                 tid = -1;
1084                         }
1085                         profile_cpu                     =   atoi(optarg); break;
1086                 case 'd': delay_secs                    =   atoi(optarg); break;
1087                 case 'D': dump_symtab                   =              1; break;
1088
1089                 case 'e': error                         = parse_events(optarg); break;
1090
1091                 case 'f': count_filter                  =   atoi(optarg); break;
1092                 case 'g': group                         =   atoi(optarg); break;
1093                 case 'h':                                 display_help(); break;
1094                 case 'n': nmi                           =   atoi(optarg); break;
1095                 case 'p':
1096                         /* CPU and PID are mutually exclusive */
1097                         if (profile_cpu != -1) {
1098                                 printf("WARNING: PID switch overriding CPU\n");
1099                                 sleep(1);
1100                                 profile_cpu = -1;
1101                         }
1102                         tid                             =   atoi(optarg); break;
1103                 case 's': sym_filter                    = strdup(optarg); break;
1104                 case 'S': run_perfstat                  =              1; break;
1105                 case 'x': vmlinux                       = strdup(optarg); break;
1106                 case 'z': zero                          =              1; break;
1107                 default: error = 1; break;
1108                 }
1109         }
1110         if (error)
1111                 display_help();
1112
1113         if (!nr_counters) {
1114                 if (run_perfstat)
1115                         nr_counters = 8;
1116                 else {
1117                         nr_counters = 1;
1118                         event_id[0] = 0;
1119                 }
1120         }
1121
1122         for (counter = 0; counter < nr_counters; counter++) {
1123                 if (event_count[counter])
1124                         continue;
1125
1126                 event_count[counter] = default_interval;
1127         }
1128 }
1129
/*
 * Reader-side state of one mmap()ed counter buffer.
 */
struct mmap_data {
        int counter;            /* counter index handed to process_event() */
        void *base;             /* start of the mapping; sample data begins one page in */
        unsigned int mask;      /* wraps a running offset into the data area */
        unsigned int prev;      /* head position up to which data was consumed */
};
1136
1137 static unsigned int mmap_read_head(struct mmap_data *md)
1138 {
1139         struct perf_counter_mmap_page *pc = md->base;
1140         unsigned int seq, head;
1141
1142 repeat:
1143         rmb();
1144         seq = pc->lock;
1145
1146         if (unlikely(seq & 1)) {
1147                 cpu_relax();
1148                 goto repeat;
1149         }
1150
1151         head = pc->data_head;
1152
1153         rmb();
1154         if (pc->lock != seq)
1155                 goto repeat;
1156
1157         return head;
1158 }
1159
1160 static void mmap_read(struct mmap_data *md)
1161 {
1162         unsigned int head = mmap_read_head(md);
1163         unsigned int old = md->prev;
1164         unsigned char *data = md->base + page_size;
1165
1166         if (head - old > md->mask) {
1167                 printf("ERROR: failed to keep up with mmap data\n");
1168                 exit(-1);
1169         }
1170
1171         for (; old != head;) {
1172                 __u64 *ptr = (__u64 *)&data[old & md->mask];
1173                 old += sizeof(__u64);
1174
1175                 process_event(*ptr, md->counter);
1176         }
1177
1178         md->prev = old;
1179 }
1180
1181 int main(int argc, char *argv[])
1182 {
1183         struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
1184         struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1185         struct perf_counter_hw_event hw_event;
1186         int i, counter, group_fd;
1187         unsigned int cpu;
1188         int ret;
1189
1190         page_size = sysconf(_SC_PAGE_SIZE);
1191
1192         process_options(argc, argv);
1193
1194         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1195         assert(nr_cpus <= MAX_NR_CPUS);
1196         assert(nr_cpus >= 0);
1197
1198         if (run_perfstat)
1199                 return do_perfstat(argc, argv);
1200
1201         if (tid != -1 || profile_cpu != -1)
1202                 nr_cpus = 1;
1203
1204         for (i = 0; i < nr_cpus; i++) {
1205                 group_fd = -1;
1206                 for (counter = 0; counter < nr_counters; counter++) {
1207
1208                         cpu     = profile_cpu;
1209                         if (tid == -1 && profile_cpu == -1)
1210                                 cpu = i;
1211
1212                         memset(&hw_event, 0, sizeof(hw_event));
1213                         hw_event.config         = event_id[counter];
1214                         hw_event.irq_period     = event_count[counter];
1215                         hw_event.record_type    = PERF_RECORD_IRQ;
1216                         hw_event.nmi            = nmi;
1217
1218                         fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1219                         fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1220                         if (fd[i][counter] < 0) {
1221                                 printf("kerneltop error: syscall returned with %d (%s)\n",
1222                                         fd[i][counter], strerror(-fd[i][counter]));
1223                                 if (fd[i][counter] == -1)
1224                                         printf("Are you root?\n");
1225                                 exit(-1);
1226                         }
1227                         assert(fd[i][counter] >= 0);
1228
1229                         /*
1230                          * First counter acts as the group leader:
1231                          */
1232                         if (group && group_fd == -1)
1233                                 group_fd = fd[i][counter];
1234
1235                         event_array[i][counter].fd = fd[i][counter];
1236                         event_array[i][counter].events = POLLIN;
1237
1238                         mmap_array[i][counter].counter = counter;
1239                         mmap_array[i][counter].prev = 0;
1240                         mmap_array[i][counter].mask = 2*page_size - 1;
1241                         mmap_array[i][counter].base = mmap(NULL, 3*page_size,
1242                                         PROT_READ, MAP_SHARED, fd[i][counter], 0);
1243                         if (mmap_array[i][counter].base == MAP_FAILED) {
1244                                 printf("kerneltop error: failed to mmap with %d (%s)\n",
1245                                                 errno, strerror(errno));
1246                                 exit(-1);
1247                         }
1248                 }
1249         }
1250
1251         parse_symbols();
1252         if (vmlinux && sym_filter_entry)
1253                 parse_vmlinux(vmlinux);
1254
1255         printf("KernelTop refresh period: %d seconds\n", delay_secs);
1256         last_refresh = time(NULL);
1257
1258         while (1) {
1259                 int hits = events;
1260
1261                 for (i = 0; i < nr_cpus; i++) {
1262                         for (counter = 0; counter < nr_counters; counter++)
1263                                 mmap_read(&mmap_array[i][counter]);
1264                 }
1265
1266                 if (time(NULL) >= last_refresh + delay_secs) {
1267                         print_sym_table();
1268                         events = userspace_events = 0;
1269                 }
1270
1271                 if (hits == events)
1272                         ret = poll(event_array[0], nr_cpus, 1000);
1273                 hits = events;
1274         }
1275
1276         return 0;
1277 }