4 * Builtin 'trace' command:
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
16 * Released under the GPL v2. (and only v2, not any later version)
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace/beauty/beauty.h"
35 #include "trace-event.h"
36 #include "util/parse-events.h"
37 #include "util/bpf-loader.h"
38 #include "callchain.h"
39 #include "syscalltbl.h"
40 #include "rb_resort.h"
42 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
45 #include <linux/err.h>
46 #include <linux/filter.h>
47 #include <linux/audit.h>
48 #include <linux/kernel.h>
49 #include <linux/random.h>
50 #include <linux/stringify.h>
51 #include <linux/time64.h>
54 # define O_CLOEXEC 02000000
/*
 * Fields of the main 'struct trace' state (the struct header is not visible
 * in this chunk -- NOTE(review): confirm member grouping against full file).
 */
58 struct perf_tool tool; /* callbacks used when processing perf events */
59 struct syscalltbl *sctbl; /* syscall id <-> name translation table */
62 struct syscall *table; /* per-syscall-id descriptors, filled lazily */
64 struct perf_evsel *sys_enter, /* raw_syscalls:sys_enter tracepoint evsel */
68 struct record_opts opts; /* recording options (target, timeouts, ...) */
69 struct perf_evlist *evlist; /* the set of events being traced */
71 struct thread *current; /* thread of the most recently handled sample */
74 unsigned long nr_events; /* total events processed so far */
75 struct strlist *ev_qualifier; /* syscall names to trace (or to skip, see not_ev_qualifier) */
84 double duration_filter; /* only print syscalls lasting longer than this (ms) */
90 unsigned int max_stack; /* callchain depth upper bound */
91 unsigned int min_stack; /* callchain depth lower bound */
92 bool not_ev_qualifier; /* invert ev_qualifier: trace everything BUT those */
96 bool multiple_threads; /* tracing >1 thread -> prefix output lines with tid */
100 bool show_tool_stats; /* print internal tool statistics at the end */
102 bool kernel_syscallchains; /* include kernel frames in syscall callchains */
/*
 * Accessors for one tracepoint field: read it out of the sample's raw_data
 * either as a zero-extended integer or as a raw pointer into the payload.
 */
112 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
113 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
/*
 * TP_UINT_FIELD(bits) generates tp_field__u<bits>(): memcpy the value out of
 * sample->raw_data at the field's recorded offset, returned widened to u64.
 * (Part of the macro body is not visible in this chunk.)
 */
117 #define TP_UINT_FIELD(bits) \
118 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
121 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
/*
 * Byte-swapping variant for perf.data files recorded on a machine with the
 * opposite endianness.
 */
130 #define TP_UINT_FIELD__SWAPPED(bits) \
131 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
134 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
135 return bswap_##bits(value);\
/* Instantiate the swapped readers for 16/32/64 bit fields. */
138 TP_UINT_FIELD__SWAPPED(16);
139 TP_UINT_FIELD__SWAPPED(32);
140 TP_UINT_FIELD__SWAPPED(64);
/*
 * Bind the appropriate integer reader to 'field' based on the field size in
 * the tracepoint format, picking the byte-swapping reader when the data was
 * recorded with the other endianness.  (case labels/default/braces are not
 * visible in this chunk.)
 */
142 static int tp_field__init_uint(struct tp_field *field,
143 struct format_field *format_field,
146 field->offset = format_field->offset;
148 switch (format_field->size) {
150 field->integer = tp_field__u8; /* single byte: endianness irrelevant */
153 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
156 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
159 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
/* Return a pointer into the raw sample payload at this field's offset. */
168 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
170 return sample->raw_data + field->offset;
/* Bind the pointer accessor; pointer fields need no endianness handling. */
173 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
175 field->offset = format_field->offset;
176 field->pointer = tp_field__ptr;
183 struct tp_field args, ret; /* syscall arguments payload and return value fields */
/*
 * Look up 'name' in the evsel's tracepoint format and set up an integer
 * reader for it; fails when the format has no such field.
 */
187 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
188 struct tp_field *field,
191 struct format_field *format_field = perf_evsel__field(evsel, name);
193 if (format_field == NULL)
196 return tp_field__init_uint(field, format_field, evsel->needs_swap);
/* Convenience: init the syscall_tp member named exactly like the TP field. */
199 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
200 ({ struct syscall_tp *sc = evsel->priv;\
201 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
/* As above, but for pointer (payload) fields. */
203 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
204 struct tp_field *field,
207 struct format_field *format_field = perf_evsel__field(evsel, name);
209 if (format_field == NULL)
212 return tp_field__init_ptr(field, format_field);
215 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
216 ({ struct syscall_tp *sc = evsel->priv;\
217 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
/* Free the evsel together with its private syscall_tp state. */
219 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
222 perf_evsel__delete(evsel);
/*
 * Allocate the per-evsel syscall_tp, wire up the common 'id' field reader
 * and install the sample handler.
 */
225 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
227 evsel->priv = malloc(sizeof(struct syscall_tp));
228 if (evsel->priv != NULL) {
229 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
232 evsel->handler = handler;
/*
 * Create the sys_enter/sys_exit tracepoint evsel, preferring raw_syscalls:*
 * and falling back to syscalls:* on older kernels.
 */
243 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
245 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
247 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
249 evsel = perf_evsel__newtp("syscalls", direction);
254 if (perf_evsel__init_syscall_tp(evsel, handler))
260 perf_evsel__delete_priv(evsel);
/* Read the named syscall tracepoint field of 'sample' as integer/pointer. */
264 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
265 ({ struct syscall_tp *fields = evsel->priv; \
266 fields->name.integer(&fields->name, sample); })
268 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
269 ({ struct syscall_tp *fields = evsel->priv; \
270 fields->name.pointer(&fields->name, sample); })
275 const char **entries;
278 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
279 .nr_entries = ARRAY_SIZE(array), \
283 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
285 .nr_entries = ARRAY_SIZE(array), \
289 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
291 struct syscall_arg *arg)
293 struct strarray *sa = arg->parm;
294 int idx = arg->val - sa->offset;
296 if (idx < 0 || idx >= sa->nr_entries)
297 return scnprintf(bf, size, intfmt, arg->val);
299 return scnprintf(bf, size, "%s", sa->entries[idx]);
302 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
303 struct syscall_arg *arg)
305 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
308 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
310 #if defined(__i386__) || defined(__x86_64__)
312 * FIXME: Make this available to all arches as soon as the ioctl beautifier
313 * gets rewritten to support all arches.
315 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
316 struct syscall_arg *arg)
318 return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
321 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
322 #endif /* defined(__i386__) || defined(__x86_64__) */
324 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
325 struct syscall_arg *arg);
327 #define SCA_FD syscall_arg__scnprintf_fd
330 #define AT_FDCWD -100
333 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
334 struct syscall_arg *arg)
339 return scnprintf(bf, size, "CWD");
341 return syscall_arg__scnprintf_fd(bf, size, arg);
344 #define SCA_FDAT syscall_arg__scnprintf_fd_at
346 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
347 struct syscall_arg *arg);
349 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
351 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
352 struct syscall_arg *arg)
354 return scnprintf(bf, size, "%#lx", arg->val);
357 #define SCA_HEX syscall_arg__scnprintf_hex
359 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
360 struct syscall_arg *arg)
362 return scnprintf(bf, size, "%d", arg->val);
365 #define SCA_INT syscall_arg__scnprintf_int
367 static const char *bpf_cmd[] = {
368 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
369 "MAP_GET_NEXT_KEY", "PROG_LOAD",
371 static DEFINE_STRARRAY(bpf_cmd);
373 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
374 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
376 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
377 static DEFINE_STRARRAY(itimers);
379 static const char *keyctl_options[] = {
380 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
381 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
382 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
383 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
384 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
386 static DEFINE_STRARRAY(keyctl_options);
388 static const char *whences[] = { "SET", "CUR", "END",
396 static DEFINE_STRARRAY(whences);
398 static const char *fcntl_cmds[] = {
399 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
400 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
401 "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
404 static DEFINE_STRARRAY(fcntl_cmds);
406 static const char *rlimit_resources[] = {
407 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
408 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
411 static DEFINE_STRARRAY(rlimit_resources);
413 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
414 static DEFINE_STRARRAY(sighow);
416 static const char *clockid[] = {
417 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
418 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
419 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
421 static DEFINE_STRARRAY(clockid);
423 static const char *socket_families[] = {
424 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
425 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
426 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
427 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
428 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
429 "ALG", "NFC", "VSOCK",
431 static DEFINE_STRARRAY(socket_families);
433 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
434 struct syscall_arg *arg)
439 if (mode == F_OK) /* 0 */
440 return scnprintf(bf, size, "F");
442 if (mode & n##_OK) { \
443 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
453 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
458 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
460 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
461 struct syscall_arg *arg);
463 #define SCA_FILENAME syscall_arg__scnprintf_filename
465 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
466 struct syscall_arg *arg)
468 int printed = 0, flags = arg->val;
471 if (flags & O_##n) { \
472 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
481 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
486 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
488 #if defined(__i386__) || defined(__x86_64__)
490 * FIXME: Make this available to all arches.
492 #define TCGETS 0x5401
494 static const char *tioctls[] = {
495 "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
496 "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
497 "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
498 "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
499 "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
500 "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
501 "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
502 "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
503 "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
504 "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
505 "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
506 [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
507 "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
508 "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
509 "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
512 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
513 #endif /* defined(__i386__) || defined(__x86_64__) */
515 #ifndef GRND_NONBLOCK
516 #define GRND_NONBLOCK 0x0001
519 #define GRND_RANDOM 0x0002
522 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
523 struct syscall_arg *arg)
525 int printed = 0, flags = arg->val;
528 if (flags & GRND_##n) { \
529 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
530 flags &= ~GRND_##n; \
538 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
543 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
545 #define STRARRAY(arg, name, array) \
546 .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
547 .arg_parm = { [arg] = &strarray__##array, }
549 #include "trace/beauty/eventfd.c"
550 #include "trace/beauty/flock.c"
551 #include "trace/beauty/futex_op.c"
552 #include "trace/beauty/mmap.c"
553 #include "trace/beauty/mode_t.c"
554 #include "trace/beauty/msg_flags.c"
555 #include "trace/beauty/open_flags.c"
556 #include "trace/beauty/perf_event_open.c"
557 #include "trace/beauty/pid.c"
558 #include "trace/beauty/sched_policy.c"
559 #include "trace/beauty/seccomp.c"
560 #include "trace/beauty/signum.c"
561 #include "trace/beauty/socket_type.c"
562 #include "trace/beauty/waitid_options.c"
564 static struct syscall_fmt {
567 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
574 { .name = "access", .errmsg = true,
575 .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
576 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
577 { .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
578 { .name = "brk", .hexret = true,
579 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
580 { .name = "chdir", .errmsg = true, },
581 { .name = "chmod", .errmsg = true, },
582 { .name = "chroot", .errmsg = true, },
583 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
584 { .name = "clone", .errpid = true, },
585 { .name = "close", .errmsg = true,
586 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
587 { .name = "connect", .errmsg = true, },
588 { .name = "creat", .errmsg = true, },
589 { .name = "dup", .errmsg = true, },
590 { .name = "dup2", .errmsg = true, },
591 { .name = "dup3", .errmsg = true, },
592 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
593 { .name = "eventfd2", .errmsg = true,
594 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
595 { .name = "faccessat", .errmsg = true, },
596 { .name = "fadvise64", .errmsg = true, },
597 { .name = "fallocate", .errmsg = true, },
598 { .name = "fchdir", .errmsg = true, },
599 { .name = "fchmod", .errmsg = true, },
600 { .name = "fchmodat", .errmsg = true,
601 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
602 { .name = "fchown", .errmsg = true, },
603 { .name = "fchownat", .errmsg = true,
604 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
605 { .name = "fcntl", .errmsg = true,
606 .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
607 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
608 { .name = "fdatasync", .errmsg = true, },
609 { .name = "flock", .errmsg = true,
610 .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
611 { .name = "fsetxattr", .errmsg = true, },
612 { .name = "fstat", .errmsg = true, .alias = "newfstat", },
613 { .name = "fstatat", .errmsg = true, .alias = "newfstatat", },
614 { .name = "fstatfs", .errmsg = true, },
615 { .name = "fsync", .errmsg = true, },
616 { .name = "ftruncate", .errmsg = true, },
617 { .name = "futex", .errmsg = true,
618 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
619 { .name = "futimesat", .errmsg = true,
620 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
621 { .name = "getdents", .errmsg = true, },
622 { .name = "getdents64", .errmsg = true, },
623 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
624 { .name = "getpid", .errpid = true, },
625 { .name = "getpgid", .errpid = true, },
626 { .name = "getppid", .errpid = true, },
627 { .name = "getrandom", .errmsg = true,
628 .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
629 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
630 { .name = "getxattr", .errmsg = true, },
631 { .name = "inotify_add_watch", .errmsg = true, },
632 { .name = "ioctl", .errmsg = true,
634 #if defined(__i386__) || defined(__x86_64__)
636 * FIXME: Make this available to all arches.
638 [1] = SCA_STRHEXARRAY, /* cmd */
639 [2] = SCA_HEX, /* arg */ },
640 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
642 [2] = SCA_HEX, /* arg */ }, },
644 { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), },
645 { .name = "kill", .errmsg = true,
646 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
647 { .name = "lchown", .errmsg = true, },
648 { .name = "lgetxattr", .errmsg = true, },
649 { .name = "linkat", .errmsg = true,
650 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
651 { .name = "listxattr", .errmsg = true, },
652 { .name = "llistxattr", .errmsg = true, },
653 { .name = "lremovexattr", .errmsg = true, },
654 { .name = "lseek", .errmsg = true,
655 .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
656 .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
657 { .name = "lsetxattr", .errmsg = true, },
658 { .name = "lstat", .errmsg = true, .alias = "newlstat", },
659 { .name = "lsxattr", .errmsg = true, },
660 { .name = "madvise", .errmsg = true,
661 .arg_scnprintf = { [0] = SCA_HEX, /* start */
662 [2] = SCA_MADV_BHV, /* behavior */ }, },
663 { .name = "mkdir", .errmsg = true, },
664 { .name = "mkdirat", .errmsg = true,
665 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
666 { .name = "mknod", .errmsg = true, },
667 { .name = "mknodat", .errmsg = true,
668 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
669 { .name = "mlock", .errmsg = true,
670 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
671 { .name = "mlockall", .errmsg = true,
672 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
673 { .name = "mmap", .hexret = true,
674 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
675 [2] = SCA_MMAP_PROT, /* prot */
676 [3] = SCA_MMAP_FLAGS, /* flags */ }, },
677 { .name = "mprotect", .errmsg = true,
678 .arg_scnprintf = { [0] = SCA_HEX, /* start */
679 [2] = SCA_MMAP_PROT, /* prot */ }, },
680 { .name = "mq_unlink", .errmsg = true,
681 .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
682 { .name = "mremap", .hexret = true,
683 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
684 [3] = SCA_MREMAP_FLAGS, /* flags */
685 [4] = SCA_HEX, /* new_addr */ }, },
686 { .name = "munlock", .errmsg = true,
687 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
688 { .name = "munmap", .errmsg = true,
689 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
690 { .name = "name_to_handle_at", .errmsg = true,
691 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
692 { .name = "newfstatat", .errmsg = true,
693 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
694 { .name = "open", .errmsg = true,
695 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
696 { .name = "open_by_handle_at", .errmsg = true,
697 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
698 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
699 { .name = "openat", .errmsg = true,
700 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
701 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
702 { .name = "perf_event_open", .errmsg = true,
703 .arg_scnprintf = { [2] = SCA_INT, /* cpu */
704 [3] = SCA_FD, /* group_fd */
705 [4] = SCA_PERF_FLAGS, /* flags */ }, },
706 { .name = "pipe2", .errmsg = true,
707 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
708 { .name = "poll", .errmsg = true, .timeout = true, },
709 { .name = "ppoll", .errmsg = true, .timeout = true, },
710 { .name = "pread", .errmsg = true, .alias = "pread64", },
711 { .name = "preadv", .errmsg = true, .alias = "pread", },
712 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
713 { .name = "pwrite", .errmsg = true, .alias = "pwrite64", },
714 { .name = "pwritev", .errmsg = true, },
715 { .name = "read", .errmsg = true, },
716 { .name = "readlink", .errmsg = true, },
717 { .name = "readlinkat", .errmsg = true,
718 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
719 { .name = "readv", .errmsg = true, },
720 { .name = "recvfrom", .errmsg = true,
721 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
722 { .name = "recvmmsg", .errmsg = true,
723 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
724 { .name = "recvmsg", .errmsg = true,
725 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
726 { .name = "removexattr", .errmsg = true, },
727 { .name = "renameat", .errmsg = true,
728 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
729 { .name = "rmdir", .errmsg = true, },
730 { .name = "rt_sigaction", .errmsg = true,
731 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
732 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
733 { .name = "rt_sigqueueinfo", .errmsg = true,
734 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
735 { .name = "rt_tgsigqueueinfo", .errmsg = true,
736 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
737 { .name = "sched_getattr", .errmsg = true, },
738 { .name = "sched_setattr", .errmsg = true, },
739 { .name = "sched_setscheduler", .errmsg = true,
740 .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
741 { .name = "seccomp", .errmsg = true,
742 .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
743 [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
744 { .name = "select", .errmsg = true, .timeout = true, },
745 { .name = "sendmmsg", .errmsg = true,
746 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
747 { .name = "sendmsg", .errmsg = true,
748 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
749 { .name = "sendto", .errmsg = true,
750 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
751 { .name = "set_tid_address", .errpid = true, },
752 { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
753 { .name = "setpgid", .errmsg = true, },
754 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
755 { .name = "setxattr", .errmsg = true, },
756 { .name = "shutdown", .errmsg = true, },
757 { .name = "socket", .errmsg = true,
758 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
759 [1] = SCA_SK_TYPE, /* type */ },
760 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
761 { .name = "socketpair", .errmsg = true,
762 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
763 [1] = SCA_SK_TYPE, /* type */ },
764 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
765 { .name = "stat", .errmsg = true, .alias = "newstat", },
766 { .name = "statfs", .errmsg = true, },
767 { .name = "statx", .errmsg = true,
768 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
769 [2] = SCA_STATX_FLAGS, /* flags */
770 [3] = SCA_STATX_MASK, /* mask */ }, },
771 { .name = "swapoff", .errmsg = true,
772 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
773 { .name = "swapon", .errmsg = true,
774 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
775 { .name = "symlinkat", .errmsg = true,
776 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
777 { .name = "tgkill", .errmsg = true,
778 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
779 { .name = "tkill", .errmsg = true,
780 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
781 { .name = "truncate", .errmsg = true, },
782 { .name = "uname", .errmsg = true, .alias = "newuname", },
783 { .name = "unlinkat", .errmsg = true,
784 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
785 { .name = "utime", .errmsg = true, },
786 { .name = "utimensat", .errmsg = true,
787 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
788 { .name = "utimes", .errmsg = true, },
789 { .name = "vmsplice", .errmsg = true, },
790 { .name = "wait4", .errpid = true,
791 .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
792 { .name = "waitid", .errpid = true,
793 .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
794 { .name = "write", .errmsg = true, },
795 { .name = "writev", .errmsg = true, },
/* bsearch comparator: key is a syscall name, element is a syscall_fmt. */
798 static int syscall_fmt__cmp(const void *name, const void *fmtp)
800 const struct syscall_fmt *fmt = fmtp;
801 return strcmp(name, fmt->name);
/*
 * Binary-search syscall_fmts[] by name -- the table must therefore be kept
 * sorted alphabetically by .name.  Returns NULL when the syscall has no
 * special formatting entry.
 */
804 static struct syscall_fmt *syscall_fmt__find(const char *name)
806 const int nmemb = ARRAY_SIZE(syscall_fmts);
807 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
811 struct event_format *tp_format;
813 struct format_field *args;
816 struct syscall_fmt *fmt;
817 size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
822 * We need to have this 'calculated' boolean because in some cases we really
823 * don't know what is the duration of a syscall, for instance, when we start
824 * a session and some threads are waiting for a syscall to finish, say 'poll',
825 * in which case all we can do is to print "( ? ) for duration and for the
/*
 * Print "(%6.3f ms): " colored by magnitude (red >= 1ms, yellow >= 0.01ms),
 * or " ? " inside the parens when the duration could not be calculated.
 */
828 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
830 double duration = (double)t / NSEC_PER_MSEC;
831 size_t printed = fprintf(fp, "(");
834 printed += fprintf(fp, " ? ");
835 else if (duration >= 1.0)
836 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
837 else if (duration >= 0.01)
838 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
840 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
841 return printed + fprintf(fp, "): ");
845 * filename.ptr: The filename char pointer that will be vfs_getname'd
846 * filename.entry_str_pos: Where to insert the string translated from
847 * filename.ptr by the vfs_getname tracepoint/kprobe.
849 struct thread_trace {
852 unsigned long nr_events;
853 unsigned long pfmaj, pfmin;
858 short int entry_str_pos;
860 unsigned int namelen;
868 struct intlist *syscall_stats;
871 static struct thread_trace *thread_trace__new(void)
873 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
876 ttrace->paths.max = -1;
878 ttrace->syscall_stats = intlist__new(NULL);
883 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
885 struct thread_trace *ttrace;
890 if (thread__priv(thread) == NULL)
891 thread__set_priv(thread, thread_trace__new());
893 if (thread__priv(thread) == NULL)
896 ttrace = thread__priv(thread);
901 color_fprintf(fp, PERF_COLOR_RED,
902 "WARNING: not enough memory, dropping samples!\n");
906 #define TRACE_PFMAJ (1 << 0)
907 #define TRACE_PFMIN (1 << 1)
909 static const size_t trace__entry_str_size = 2048;
/*
 * Remember the path associated with 'fd' in the per-thread fd->path table,
 * growing the table (and zeroing only the newly added slots) when fd is
 * beyond the current maximum.  Returns 0 on success, -1 when strdup() fails.
 */
911 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
913 struct thread_trace *ttrace = thread__priv(thread);
915 if (fd > ttrace->paths.max) {
916 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
921 if (ttrace->paths.max != -1) {
/* clear just the slots past the previous maximum */
922 memset(npath + ttrace->paths.max + 1, 0,
923 (fd - ttrace->paths.max) * sizeof(char *));
/* first allocation: clear the whole table */
925 memset(npath, 0, (fd + 1) * sizeof(char *));
928 ttrace->paths.table = npath;
929 ttrace->paths.max = fd;
932 ttrace->paths.table[fd] = strdup(pathname);
934 return ttrace->paths.table[fd] != NULL ? 0 : -1;
/*
 * Resolve 'fd' to a filesystem path by readlink()ing /proc/<pid>/fd/<fd>
 * (or the per-task /proc/<pid>/task/<tid>/fd/<fd> for non-leader threads)
 * and cache the result via trace__set_fd_pathname().
 */
937 static int thread__read_fd_path(struct thread *thread, int fd)
939 char linkname[PATH_MAX], pathname[PATH_MAX];
943 if (thread->pid_ == thread->tid) {
944 scnprintf(linkname, sizeof(linkname),
945 "/proc/%d/fd/%d", thread->pid_, fd);
947 scnprintf(linkname, sizeof(linkname),
948 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
/* give up when the link target would not fit in pathname */
951 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
954 ret = readlink(linkname, pathname, sizeof(pathname));
/* also bail out when the link grew between lstat() and readlink() */
956 if (ret < 0 || ret > st.st_size)
/* readlink() does not NUL-terminate */
959 pathname[ret] = '\0';
960 return trace__set_fd_pathname(thread, fd, pathname);
963 static const char *thread__fd_path(struct thread *thread, int fd,
966 struct thread_trace *ttrace = thread__priv(thread);
974 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
977 ++trace->stats.proc_getname;
978 if (thread__read_fd_path(thread, fd))
982 return ttrace->paths.table[fd];
985 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
986 struct syscall_arg *arg)
989 size_t printed = scnprintf(bf, size, "%d", fd);
990 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
993 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
998 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
999 struct syscall_arg *arg)
1002 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1003 struct thread_trace *ttrace = thread__priv(arg->thread);
1005 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1006 zfree(&ttrace->paths.table[fd]);
1011 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1014 struct thread_trace *ttrace = thread__priv(thread);
1016 ttrace->filename.ptr = ptr;
1017 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1020 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1021 struct syscall_arg *arg)
1023 unsigned long ptr = arg->val;
1025 if (!arg->trace->vfs_getname)
1026 return scnprintf(bf, size, "%#x", ptr);
1028 thread__set_filename_pos(arg->thread, bf, ptr);
1032 static bool trace__filter_duration(struct trace *trace, double t)
1034 return t < (trace->duration_filter * NSEC_PER_MSEC);
1037 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1039 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1041 return fprintf(fp, "%10.3f ", ts);
1045 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1046 * using ttrace->entry_time for a thread that receives a sys_exit without
1047 * first having received a sys_enter ("poll" issued before tracing session
1048 * starts, lost sys_enter exit due to ring buffer overflow).
/* Print the timestamp, or " ? " for the undefined tstamp == 0 case above. */
1050 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1053 return __trace__fprintf_tstamp(trace, tstamp, fp);
1055 return fprintf(fp, " ? ");
1058 static bool done = false;
1059 static bool interrupted = false;
/* Signal handler: only SIGINT marks the session as interrupted. */
1061 static void sig_handler(int sig)
1064 interrupted = sig == SIGINT;
1067 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1068 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1070 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1071 printed += fprintf_duration(duration, duration_calculated, fp);
1073 if (trace->multiple_threads) {
1074 if (trace->show_comm)
1075 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1076 printed += fprintf(fp, "%d ", thread->tid);
1082 static int trace__process_event(struct trace *trace, struct machine *machine,
1083 union perf_event *event, struct perf_sample *sample)
1087 switch (event->header.type) {
1088 case PERF_RECORD_LOST:
1089 color_fprintf(trace->output, PERF_COLOR_RED,
1090 "LOST %" PRIu64 " events!\n", event->lost.lost);
1091 ret = machine__process_lost_event(machine, event, sample);
1094 ret = machine__process_event(machine, event, sample);
1101 static int trace__tool_process(struct perf_tool *tool,
1102 union perf_event *event,
1103 struct perf_sample *sample,
1104 struct machine *machine)
1106 struct trace *trace = container_of(tool, struct trace, tool);
1107 return trace__process_event(trace, machine, event, sample);
1110 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1112 struct machine *machine = vmachine;
1114 if (machine->kptr_restrict_warned)
1117 if (symbol_conf.kptr_restrict) {
1118 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1119 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1120 "Kernel samples will not be resolved.\n");
1121 machine->kptr_restrict_warned = true;
1125 return machine__resolve_kernel_addr(vmachine, addrp, modp);
/*
 * Initialize symbol resolution for live tracing: bring up the symbol
 * library, create a "host" machine object, register our kptr_restrict
 * aware kernel-address resolver, and synthesize PERF_RECORD_ events for
 * already-running threads so their maps/comms are known.
 * NOTE(review): several error-handling lines are elided in this view
 * (checks on 'err', NULL host, out/out_put paths) — confirm against the
 * full file.
 */
1128 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1130 int err = symbol__init(NULL);
1135 trace->host = machine__new_host();
1136 if (trace->host == NULL)
1139 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
/* Synthesize mmap/comm/fork events for pre-existing threads of the target. */
1142 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1143 evlist->threads, trace__tool_process, false,
1144 trace->opts.proc_map_timeout);
/*
 * Assign a pretty-printer (scnprintf callback) to each argument of a
 * syscall, walking its tracepoint format fields.  Per-syscall overrides
 * from sc->fmt win; otherwise heuristics on the field type/name pick a
 * beautifier (filenames, raw pointers as hex, pid_t, umode_t, fds).
 * Returns a negative value on allocation failure (exact value elided in
 * this view), 0 otherwise.
 */
1151 static int syscall__set_arg_fmts(struct syscall *sc)
1153 struct format_field *field;
/* One scnprintf slot per argument; NULL means "print as plain number". */
1156 sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1157 if (sc->arg_scnprintf == NULL)
1161 sc->arg_parm = sc->fmt->arg_parm;
1163 for (field = sc->args; field; field = field->next) {
/* Explicit per-syscall formatter table takes precedence. */
1164 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1165 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1166 else if (strcmp(field->type, "const char *") == 0 &&
1167 (strcmp(field->name, "filename") == 0 ||
1168 strcmp(field->name, "path") == 0 ||
1169 strcmp(field->name, "pathname") == 0))
1170 sc->arg_scnprintf[idx] = SCA_FILENAME;
1171 else if (field->flags & FIELD_IS_POINTER)
1172 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1173 else if (strcmp(field->type, "pid_t") == 0)
1174 sc->arg_scnprintf[idx] = SCA_PID;
1175 else if (strcmp(field->type, "umode_t") == 0)
1176 sc->arg_scnprintf[idx] = SCA_MODE_T;
/* Heuristic: integer fields whose name ends in "fd" are file descriptors. */
1177 else if ((strcmp(field->type, "int") == 0 ||
1178 strcmp(field->type, "unsigned int") == 0 ||
1179 strcmp(field->type, "long") == 0) &&
1180 (len = strlen(field->name)) >= 2 &&
1181 strcmp(field->name + len - 2, "fd") == 0) {
1183 * /sys/kernel/tracing/events/syscalls/sys_enter*
1184 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1189 sc->arg_scnprintf[idx] = SCA_FD;
/*
 * Lazily populate trace->syscalls.table[id]: grow the table if needed,
 * look up the syscall name and its "syscalls:sys_enter_<name>"
 * tracepoint format (falling back to a per-syscall alias), and wire up
 * the argument formatters via syscall__set_arg_fmts().
 */
1197 static int trace__read_syscall_info(struct trace *trace, int id)
1201 const char *name = syscalltbl__name(trace->sctbl, id);
/* Grow the table up to 'id'; newly exposed slots are zeroed below. */
1206 if (id > trace->syscalls.max) {
1207 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1209 if (nsyscalls == NULL)
1212 if (trace->syscalls.max != -1) {
/* Zero only the slots added beyond the previous maximum. */
1213 memset(nsyscalls + trace->syscalls.max + 1, 0,
1214 (id - trace->syscalls.max) * sizeof(*sc));
/* First allocation: zero the whole table. */
1216 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1219 trace->syscalls.table = nsyscalls;
1220 trace->syscalls.max = id;
1223 sc = trace->syscalls.table + id;
1226 sc->fmt = syscall_fmt__find(sc->name);
1228 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1229 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* Some syscalls are registered under an alias (e.g. "newstat" vs "stat"). */
1231 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1232 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1233 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1236 if (IS_ERR(sc->tp_format))
1239 sc->args = sc->tp_format->format.fields;
1240 sc->nr_args = sc->tp_format->format.nr_fields;
1242 * We need to check and discard the first variable '__syscall_nr'
1243 * or 'nr' that mean the syscall number. It is needless here.
1244 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1246 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1247 sc->args = sc->args->next;
/* exit/exit_group never return, so no sys_exit line will be printed. */
1251 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1253 return syscall__set_arg_fmts(sc);
/*
 * Translate the user-supplied event qualifier (-e syscall names) into
 * an array of syscall ids (trace->ev_qualifier_ids), printing every
 * name that is not a valid syscall and failing if any was invalid.
 */
1256 static int trace__validate_ev_qualifier(struct trace *trace)
1259 struct str_node *pos;
1261 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1262 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1263 sizeof(trace->ev_qualifier_ids.entries[0]));
1265 if (trace->ev_qualifier_ids.entries == NULL) {
1266 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1274 strlist__for_each_entry(pos, trace->ev_qualifier) {
1275 const char *sc = pos->s;
1276 int id = syscalltbl__id(trace->sctbl, sc);
/* Accumulate all invalid names into one error line before failing. */
1280 fputs("Error:\tInvalid syscall ", trace->output);
1283 fputs(", ", trace->output);
1286 fputs(sc, trace->output);
1289 trace->ev_qualifier_ids.entries[i++] = id;
1293 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1294 "\nHint:\tand: 'man syscalls'\n", trace->output);
/* On error, release the partially-filled id array. */
1295 zfree(&trace->ev_qualifier_ids.entries);
1296 trace->ev_qualifier_ids.nr = 0;
1303 * args is to be interpreted as a series of longs but we need to handle
1304 * 8-byte unaligned accesses. args points to raw_data within the event
1305 * and raw_data is guaranteed to be 8-byte unaligned because it is
1306 * preceded by raw_size which is a u32. So we need to copy args to a temp
1307 * variable to read it. Most notably this avoids extended load instructions
1308 * on unaligned addresses
/*
 * Format a syscall's argument list into 'bf'.  When we have the
 * tracepoint format (sc->args), print "name: value" pairs, using any
 * registered beautifier and suppressing default-valued args; when the
 * format file could not be read, fall back to dumping raw longs.
 * Returns the number of characters written.
 */
1311 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1312 unsigned char *args, struct trace *trace,
1313 struct thread *thread)
1319 if (sc->args != NULL) {
1320 struct format_field *field;
1322 struct syscall_arg arg = {
1329 for (field = sc->args; field;
1330 field = field->next, ++arg.idx, bit <<= 1) {
1334 /* special care for unaligned accesses */
/* args may not be 8-byte aligned (see comment above this function). */
1335 p = args + sizeof(unsigned long) * arg.idx;
1336 memcpy(&val, p, sizeof(val));
1339 * Suppress this argument if its value is zero and
1340 * and we don't have a string associated in an
/* ... strarray for it (zero may be a meaningful enum entry there). */
1344 !(sc->arg_scnprintf &&
1345 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1346 sc->arg_parm[arg.idx]))
1349 printed += scnprintf(bf + printed, size - printed,
1350 "%s%s: ", printed ? ", " : "", field->name);
1351 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1354 arg.parm = sc->arg_parm[arg.idx];
1355 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1356 size - printed, &arg);
1358 printed += scnprintf(bf + printed, size - printed,
1362 } else if (IS_ERR(sc->tp_format)) {
1364 * If we managed to read the tracepoint /format file, then we
1365 * may end up not having any args, like with gettid(), so only
1366 * print the raw args when we didn't manage to read it.
1371 /* special care for unaligned accesses */
1372 p = args + sizeof(unsigned long) * i;
1373 memcpy(&val, p, sizeof(val));
1374 printed += scnprintf(bf + printed, size - printed,
1376 printed ? ", " : "", i, val);
/*
 * Signature of the per-evsel sample handlers (trace__sys_enter,
 * trace__sys_exit, trace__vfs_getname, ...), stashed in evsel->handler
 * and invoked from trace__handle_event()/trace__process_sample().
 */
1384 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1385 union perf_event *event,
1386 struct perf_sample *sample);
/*
 * Return the struct syscall descriptor for 'id', reading its tracepoint
 * info on first use.  Returns NULL (with a diagnostic on trace->output)
 * for invalid ids or when the syscall info cannot be read.
 */
1388 static struct syscall *trace__syscall_info(struct trace *trace,
1389 struct perf_evsel *evsel, int id)
1395 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1396 * before that, leaving at a higher verbosity level till that is
1397 * explained. Reproduced with plain ftrace with:
1399 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1400 * grep "NR -1 " /t/trace_pipe
1402 * After generating some load on the machine.
1406 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1407 id, perf_evsel__name(evsel), ++n)
/* Lazily read the syscall's tracepoint format on first encounter. */
1412 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1413 trace__read_syscall_info(trace, id))
1416 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1419 return &trace->syscalls.table[id];
1423 fprintf(trace->output, "Problems reading syscall %d", id);
1424 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1425 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1426 fputs(" information\n", trace->output);
/*
 * Update per-thread, per-syscall duration statistics (used by the
 * --summary output): find-or-create the stats node for this syscall id
 * and fold in the entry->exit duration of the current invocation.
 */
1431 static void thread__update_stats(struct thread_trace *ttrace,
1432 int id, struct perf_sample *sample)
1434 struct int_node *inode;
1435 struct stats *stats;
1438 inode = intlist__findnew(ttrace->syscall_stats, id);
/* Stats are lazily allocated per syscall id on first hit. */
1442 stats = inode->priv;
1443 if (stats == NULL) {
1444 stats = malloc(sizeof(struct stats));
1448 inode->priv = stats;
/* Guard against clock skew: only count when exit time is after entry. */
1451 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1452 duration = sample->time - ttrace->entry_time;
1454 update_stats(stats, duration);
/*
 * If the previously-current thread has a syscall entry whose exit we
 * have not yet seen, print it now suffixed with ") ..." so the output
 * stays readable when another event interleaves.  Returns the number of
 * characters printed.
 */
1457 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1459 struct thread_trace *ttrace;
1463 if (trace->current == NULL)
1466 ttrace = thread__priv(trace->current);
1468 if (!ttrace->entry_pending)
1471 duration = sample->time - ttrace->entry_time;
1473 printed = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1474 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
/* The entry has now been shown; sys_exit will print "continued". */
1475 ttrace->entry_pending = false;
/*
 * raw_syscalls:sys_enter handler.  Formats "name(args" into the
 * per-thread entry_str; for syscalls that return (and when no duration
 * filter/summary mode delays it) the line is completed by
 * trace__sys_exit(), otherwise it is printed here.
 */
1480 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1481 union perf_event *event __maybe_unused,
1482 struct perf_sample *sample)
1487 struct thread *thread;
1488 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1489 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1490 struct thread_trace *ttrace;
1495 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1496 ttrace = thread__trace(thread, trace->output);
1500 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
/* Lazily allocate the buffer that accumulates the entry line. */
1502 if (ttrace->entry_str == NULL) {
1503 ttrace->entry_str = malloc(trace__entry_str_size);
1504 if (!ttrace->entry_str)
/* Flush any still-pending entry from another thread first. */
1508 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1509 trace__printf_interrupted_entry(trace, sample);
1511 ttrace->entry_time = sample->time;
1512 msg = ttrace->entry_str;
1513 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1515 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1516 args, trace, thread);
/* Non-returning syscalls (exit/exit_group) are printed right away. */
1519 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1520 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1521 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1524 ttrace->entry_pending = true;
1525 /* See trace__vfs_getname & trace__sys_exit */
1526 ttrace->filename.pending_open = false;
/* Track the currently-running thread for interrupted-entry handling. */
1529 if (trace->current != thread) {
1530 thread__put(trace->current);
1531 trace->current = thread__get(thread);
1535 thread__put(thread);
1539 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1540 struct perf_sample *sample,
1541 struct callchain_cursor *cursor)
1543 struct addr_location al;
1545 if (machine__resolve(trace->host, &al, sample) < 0 ||
1546 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1552 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1554 /* TODO: user-configurable print_opts */
1555 const unsigned int print_opts = EVSEL__PRINT_SYM |
1557 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1559 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/*
 * raw_syscalls:sys_exit handler.  Completes the line started by
 * trace__sys_enter() (or re-prints it as "continued" if it was
 * interrupted), formats the return value according to the syscall's fmt
 * (errno name+message, timeout, hex, child pid/comm), updates summary
 * stats, applies the duration filter and optionally prints a callchain.
 */
1562 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1563 union perf_event *event __maybe_unused,
1564 struct perf_sample *sample)
1568 bool duration_calculated = false;
1569 struct thread *thread;
1570 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1571 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1572 struct thread_trace *ttrace;
1577 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1578 ttrace = thread__trace(thread, trace->output);
1583 thread__update_stats(ttrace, id, sample);
1585 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
/* A successful open with a name captured by vfs_getname: remember the
 * fd -> path association for the fd beautifier. */
1587 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1588 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1589 ttrace->filename.pending_open = false;
1590 ++trace->stats.vfs_getname;
1593 if (ttrace->entry_time) {
1594 duration = sample->time - ttrace->entry_time;
1595 if (trace__filter_duration(trace, duration))
1597 duration_calculated = true;
1598 } else if (trace->duration_filter)
1601 if (sample->callchain) {
1602 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1603 if (callchain_ret == 0) {
/* Skip callchains shallower than --min-stack. */
1604 if (callchain_cursor.nr < trace->min_stack)
1610 if (trace->summary_only)
1613 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1615 if (ttrace->entry_pending) {
1616 fprintf(trace->output, "%-70s", ttrace->entry_str);
/* Entry was already printed (interrupted); mark this as a continuation. */
1618 fprintf(trace->output, " ... [");
1619 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1620 fprintf(trace->output, "]: %s()", sc->name);
1623 if (sc->fmt == NULL) {
1625 fprintf(trace->output, ") = %ld", ret);
1626 } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1627 char bf[STRERR_BUFSIZE];
1628 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1629 *e = audit_errno_to_name(-ret);
1631 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1632 } else if (ret == 0 && sc->fmt->timeout)
1633 fprintf(trace->output, ") = 0 Timeout");
1634 else if (sc->fmt->hexret)
1635 fprintf(trace->output, ") = %#lx", ret);
/* errpid syscalls (fork/wait*) return a pid; show its comm if known. */
1636 else if (sc->fmt->errpid) {
1637 struct thread *child = machine__find_thread(trace->host, ret, ret);
1639 if (child != NULL) {
1640 fprintf(trace->output, ") = %ld", ret);
1641 if (child->comm_set)
1642 fprintf(trace->output, " (%s)", thread__comm_str(child));
1648 fputc('\n', trace->output);
1650 if (callchain_ret > 0)
1651 trace__fprintf_callchain(trace, sample);
1652 else if (callchain_ret < 0)
1653 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1655 ttrace->entry_pending = false;
1658 thread__put(thread);
/*
 * probe:vfs_getname handler.  Captures the pathname being resolved by
 * the kernel, stores it per-thread (for fd->path bookkeeping in
 * trace__sys_exit) and, when a syscall entry line is pending, splices
 * the (possibly truncated) filename into entry_str at the position
 * recorded by the SCA_FILENAME beautifier.
 */
1662 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1663 union perf_event *event __maybe_unused,
1664 struct perf_sample *sample)
1666 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1667 struct thread_trace *ttrace;
1668 size_t filename_len, entry_str_len, to_move;
1669 ssize_t remaining_space;
1671 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1676 ttrace = thread__priv(thread);
1680 filename_len = strlen(filename);
1681 if (filename_len == 0)
/* Grow the per-thread filename buffer only when needed. */
1684 if (ttrace->filename.namelen < filename_len) {
1685 char *f = realloc(ttrace->filename.name, filename_len + 1);
1690 ttrace->filename.namelen = filename_len;
1691 ttrace->filename.name = f;
1694 strcpy(ttrace->filename.name, filename);
1695 ttrace->filename.pending_open = true;
/* No SCA_FILENAME placeholder pending in entry_str: we are done. */
1697 if (!ttrace->filename.ptr)
1700 entry_str_len = strlen(ttrace->entry_str);
1701 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1702 if (remaining_space <= 0)
/* Keep the tail of the path when it does not fit. */
1705 if (filename_len > (size_t)remaining_space) {
1706 filename += filename_len - remaining_space;
1707 filename_len = remaining_space;
/* Open a gap at entry_str_pos and copy the filename into it. */
1710 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1711 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1712 memmove(pos + filename_len, pos, to_move);
1713 memcpy(pos, filename, filename_len);
1715 ttrace->filename.ptr = 0;
1716 ttrace->filename.entry_str_pos = 0;
1718 thread__put(thread);
/*
 * sched:sched_stat_runtime handler: accumulate on-CPU runtime (in ms)
 * per-thread and globally for the --summary output.  The trailing
 * fprintf is an error path for when the thread cannot be set up
 * (intermediate lines are elided in this view).
 */
1723 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1724 union perf_event *event __maybe_unused,
1725 struct perf_sample *sample)
1727 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1728 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1729 struct thread *thread = machine__findnew_thread(trace->host,
1732 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1737 ttrace->runtime_ms += runtime_ms;
1738 trace->runtime_ms += runtime_ms;
1740 thread__put(thread);
/* Fallback diagnostic dump of the raw tracepoint payload. */
1744 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1746 perf_evsel__strval(evsel, sample, "comm"),
1747 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1749 perf_evsel__intval(evsel, sample, "vruntime"));
1753 static void bpf_output__printer(enum binary_printer_ops op,
1754 unsigned int val, void *extra)
1756 FILE *output = extra;
1757 unsigned char ch = (unsigned char)val;
1760 case BINARY_PRINT_CHAR_DATA:
1761 fprintf(output, "%c", isprint(ch) ? ch : '.');
1763 case BINARY_PRINT_DATA_BEGIN:
1764 case BINARY_PRINT_LINE_BEGIN:
1765 case BINARY_PRINT_ADDR:
1766 case BINARY_PRINT_NUM_DATA:
1767 case BINARY_PRINT_NUM_PAD:
1768 case BINARY_PRINT_SEP:
1769 case BINARY_PRINT_CHAR_PAD:
1770 case BINARY_PRINT_LINE_END:
1771 case BINARY_PRINT_DATA_END:
1777 static void bpf_output__fprintf(struct trace *trace,
1778 struct perf_sample *sample)
1780 print_binary(sample->raw_data, sample->raw_size, 8,
1781 bpf_output__printer, trace->output);
/*
 * Generic handler for non-syscall events added with --event: print
 * timestamp and event name, then either the BPF output payload or the
 * tracepoint-formatted fields, plus an optional callchain.
 */
1784 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1785 union perf_event *event __maybe_unused,
1786 struct perf_sample *sample)
1788 int callchain_ret = 0;
1790 if (sample->callchain) {
1791 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1792 if (callchain_ret == 0) {
/* Honor --min-stack for these events too. */
1793 if (callchain_cursor.nr < trace->min_stack)
1799 trace__printf_interrupted_entry(trace, sample);
1800 trace__fprintf_tstamp(trace, sample->time, trace->output);
/* Keep columns aligned with syscall lines when both are shown. */
1802 if (trace->trace_syscalls)
1803 fprintf(trace->output, "( ): ");
1805 fprintf(trace->output, "%s:", evsel->name);
1807 if (perf_evsel__is_bpf_output(evsel)) {
1808 bpf_output__fprintf(trace, sample);
1809 } else if (evsel->tp_format) {
1810 event_format__fprintf(evsel->tp_format, sample->cpu,
1811 sample->raw_data, sample->raw_size,
1815 fprintf(trace->output, ")\n");
1817 if (callchain_ret > 0)
1818 trace__fprintf_callchain(trace, sample);
1819 else if (callchain_ret < 0)
1820 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
/*
 * Print a resolved address location for page-fault lines: optionally
 * "dso@" and "sym+0xoff", falling back to the raw map address or, when
 * nothing resolved, the sample address.  (The else-if structure between
 * the fallback fprintf calls is elided in this view.)
 */
1825 static void print_location(FILE *f, struct perf_sample *sample,
1826 struct addr_location *al,
1827 bool print_dso, bool print_sym)
1830 if ((verbose > 0 || print_dso) && al->map)
1831 fprintf(f, "%s@", al->map->dso->long_name);
1833 if ((verbose > 0 || print_sym) && al->sym)
1834 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1835 al->addr - al->sym->start);
1837 fprintf(f, "0x%" PRIx64, al->addr);
1839 fprintf(f, "0x%" PRIx64, sample->addr);
/*
 * Handler for minor/major page-fault software events: print the
 * faulting IP location, the target data/code address ('d' or 'j' map
 * type), and optionally the callchain.
 */
1842 static int trace__pgfault(struct trace *trace,
1843 struct perf_evsel *evsel,
1844 union perf_event *event __maybe_unused,
1845 struct perf_sample *sample)
1847 struct thread *thread;
1848 struct addr_location al;
/* 'd' = data access; switched below when the fault is on an exec map. */
1849 char map_type = 'd';
1850 struct thread_trace *ttrace;
1852 int callchain_ret = 0;
1854 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1856 if (sample->callchain) {
1857 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1858 if (callchain_ret == 0) {
1859 if (callchain_cursor.nr < trace->min_stack)
1865 ttrace = thread__trace(thread, trace->output);
/* Count major vs minor faults separately for the summary. */
1869 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1874 if (trace->summary_only)
/* Resolve the faulting instruction pointer first. */
1877 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1880 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1882 fprintf(trace->output, "%sfault [",
1883 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1886 print_location(trace->output, sample, &al, false, true);
1888 fprintf(trace->output, "] => ");
/* Then the faulted-on address, first as data, falling back to code. */
1890 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1894 thread__find_addr_location(thread, sample->cpumode,
1895 MAP__FUNCTION, sample->addr, &al);
1903 print_location(trace->output, sample, &al, true, false);
1905 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1907 if (callchain_ret > 0)
1908 trace__fprintf_callchain(trace, sample);
1909 else if (callchain_ret < 0)
1910 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1914 thread__put(thread);
1918 static void trace__set_base_time(struct trace *trace,
1919 struct perf_evsel *evsel,
1920 struct perf_sample *sample)
1923 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1924 * and don't use sample->time unconditionally, we may end up having
1925 * some other event in the future without PERF_SAMPLE_TIME for good
1926 * reason, i.e. we may not be interested in its timestamps, just in
1927 * it taking place, picking some piece of information when it
1928 * appears in our event stream (vfs_getname comes to mind).
1930 if (trace->base_time == 0 && !trace->full_time &&
1931 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1932 trace->base_time = sample->time;
/*
 * perf_tool sample callback used in replay mode (perf trace -i): skip
 * filtered threads, set the base timestamp, then dispatch to the
 * handler stashed in evsel->handler.
 */
1935 static int trace__process_sample(struct perf_tool *tool,
1936 union perf_event *event,
1937 struct perf_sample *sample,
1938 struct perf_evsel *evsel,
1939 struct machine *machine __maybe_unused)
1941 struct trace *trace = container_of(tool, struct trace, tool);
1942 struct thread *thread;
1945 tracepoint_handler handler = evsel->handler;
1947 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
/* Threads excluded by the user's filter are dropped silently. */
1948 if (thread && thread__is_filtered(thread))
1951 trace__set_base_time(trace, evsel, sample);
1955 handler(trace, evsel, event, sample);
1958 thread__put(thread);
/*
 * Implement 'perf trace record': build an argv for 'perf record' with
 * the raw_syscalls (or legacy syscalls) tracepoints and optional
 * major/minor page-fault events, then exec cmd_record().
 */
1962 static int trace__record(struct trace *trace, int argc, const char **argv)
1964 unsigned int rec_argc, i, j;
1965 const char **rec_argv;
1966 const char * const record_args[] = {
1973 const char * const sc_args[] = { "-e", };
1974 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1975 const char * const majpf_args[] = { "-e", "major-faults" };
1976 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1977 const char * const minpf_args[] = { "-e", "minor-faults" };
1978 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1980 /* +1 is for the event string below */
1981 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1982 majpf_args_nr + minpf_args_nr + argc;
1983 rec_argv = calloc(rec_argc + 1, sizeof(char *));
1985 if (rec_argv == NULL)
1989 for (i = 0; i < ARRAY_SIZE(record_args); i++)
1990 rec_argv[j++] = record_args[i];
1992 if (trace->trace_syscalls) {
1993 for (i = 0; i < sc_args_nr; i++)
1994 rec_argv[j++] = sc_args[i];
1996 /* event string may be different for older kernels - e.g., RHEL6 */
1997 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
1998 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
1999 else if (is_valid_tracepoint("syscalls:sys_enter"))
2000 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2002 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2007 if (trace->trace_pgfaults & TRACE_PFMAJ)
2008 for (i = 0; i < majpf_args_nr; i++)
2009 rec_argv[j++] = majpf_args[i];
2011 if (trace->trace_pgfaults & TRACE_PFMIN)
2012 for (i = 0; i < minpf_args_nr; i++)
2013 rec_argv[j++] = minpf_args[i];
/* Append the user's own workload/record arguments. */
2015 for (i = 0; i < (unsigned int)argc; i++)
2016 rec_argv[j++] = argv[i];
2018 return cmd_record(j, rec_argv);
/* Forward declaration: the summary printer is defined later in the file
 * but needed by trace__run(). */
2021 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
/*
 * Try to add the optional probe:vfs_getname event (set up via 'perf
 * probe') so filenames can be spliced into open/stat lines.  Returns
 * whether the event was usable; absence is not an error.
 */
2023 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2025 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
/* The probe must expose a "pathname" field for us to be able to use it. */
2030 if (perf_evsel__field(evsel, "pathname") == NULL) {
2031 perf_evsel__delete(evsel);
2035 evsel->handler = trace__vfs_getname;
2036 perf_evlist__add(evlist, evsel);
/*
 * Create a software page-fault evsel (config selects major vs minor),
 * sampling every fault, with trace__pgfault as its handler.
 * NOTE(review): the NULL check guarding the handler assignment appears
 * to be elided in this view — confirm against the full file.
 */
2040 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2042 struct perf_evsel *evsel;
2043 struct perf_event_attr attr = {
2044 .type = PERF_TYPE_SOFTWARE,
2048 attr.config = config;
/* Sample every single fault, not a frequency-based subset. */
2049 attr.sample_period = 1;
2051 event_attr_init(&attr);
2053 evsel = perf_evsel__new(&attr);
2055 evsel->handler = trace__pgfault;
/*
 * Dispatch one mmap'ed event in live mode: non-samples go to
 * trace__process_event(); samples are mapped back to their evsel by
 * sample id and handed to that evsel's tracepoint handler, with a
 * sanity check for tracepoint samples arriving without a payload.
 */
2060 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2062 const u32 type = event->header.type;
2063 struct perf_evsel *evsel;
2065 if (type != PERF_RECORD_SAMPLE) {
2066 trace__process_event(trace, trace->host, event, sample);
2070 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2071 if (evsel == NULL) {
2072 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2076 trace__set_base_time(trace, evsel, sample);
/* A tracepoint sample without raw data cannot be formatted; warn. */
2078 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2079 sample->raw_data == NULL) {
2080 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2081 perf_evsel__name(evsel), sample->tid,
2082 sample->cpu, sample->raw_size);
2084 tracepoint_handler handler = evsel->handler;
2085 handler(trace, evsel, event, sample);
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels,
 * caching the offsets of their 'args' and 'ret' fields.  With
 * callchains enabled, kernel frames are excluded on sys_exit by default
 * (only the user path into the syscall is interesting) unless
 * --kernel-syscall-graph asks otherwise.  Uses goto-cleanup on failure.
 */
2089 static int trace__add_syscall_newtp(struct trace *trace)
2092 struct perf_evlist *evlist = trace->evlist;
2093 struct perf_evsel *sys_enter, *sys_exit;
2095 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2096 if (sys_enter == NULL)
2099 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2100 goto out_delete_sys_enter;
2102 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2103 if (sys_exit == NULL)
2104 goto out_delete_sys_enter;
2106 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2107 goto out_delete_sys_exit;
2109 perf_evlist__add(evlist, sys_enter);
2110 perf_evlist__add(evlist, sys_exit);
2112 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2114 * We're interested only in the user space callchain
2115 * leading to the syscall, allow overriding that for
2116 * debugging reasons using --kernel_syscall_callchains
2118 sys_exit->attr.exclude_callchain_kernel = 1;
2121 trace->syscalls.events.sys_enter = sys_enter;
2122 trace->syscalls.events.sys_exit = sys_exit;
2128 out_delete_sys_exit:
2129 perf_evsel__delete_priv(sys_exit);
2130 out_delete_sys_enter:
2131 perf_evsel__delete_priv(sys_enter);
/*
 * Turn the validated event-qualifier id list into an "id in (...)" (or
 * negated) tracepoint filter expression and append it to both the
 * sys_enter and sys_exit evsels.
 */
2135 static int trace__set_ev_qualifier_filter(struct trace *trace)
2138 struct perf_evsel *sys_exit;
/* asprintf_expr_inout_ints builds "id == A || id == B ..." (or !=). */
2139 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2140 trace->ev_qualifier_ids.nr,
2141 trace->ev_qualifier_ids.entries);
2146 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2148 sys_exit = trace->syscalls.events.sys_exit;
2149 err = perf_evsel__append_tp_filter(sys_exit, filter);
/*
 * Live-trace main loop: set up the evlist (syscall tracepoints,
 * vfs_getname, page faults, sched_stat_runtime), configure callchains
 * and sample identifiers, open/mmap the events, optionally fork and
 * start a workload, then consume ring-buffer events until done,
 * finishing with the optional thread summary.  Error paths converge on
 * the out_error_* labels at the bottom (goto-cleanup style).
 */
2160 static int trace__run(struct trace *trace, int argc, const char **argv)
2162 struct perf_evlist *evlist = trace->evlist;
2163 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2165 unsigned long before;
2166 const bool forks = argc > 0;
2167 bool draining = false;
2171 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2172 goto out_error_raw_syscalls;
2174 if (trace->trace_syscalls)
2175 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2177 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2178 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2179 if (pgfault_maj == NULL)
2181 perf_evlist__add(evlist, pgfault_maj);
2184 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2185 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2186 if (pgfault_min == NULL)
2188 perf_evlist__add(evlist, pgfault_min);
2192 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2193 trace__sched_stat_runtime))
2194 goto out_error_sched_stat_runtime;
2196 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2198 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2199 goto out_delete_evlist;
2202 err = trace__symbols_init(trace, evlist);
2204 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2205 goto out_delete_evlist;
2208 perf_evlist__config(evlist, &trace->opts, NULL);
/* Configure callchain collection on whichever events were enabled. */
2210 if (callchain_param.enabled) {
2211 bool use_identifier = false;
2213 if (trace->syscalls.events.sys_exit) {
2214 perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2215 &trace->opts, &callchain_param);
2216 use_identifier = true;
2220 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2221 use_identifier = true;
2225 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2226 use_identifier = true;
2229 if (use_identifier) {
2231 * Now we have evsels with different sample_ids, use
2232 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2233 * from a fixed position in each ring buffer record.
2235 * As of this the changeset introducing this comment, this
2236 * isn't strictly needed, as the fields that can come before
2237 * PERF_SAMPLE_ID are all used, but we'll probably disable
2238 * some of those for things like copying the payload of
2239 * pointer syscall arguments, and for vfs_getname we don't
2240 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2241 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2243 perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2244 perf_evlist__reset_sample_bit(evlist, ID);
2248 signal(SIGCHLD, sig_handler);
2249 signal(SIGINT, sig_handler);
2252 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2255 fprintf(trace->output, "Couldn't run the workload!\n");
2256 goto out_delete_evlist;
2260 err = perf_evlist__open(evlist);
2262 goto out_error_open;
2264 err = bpf__apply_obj_config();
2266 char errbuf[BUFSIZ];
2268 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2269 pr_err("ERROR: Apply config to BPF failed: %s\n",
2271 goto out_error_open;
2275 * Better not use !target__has_task() here because we need to cover the
2276 * case where no threads were specified in the command line, but a
2277 * workload was, and in that case we will fill in the thread_map when
2278 * we fork the workload in perf_evlist__prepare_workload.
2280 if (trace->filter_pids.nr > 0)
2281 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
/* System-wide: at least filter out perf trace's own pid. */
2282 else if (thread_map__pid(evlist->threads, 0) == -1)
2283 err = perf_evlist__set_filter_pid(evlist, getpid());
2288 if (trace->ev_qualifier_ids.nr > 0) {
2289 err = trace__set_ev_qualifier_filter(trace);
2293 pr_debug("event qualifier tracepoint filter: %s\n",
2294 trace->syscalls.events.sys_exit->filter);
2297 err = perf_evlist__apply_filters(evlist, &evsel);
2299 goto out_error_apply_filters;
2301 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2303 goto out_error_mmap;
2305 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2306 perf_evlist__enable(evlist);
2309 perf_evlist__start_workload(evlist);
2311 if (trace->opts.initial_delay) {
2312 usleep(trace->opts.initial_delay * 1000);
2313 perf_evlist__enable(evlist);
/* Decide whether lines need a tid column (see trace__fprintf_entry_head). */
2316 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2317 evlist->threads->nr > 1 ||
2318 perf_evlist__first(evlist)->attr.inherit;
2320 before = trace->nr_events;
/* Drain every mmap'ed ring buffer, dispatching each parsed event. */
2322 for (i = 0; i < evlist->nr_mmaps; i++) {
2323 union perf_event *event;
2325 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2326 struct perf_sample sample;
2330 err = perf_evlist__parse_sample(evlist, event, &sample);
2332 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2336 trace__handle_event(trace, event, &sample);
2338 perf_evlist__mmap_consume(evlist, i);
/* Signal received: disable events and drain what's left in the buffers. */
2343 if (done && !draining) {
2344 perf_evlist__disable(evlist);
2350 if (trace->nr_events == before) {
/* Nothing new this pass: block in poll (bounded once 'done' is set). */
2351 int timeout = done ? 100 : -1;
2353 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2354 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2364 thread__zput(trace->current);
2366 perf_evlist__disable(evlist);
2370 trace__fprintf_thread_summary(trace, trace->output);
2372 if (trace->show_tool_stats) {
2373 fprintf(trace->output, "Stats:\n "
2374 " vfs_getname : %" PRIu64 "\n"
2375 " proc_getname: %" PRIu64 "\n",
2376 trace->stats.vfs_getname,
2377 trace->stats.proc_getname);
2382 perf_evlist__delete(evlist);
2383 trace->evlist = NULL;
2384 trace->live = false;
/* Shared error-reporting tail: format errbuf then reuse out_errno path. */
2387 char errbuf[BUFSIZ];
2389 out_error_sched_stat_runtime:
2390 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2393 out_error_raw_syscalls:
2394 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2398 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2402 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2405 fprintf(trace->output, "%s\n", errbuf);
2406 goto out_delete_evlist;
2408 out_error_apply_filters:
2409 fprintf(trace->output,
2410 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2411 evsel->filter, perf_evsel__name(evsel), errno,
2412 str_error_r(errno, errbuf, sizeof(errbuf)));
2413 goto out_delete_evlist;
2416 fprintf(trace->output, "Not enough memory to run!\n");
2417 goto out_delete_evlist;
2420 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2421 goto out_delete_evlist;
/*
 * Replay mode ("perf trace -i perf.data"): instead of tracing live, read a
 * previously recorded perf.data file and push it through the same syscall
 * formatting and summary machinery.
 *
 * NOTE(review): this listing appears elided -- braces, several error checks
 * and the 'err' declaration are missing from the paste; only the visible
 * statements are annotated below.
 */
static int trace__replay(struct trace *trace)
	/* Tracepoint -> handler table, installed on the session below. */
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	struct perf_data_file file = {
		.mode = PERF_DATA_MODE_READ,
		.force = trace->force, /* honour 'perf trace -f' on the input file */
	struct perf_session *session;
	struct perf_evsel *evsel;

	/* Wire up the stock perf handlers for the non-sample event types. */
	trace->tool.sample = trace__process_sample;
	trace->tool.mmap = perf_event__process_mmap;
	trace->tool.mmap2 = perf_event__process_mmap2;
	trace->tool.comm = perf_event__process_comm;
	trace->tool.exit = perf_event__process_exit;
	trace->tool.fork = perf_event__process_fork;
	trace->tool.attr = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id = perf_event__process_build_id;
	trace->tool.namespaces = perf_event__process_namespaces;

	/* Events in the file may be interleaved; sort them by timestamp. */
	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL) /* NOTE(review): error return elided in this paste */

	/* Restrict symbol handling to the pids/tids given on the command line. */
	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0) /* NOTE(review): error path elided */

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	/* NOTE(review): the guarding "if (evsel && ..." line is elided in this paste */
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");

	/* Page-fault software events get their own formatter. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;

	err = perf_session__process_events(session);
	/* NOTE(review): the "if (err)" guard appears elided in this paste */
		pr_err("Failed to process events, error %d", err);
	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

	perf_session__delete(session);
/*
 * Emit the banner line that precedes the per-thread summary table.
 * Returns the number of characters written (the fprintf() return value).
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
/*
 * Resort helper (see rb_resort.h): re-sorts a thread's syscall_stats intlist
 * into an rb tree ordered by 'msecs' (a->msecs > b->msecs), so the busiest
 * syscalls can be printed in order by thread__dump_stats().
 *
 * NOTE(review): the remaining entry field declarations (msecs, syscall) and
 * the macro's closing lines appear elided in this paste.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats *stats;
	/* intlist node: key is the syscall id, priv carries its stats */
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	/* total time =~ call count * average, converted from ns to msec */
	entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
/*
 * Print one thread's per-syscall statistics table (calls, total/min/avg/max
 * time in msec, relative stddev), ordered by total time spent.  Returns the
 * number of characters written.
 *
 * NOTE(review): braces and some lines (the 'printed'/'pct'/'sc' declarations,
 * early-return path) are elided in this paste.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
	/* Re-sort the intlist of per-syscall stats by total time (msecs). */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL) /* NOTE(review): bail-out path elided */

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, " syscall calls total min avg max stddev\n");
	printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
	printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		/* convert the ns-based min/max/avg to milliseconds for display */
		double min = (double)(stats->min) / NSEC_PER_MSEC;
		double max = (double)(stats->max) / NSEC_PER_MSEC;
		double avg = avg_stats(stats);

		u64 n = (u64) stats->n;

		/* relative standard deviation, in percent */
		pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
		avg /= NSEC_PER_MSEC;

		/* look the syscall name up by id in the trace's syscall table */
		sc = &trace->syscalls.table[syscall_stats_entry->syscall];
		printed += fprintf(fp, " %-15s", sc->name);
		printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
				   n, syscall_stats_entry->msecs, min, avg);
		printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");
/*
 * Print the summary header for one thread -- comm, tid, event count, its
 * share of all events, fault counts and (when available) runtime -- followed
 * by its per-syscall stats table.  Returns the characters written.
 *
 * NOTE(review): several guard lines (NULL ttrace check, the pfmaj/pfmin and
 * runtime conditionals) and the 'printed'/'ratio' declarations are elided in
 * this paste.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
	struct thread_trace *ttrace = thread__priv(thread);

	/* this thread's share of all events seen, in percent */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF) /* NOTE(review): '++printed'-style body elided */

	printed += thread__dump_stats(ttrace, trace, fp);
2617 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2619 return ttrace ? ttrace->nr_events : 0;
/*
 * Resort helper (see rb_resort.h): re-sorts a machine's thread rb tree by
 * per-thread event count (comparison is thread__nr_events(a) <
 * thread__nr_events(b)), used by trace__fprintf_thread_summary().
 *
 * NOTE(review): the macro's closing lines appear elided in this paste.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
	entry->thread = rb_entry(nd, struct thread, rb_node);
/*
 * Print the end-of-run summary: the header banner plus one section per
 * thread, ordered by number of events.  Returns the characters written.
 *
 * NOTE(review): braces and the final return line are elided in this paste.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
	/* Re-sort this machine's threads by event count (see resort rb above). */
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);
/*
 * "--duration N.M" option parser: only events lasting longer than this many
 * milliseconds will be shown (see duration_filter and the option help text).
 * Malformed input silently parses as 0.0 -- atof() reports no errors.
 *
 * NOTE(review): braces and the return statement are elided in this paste.
 */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
/*
 * "--filter-pids CSV" option parser: builds trace->filter_pids from the
 * comma-separated pid list.  Slot [0] is reserved for perf's own pid
 * (getpid()) so the tracer does not trace itself.
 *
 * NOTE(review): braces, the 'i' declaration and the error/return paths are
 * elided in this paste.
 */
static int trace__set_filter_pids(const struct option *opt, const char *str,
				  int unset __maybe_unused)
	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce a intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
	 */
	struct intlist *list = intlist__new(str);

	/* +1 so our own pid fits in slot 0 */
	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
	trace->filter_pids.entries = calloc(i, sizeof(pid_t));

	if (trace->filter_pids.entries == NULL) /* NOTE(review): error path elided */

	trace->filter_pids.entries[0] = getpid();

	/* copy the user-supplied pids after our own */
	for (i = 1; i < trace->filter_pids.nr; ++i)
		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;

	intlist__delete(list);
/*
 * Open 'filename' as the trace output stream.  If a non-empty file of that
 * name already exists it is first renamed to "<filename>.old".
 * Returns 0 on success, -errno if fopen() fails.
 *
 * NOTE(review): the 'struct stat st' declaration and braces are elided in
 * this paste.
 */
static int trace__open_output(struct trace *trace, const char *filename)
	/* keep the previous run's output around as <filename>.old */
	if (!stat(filename, &st) && st.st_size) {
		char oldname[PATH_MAX];

		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
		rename(filename, oldname); /* best effort, result ignored */

	trace->output = fopen(filename, "w");

	return trace->output == NULL ? -errno : 0;
/*
 * "--pf all|maj|min" option parser: ORs the requested page-fault tracing
 * bits (TRACE_PFMAJ / TRACE_PFMIN) into the int pointed to by opt->value.
 *
 * NOTE(review): braces and the trailing else/return lines (presumably
 * rejecting unknown strings) are elided in this paste.
 */
static int parse_pagefaults(const struct option *opt, const char *str,
			    int unset __maybe_unused)
	int *trace_pgfaults = opt->value;

	if (strcmp(str, "all") == 0)
		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
	else if (strcmp(str, "maj") == 0)
		*trace_pgfaults |= TRACE_PFMAJ;
	else if (strcmp(str, "min") == 0)
		*trace_pgfaults |= TRACE_PFMIN;
2723 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2725 struct perf_evsel *evsel;
2727 evlist__for_each_entry(evlist, evsel)
2728 evsel->handler = handler;
2732 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2733 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2734 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2736 * It'd be better to introduce a parse_options() variant that would return a
2737 * list with the terms it didn't match to an event...
2739 static int trace__parse_events_option(const struct option *opt, const char *str,
2740 int unset __maybe_unused)
2742 struct trace *trace = (struct trace *)opt->value;
2743 const char *s = str;
2744 char *sep = NULL, *lists[2] = { NULL, NULL, };
2745 int len = strlen(str), err = -1, list;
2746 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2747 char group_name[PATH_MAX];
2749 if (strace_groups_dir == NULL)
2754 trace->not_ev_qualifier = true;
2758 if ((sep = strchr(s, ',')) != NULL)
2762 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2765 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2766 if (access(group_name, R_OK) == 0)
2771 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2773 lists[list] = malloc(len);
2774 if (lists[list] == NULL)
2776 strcpy(lists[list], s);
2786 if (lists[1] != NULL) {
2787 struct strlist_config slist_config = {
2788 .dirname = strace_groups_dir,
2791 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2792 if (trace->ev_qualifier == NULL) {
2793 fputs("Not enough memory to parse event qualifier", trace->output);
2797 if (trace__validate_ev_qualifier(trace))
2804 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2805 "event selector. use 'perf list' to list available events",
2806 parse_events_option);
2807 err = parse_events_option(&o, lists[0], 0);
2816 int cmd_trace(int argc, const char **argv)
2818 const char *trace_usage[] = {
2819 "perf trace [<options>] [<command>]",
2820 "perf trace [<options>] -- <command> [<options>]",
2821 "perf trace record [<options>] [<command>]",
2822 "perf trace record [<options>] -- <command> [<options>]",
2825 struct trace trace = {
2834 .user_freq = UINT_MAX,
2835 .user_interval = ULLONG_MAX,
2836 .no_buffering = true,
2837 .mmap_pages = UINT_MAX,
2838 .proc_map_timeout = 500,
2842 .trace_syscalls = true,
2843 .kernel_syscallchains = false,
2844 .max_stack = UINT_MAX,
2846 const char *output_name = NULL;
2847 const struct option trace_options[] = {
2848 OPT_CALLBACK('e', "event", &trace, "event",
2849 "event/syscall selector. use 'perf list' to list available events",
2850 trace__parse_events_option),
2851 OPT_BOOLEAN(0, "comm", &trace.show_comm,
2852 "show the thread COMM next to its id"),
2853 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2854 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2855 trace__parse_events_option),
2856 OPT_STRING('o', "output", &output_name, "file", "output file name"),
2857 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2858 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2859 "trace events on existing process id"),
2860 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2861 "trace events on existing thread id"),
2862 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2863 "pids to filter (by the kernel)", trace__set_filter_pids),
2864 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2865 "system-wide collection from all CPUs"),
2866 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2867 "list of cpus to monitor"),
2868 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2869 "child tasks do not inherit counters"),
2870 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2871 "number of mmap data pages",
2872 perf_evlist__parse_mmap_pages),
2873 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2875 OPT_CALLBACK(0, "duration", &trace, "float",
2876 "show only events with duration > N.M ms",
2877 trace__set_duration),
2878 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2879 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2880 OPT_BOOLEAN('T', "time", &trace.full_time,
2881 "Show full timestamp, not time relative to first start"),
2882 OPT_BOOLEAN('s', "summary", &trace.summary_only,
2883 "Show only syscall summary with statistics"),
2884 OPT_BOOLEAN('S', "with-summary", &trace.summary,
2885 "Show all syscalls and summary with statistics"),
2886 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2887 "Trace pagefaults", parse_pagefaults, "maj"),
2888 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2889 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2890 OPT_CALLBACK(0, "call-graph", &trace.opts,
2891 "record_mode[,record_size]", record_callchain_help,
2892 &record_parse_callchain_opt),
2893 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2894 "Show the kernel callchains on the syscall exit path"),
2895 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2896 "Set the minimum stack depth when parsing the callchain, "
2897 "anything below the specified depth will be ignored."),
2898 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2899 "Set the maximum stack depth when parsing the callchain, "
2900 "anything beyond the specified depth will be ignored. "
2901 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2902 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2903 "per thread proc mmap processing timeout in ms"),
2904 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2905 "ms to wait before starting measurement after program "
2909 bool __maybe_unused max_stack_user_set = true;
2910 bool mmap_pages_user_set = true;
2911 const char * const trace_subcommands[] = { "record", NULL };
2915 signal(SIGSEGV, sighandler_dump_stack);
2916 signal(SIGFPE, sighandler_dump_stack);
2918 trace.evlist = perf_evlist__new();
2919 trace.sctbl = syscalltbl__new();
2921 if (trace.evlist == NULL || trace.sctbl == NULL) {
2922 pr_err("Not enough memory to run!\n");
2927 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2928 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2930 err = bpf__setup_stdout(trace.evlist);
2932 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2933 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2939 if (trace.trace_pgfaults) {
2940 trace.opts.sample_address = true;
2941 trace.opts.sample_time = true;
2944 if (trace.opts.mmap_pages == UINT_MAX)
2945 mmap_pages_user_set = false;
2947 if (trace.max_stack == UINT_MAX) {
2948 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2949 max_stack_user_set = false;
2952 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2953 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2954 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2957 if (callchain_param.enabled) {
2958 if (!mmap_pages_user_set && geteuid() == 0)
2959 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2961 symbol_conf.use_callchain = true;
2964 if (trace.evlist->nr_entries > 0)
2965 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2967 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2968 return trace__record(&trace, argc-1, &argv[1]);
2970 /* summary_only implies summary option, but don't overwrite summary if set */
2971 if (trace.summary_only)
2972 trace.summary = trace.summary_only;
2974 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2975 trace.evlist->nr_entries == 0 /* Was --events used? */) {
2976 pr_err("Please specify something to trace.\n");
2980 if (!trace.trace_syscalls && trace.ev_qualifier) {
2981 pr_err("The -e option can't be used with --no-syscalls.\n");
2985 if (output_name != NULL) {
2986 err = trace__open_output(&trace, output_name);
2988 perror("failed to create output file");
2993 trace.open_id = syscalltbl__id(trace.sctbl, "open");
2995 err = target__validate(&trace.opts.target);
2997 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2998 fprintf(trace.output, "%s", bf);
3002 err = target__parse_uid(&trace.opts.target);
3004 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3005 fprintf(trace.output, "%s", bf);
3009 if (!argc && target__none(&trace.opts.target))
3010 trace.opts.target.system_wide = true;
3013 err = trace__replay(&trace);
3015 err = trace__run(&trace, argc, argv);
3018 if (output_name != NULL)
3019 fclose(trace.output);