/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <inttypes.h>
#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>

#include "sane_ctype.h"

#ifndef O_CLOEXEC
# define O_CLOEXEC              02000000
#endif

struct trace {
        struct perf_tool        tool;
        struct syscalltbl       *sctbl;
        struct {
                int             max;
                struct syscall  *table;
                struct {
                        struct perf_evsel *sys_enter,
                                          *sys_exit;
                }               events;
        } syscalls;
        struct record_opts      opts;
        struct perf_evlist      *evlist;
        struct machine          *host;
        struct thread           *current;
        u64                     base_time;
        FILE                    *output;
        unsigned long           nr_events;
        struct strlist          *ev_qualifier;
        struct {
                size_t          nr;
                int             *entries;
        }                       ev_qualifier_ids;
        struct {
                size_t          nr;
                pid_t           *entries;
        }                       filter_pids;
        double                  duration_filter;
        double                  runtime_ms;
        struct {
                u64             vfs_getname,
                                proc_getname;
        } stats;
        unsigned int            max_stack;
        unsigned int            min_stack;
        bool                    not_ev_qualifier;
        bool                    live;
        bool                    full_time;
        bool                    sched;
        bool                    multiple_threads;
        bool                    summary;
        bool                    summary_only;
        bool                    show_comm;
        bool                    show_tool_stats;
        bool                    trace_syscalls;
        bool                    kernel_syscallchains;
        bool                    force;
        bool                    vfs_getname;
        int                     trace_pgfaults;
        int                     open_id;
};

struct tp_field {
        int offset;
        union {
                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
        };
};

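/*
 * Generate fixed width accessors (tp_field__u8 .. tp_field__u64) that read an
 * unsigned integer straight out of the sample's raw_data at the field offset,
 * going through memcpy() so that unaligned offsets are safe.
 */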
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

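/*
 * Pick the accessor matching the tracepoint field size, using the
 * byte-swapping variant when the recorded data and the host differ in
 * endianness (needs_swap).
 */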
static int tp_field__init_uint(struct tp_field *field,
                               struct format_field *format_field,
                               bool needs_swap)
{
        field->offset = format_field->offset;

        switch (format_field->size) {
        case 1:
                field->integer = tp_field__u8;
                break;
        case 2:
                field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
                break;
        case 4:
                field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
                break;
        case 8:
                field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
                break;
        default:
                return -1;
        }

        return 0;
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
        return sample->raw_data + field->offset;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
        field->offset = format_field->offset;
        field->pointer = tp_field__ptr;
        return 0;
}

struct syscall_tp {
        struct tp_field id;
        union {
                struct tp_field args, ret;
        };
};

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
                                          struct tp_field *field,
                                          const char *name)
{
        struct format_field *format_field = perf_evsel__field(evsel, name);

        if (format_field == NULL)
                return -1;

        return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *format_field = perf_evsel__field(evsel, name);

        if (format_field == NULL)
                return -1;

        return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
        zfree(&evsel->priv);
        perf_evsel__delete(evsel);
}

static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
        evsel->priv = malloc(sizeof(struct syscall_tp));
        if (evsel->priv != NULL) {
                if (perf_evsel__init_sc_tp_uint_field(evsel, id))
                        goto out_delete;

                evsel->handler = handler;
                return 0;
        }

        return -ENOMEM;

out_delete:
        zfree(&evsel->priv);
        return -ENOENT;
}

static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

        /* older kernels (e.g., RHEL6) use syscalls:{sys_enter,sys_exit} */
        if (IS_ERR(evsel))
                evsel = perf_evsel__newtp("syscalls", direction);

        if (IS_ERR(evsel))
                return NULL;

        if (perf_evsel__init_syscall_tp(evsel, handler))
                goto out_delete;

        return evsel;

out_delete:
        perf_evsel__delete_priv(evsel);
        return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })

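/*
 * Maps integer syscall argument values to strings. ->offset is the value of
 * the first entry, so that arrays whose constants don't start at zero (e.g.
 * the epoll_ctl ops start at 1, the tty ioctls at 0x5401) can still be
 * indexed with (val - offset).
 */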
struct strarray {
        int         offset;
        int         nr_entries;
        const char **entries;
};

#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}

#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
        .offset     = off, \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                                const char *intfmt,
                                                struct syscall_arg *arg)
{
        struct strarray *sa = arg->parm;
        int idx = arg->val - sa->offset;

        if (idx < 0 || idx >= sa->nr_entries)
                return scnprintf(bf, size, intfmt, arg->val);

        return scnprintf(bf, size, "%s", sa->entries[idx]);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
                                                 struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */

static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
                                        struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

#ifndef AT_FDCWD
#define AT_FDCWD        -100
#endif

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
                                           struct syscall_arg *arg)
{
        int fd = arg->val;

        if (fd == AT_FDCWD)
                return scnprintf(bf, size, "CWD");

        return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
                                         struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex

static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
                                         struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%d", arg->val);
}

#define SCA_INT syscall_arg__scnprintf_int

static const char *bpf_cmd[] = {
        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
        "MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
        "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
        "F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *rlimit_resources[] = {
        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
        "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
        "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
        "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
                                                 struct syscall_arg *arg)
{
        size_t printed = 0;
        int mode = arg->val;

        if (mode == F_OK) /* 0 */
                return scnprintf(bf, size, "F");
#define P_MODE(n) \
        if (mode & n##_OK) { \
                printed += scnprintf(bf + printed, size - printed, "%s", #n); \
                mode &= ~n##_OK; \
        }

        P_MODE(R);
        P_MODE(W);
        P_MODE(X);
#undef P_MODE

        if (mode)
                printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

        return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
                                                struct syscall_arg *arg)
{
        int printed = 0, flags = arg->val;

#define P_FLAG(n) \
        if (flags & O_##n) { \
                printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
                flags &= ~O_##n; \
        }

        P_FLAG(CLOEXEC);
        P_FLAG(NONBLOCK);
#undef P_FLAG

        if (flags)
                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

        return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS          0x5401

static const char *tioctls[] = {
        "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
        "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
        "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
        "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
        "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
        "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
        "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
        "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
        "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
        "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
        "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
        [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
        "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
        "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
        "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK   0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM     0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
                                                   struct syscall_arg *arg)
{
        int printed = 0, flags = arg->val;

#define P_FLAG(n) \
        if (flags & GRND_##n) { \
                printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
                flags &= ~GRND_##n; \
        }

        P_FLAG(RANDOM);
        P_FLAG(NONBLOCK);
#undef P_FLAG

        if (flags)
                printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

        return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

#define STRARRAY(arg, name, array) \
          .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
          .arg_parm      = { [arg] = &strarray__##array, }
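/*
 * For example, STRARRAY(0, cmd, bpf_cmd) in a syscall_fmt initializer expands
 * to:
 *
 *        .arg_scnprintf = { [0] = SCA_STRARRAY, },
 *        .arg_parm      = { [0] = &strarray__bpf_cmd, }
 *
 * i.e. argument 0 gets printed via the bpf_cmd string array; the 'name'
 * parameter only documents which argument is being beautified.
 */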

#include "trace/beauty/eventfd.c"
#include "trace/beauty/flock.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"

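/*
 * Per syscall formatting hints. NB: this table is looked up with bsearch()
 * in syscall_fmt__find(), so it has to be kept sorted by ->name.
 */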
static struct syscall_fmt {
        const char *name;
        const char *alias;
        size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
        void       *arg_parm[6];
        bool       errmsg;
        bool       errpid;
        bool       timeout;
        bool       hexret;
} syscall_fmts[] = {
        { .name     = "access",     .errmsg = true,
          .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
        { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
        { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
        { .name     = "brk",        .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
        { .name     = "chdir",      .errmsg = true, },
        { .name     = "chmod",      .errmsg = true, },
        { .name     = "chroot",     .errmsg = true, },
        { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
        { .name     = "clone",      .errpid = true, },
        { .name     = "close",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
        { .name     = "connect",    .errmsg = true, },
        { .name     = "creat",      .errmsg = true, },
        { .name     = "dup",        .errmsg = true, },
        { .name     = "dup2",       .errmsg = true, },
        { .name     = "dup3",       .errmsg = true, },
        { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
        { .name     = "eventfd2",   .errmsg = true,
          .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
        { .name     = "faccessat",  .errmsg = true, },
        { .name     = "fadvise64",  .errmsg = true, },
        { .name     = "fallocate",  .errmsg = true, },
        { .name     = "fchdir",     .errmsg = true, },
        { .name     = "fchmod",     .errmsg = true, },
        { .name     = "fchmodat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "fchown",     .errmsg = true, },
        { .name     = "fchownat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "fcntl",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
          .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
        { .name     = "fdatasync",  .errmsg = true, },
        { .name     = "flock",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
        { .name     = "fsetxattr",  .errmsg = true, },
        { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
        { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
        { .name     = "fstatfs",    .errmsg = true, },
        { .name     = "fsync",    .errmsg = true, },
        { .name     = "ftruncate", .errmsg = true, },
        { .name     = "futex",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
        { .name     = "futimesat", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "getdents",   .errmsg = true, },
        { .name     = "getdents64", .errmsg = true, },
        { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "getpgid",    .errpid = true, },
        { .name     = "getpid",     .errpid = true, },
        { .name     = "getppid",    .errpid = true, },
        { .name     = "getrandom",  .errmsg = true,
          .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
        { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
        { .name     = "getxattr",   .errmsg = true, },
        { .name     = "inotify_add_watch",          .errmsg = true, },
        { .name     = "ioctl",      .errmsg = true,
          .arg_scnprintf = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
                             [1] = SCA_STRHEXARRAY, /* cmd */
                             [2] = SCA_HEX, /* arg */ },
          .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
                             [2] = SCA_HEX, /* arg */ }, },
#endif
        { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
        { .name     = "kill",       .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "lchown",    .errmsg = true, },
        { .name     = "lgetxattr",  .errmsg = true, },
        { .name     = "linkat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "listxattr",  .errmsg = true, },
        { .name     = "llistxattr", .errmsg = true, },
        { .name     = "lremovexattr",  .errmsg = true, },
        { .name     = "lseek",      .errmsg = true,
          .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
          .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
        { .name     = "lsetxattr",  .errmsg = true, },
        { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
        { .name     = "lsxattr",    .errmsg = true, },
        { .name     = "madvise",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX,      /* start */
                             [2] = SCA_MADV_BHV, /* behavior */ }, },
        { .name     = "mkdir",    .errmsg = true, },
        { .name     = "mkdirat",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "mknod",      .errmsg = true, },
        { .name     = "mknodat",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "mlock",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "mlockall",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "mmap",       .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
                             [2] = SCA_MMAP_PROT, /* prot */
                             [3] = SCA_MMAP_FLAGS, /* flags */ }, },
        { .name     = "mprotect",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* start */
                             [2] = SCA_MMAP_PROT, /* prot */ }, },
        { .name     = "mq_unlink", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
        { .name     = "mremap",     .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */
                             [3] = SCA_MREMAP_FLAGS, /* flags */
                             [4] = SCA_HEX, /* new_addr */ }, },
        { .name     = "munlock",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "munmap",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "name_to_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "newfstatat", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "open",       .errmsg = true,
          .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "open_by_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "openat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "perf_event_open", .errmsg = true,
          .arg_scnprintf = { [2] = SCA_INT, /* cpu */
                             [3] = SCA_FD,  /* group_fd */
                             [4] = SCA_PERF_FLAGS,  /* flags */ }, },
        { .name     = "pipe2",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
        { .name     = "poll",       .errmsg = true, .timeout = true, },
        { .name     = "ppoll",      .errmsg = true, .timeout = true, },
        { .name     = "pread",      .errmsg = true, .alias = "pread64", },
        { .name     = "preadv",     .errmsg = true, .alias = "pread", },
        { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
        { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
        { .name     = "pwritev",    .errmsg = true, },
        { .name     = "read",       .errmsg = true, },
        { .name     = "readlink",   .errmsg = true, },
        { .name     = "readlinkat", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "readv",      .errmsg = true, },
        { .name     = "recvfrom",   .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmmsg",   .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmsg",    .errmsg = true,
          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "removexattr", .errmsg = true, },
        { .name     = "renameat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "rmdir",    .errmsg = true, },
        { .name     = "rt_sigaction", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
        { .name     = "rt_sigqueueinfo", .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "rt_tgsigqueueinfo", .errmsg = true,
          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "sched_getattr",        .errmsg = true, },
        { .name     = "sched_setattr",        .errmsg = true, },
        { .name     = "sched_setscheduler",   .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
        { .name     = "seccomp", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
                             [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
        { .name     = "select",     .errmsg = true, .timeout = true, },
        { .name     = "sendmmsg",    .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendmsg",    .errmsg = true,
          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendto",     .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "set_tid_address", .errpid = true, },
        { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "setpgid",    .errmsg = true, },
        { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
        { .name     = "setxattr",   .errmsg = true, },
        { .name     = "shutdown",   .errmsg = true, },
        { .name     = "socket",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
        { .name     = "socketpair", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
        { .name     = "stat",       .errmsg = true, .alias = "newstat", },
        { .name     = "statfs",     .errmsg = true, },
        { .name     = "statx",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                             [2] = SCA_STATX_FLAGS, /* flags */
                             [3] = SCA_STATX_MASK, /* mask */ }, },
        { .name     = "swapoff",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
        { .name     = "swapon",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
        { .name     = "symlinkat",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "tgkill",     .errmsg = true,
          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "tkill",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "truncate",   .errmsg = true, },
        { .name     = "uname",      .errmsg = true, .alias = "newuname", },
        { .name     = "unlinkat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "utime",  .errmsg = true, },
        { .name     = "utimensat",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
        { .name     = "utimes",  .errmsg = true, },
        { .name     = "vmsplice",  .errmsg = true, },
        { .name     = "wait4",      .errpid = true,
          .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
        { .name     = "waitid",     .errpid = true,
          .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
        { .name     = "write",      .errmsg = true, },
        { .name     = "writev",     .errmsg = true, },
};

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
        const struct syscall_fmt *fmt = fmtp;
        return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
        const int nmemb = ARRAY_SIZE(syscall_fmts);
        return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}

struct syscall {
        struct event_format *tp_format;
        int                 nr_args;
        struct format_field *args;
        const char          *name;
        bool                is_exit;
        struct syscall_fmt  *fmt;
        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        void                **arg_parm;
};

/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what the duration of a syscall is, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? )" for the duration and for
 * the start timestamp.
 */
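/*
 * Color the duration by magnitude: >= 1ms in red, >= 0.01ms in yellow,
 * anything shorter in the normal color.
 */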
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
        double duration = (double)t / NSEC_PER_MSEC;
        size_t printed = fprintf(fp, "(");

        if (!calculated)
                printed += fprintf(fp, "     ?   ");
        else if (duration >= 1.0)
                printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
        else if (duration >= 0.01)
                printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
        else
                printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
        return printed + fprintf(fp, "): ");
}

/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
struct thread_trace {
        u64               entry_time;
        bool              entry_pending;
        unsigned long     nr_events;
        unsigned long     pfmaj, pfmin;
        char              *entry_str;
        double            runtime_ms;
        struct {
                unsigned long ptr;
                short int     entry_str_pos;
                bool          pending_open;
                unsigned int  namelen;
                char          *name;
        } filename;
        struct {
                int       max;
                char      **table;
        } paths;

        struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
        struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

        if (ttrace) {
                ttrace->paths.max = -1;
                ttrace->syscall_stats = intlist__new(NULL);
        }

        return ttrace;
}

static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
        struct thread_trace *ttrace;

        if (thread == NULL)
                goto fail;

        if (thread__priv(thread) == NULL)
                thread__set_priv(thread, thread_trace__new());

        if (thread__priv(thread) == NULL)
                goto fail;

        ttrace = thread__priv(thread);
        ++ttrace->nr_events;

        return ttrace;
fail:
        color_fprintf(fp, PERF_COLOR_RED,
                      "WARNING: not enough memory, dropping samples!\n");
        return NULL;
}

#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

static const size_t trace__entry_str_size = 2048;

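/*
 * Cache the pathname for a given fd in the per-thread ->paths table, growing
 * (and zero filling) the table as needed so it can be indexed directly by fd.
 */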
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
        struct thread_trace *ttrace = thread__priv(thread);

        if (fd > ttrace->paths.max) {
                char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

                if (npath == NULL)
                        return -1;

                if (ttrace->paths.max != -1) {
                        memset(npath + ttrace->paths.max + 1, 0,
                               (fd - ttrace->paths.max) * sizeof(char *));
                } else {
                        memset(npath, 0, (fd + 1) * sizeof(char *));
                }

                ttrace->paths.table = npath;
                ttrace->paths.max   = fd;
        }

        ttrace->paths.table[fd] = strdup(pathname);

        return ttrace->paths.table[fd] != NULL ? 0 : -1;
}

static int thread__read_fd_path(struct thread *thread, int fd)
{
        char linkname[PATH_MAX], pathname[PATH_MAX];
        struct stat st;
        int ret;

        if (thread->pid_ == thread->tid) {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/fd/%d", thread->pid_, fd);
        } else {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
        }

        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
                return -1;

        ret = readlink(linkname, pathname, sizeof(pathname));

        if (ret < 0 || ret > st.st_size)
                return -1;

        pathname[ret] = '\0';
        return trace__set_fd_pathname(thread, fd, pathname);
}

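/*
 * Returns the cached path for this fd, lazily resolving it via
 * /proc/<pid>/fd/<fd> when the cache doesn't have it yet, which only makes
 * sense when tracing a live system.
 */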
static const char *thread__fd_path(struct thread *thread, int fd,
                                   struct trace *trace)
{
        struct thread_trace *ttrace = thread__priv(thread);

        if (ttrace == NULL)
                return NULL;

        if (fd < 0)
                return NULL;

        if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
                if (!trace->live)
                        return NULL;
                ++trace->stats.proc_getname;
                if (thread__read_fd_path(thread, fd))
                        return NULL;
        }

        return ttrace->paths.table[fd];
}

static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
                                        struct syscall_arg *arg)
{
        int fd = arg->val;
        size_t printed = scnprintf(bf, size, "%d", fd);
        const char *path = thread__fd_path(arg->thread, fd, arg->trace);

        if (path)
                printed += scnprintf(bf + printed, size - printed, "<%s>", path);

        return printed;
}

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        int fd = arg->val;
        size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
        struct thread_trace *ttrace = thread__priv(arg->thread);

        if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
                zfree(&ttrace->paths.table[fd]);

        return printed;
}

static void thread__set_filename_pos(struct thread *thread, const char *bf,
                                     unsigned long ptr)
{
        struct thread_trace *ttrace = thread__priv(thread);

        ttrace->filename.ptr = ptr;
        ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        unsigned long ptr = arg->val;

        if (!arg->trace->vfs_getname)
                return scnprintf(bf, size, "%#x", ptr);

        thread__set_filename_pos(arg->thread, bf, ptr);
        return 0;
}

static bool trace__filter_duration(struct trace *trace, double t)
{
        return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
        double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

        return fprintf(fp, "%10.3f ", ts);
}

/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before the tracing session
 * starts, sys_enter lost due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
        if (tstamp > 0)
                return __trace__fprintf_tstamp(trace, tstamp, fp);

        return fprintf(fp, "         ? ");
}

static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}

static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
                                        u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
        size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
        printed += fprintf_duration(duration, duration_calculated, fp);

        if (trace->multiple_threads) {
                if (trace->show_comm)
                        printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
                printed += fprintf(fp, "%d ", thread->tid);
        }

        return printed;
}

static int trace__process_event(struct trace *trace, struct machine *machine,
                                union perf_event *event, struct perf_sample *sample)
{
        int ret = 0;

        switch (event->header.type) {
        case PERF_RECORD_LOST:
                color_fprintf(trace->output, PERF_COLOR_RED,
                              "LOST %" PRIu64 " events!\n", event->lost.lost);
                ret = machine__process_lost_event(machine, event, sample);
                break;
        default:
                ret = machine__process_event(machine, event, sample);
                break;
        }

        return ret;
}

static int trace__tool_process(struct perf_tool *tool,
                               union perf_event *event,
                               struct perf_sample *sample,
                               struct machine *machine)
{
        struct trace *trace = container_of(tool, struct trace, tool);
        return trace__process_event(trace, machine, event, sample);
}

static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
        struct machine *machine = vmachine;

        if (machine->kptr_restrict_warned)
                return NULL;

        if (symbol_conf.kptr_restrict) {
                pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
                           "Check /proc/sys/kernel/kptr_restrict.\n\n"
                           "Kernel samples will not be resolved.\n");
                machine->kptr_restrict_warned = true;
                return NULL;
        }

        return machine__resolve_kernel_addr(vmachine, addrp, modp);
}

static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
        int err = symbol__init(NULL);

        if (err)
                return err;

        trace->host = machine__new_host();
        if (trace->host == NULL)
                return -ENOMEM;

        if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
                return -errno;

        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
                                            evlist->threads, trace__tool_process, false,
                                            trace->opts.proc_map_timeout);
        if (err)
                symbol__exit();

        return err;
}

static int syscall__set_arg_fmts(struct syscall *sc)
{
        struct format_field *field;
        int idx = 0, len;

        sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
        if (sc->arg_scnprintf == NULL)
                return -1;

        if (sc->fmt)
                sc->arg_parm = sc->fmt->arg_parm;

        for (field = sc->args; field; field = field->next) {
                if (sc->fmt && sc->fmt->arg_scnprintf[idx])
                        sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
                else if (strcmp(field->type, "const char *") == 0 &&
                         (strcmp(field->name, "filename") == 0 ||
                          strcmp(field->name, "path") == 0 ||
                          strcmp(field->name, "pathname") == 0))
                        sc->arg_scnprintf[idx] = SCA_FILENAME;
                else if (field->flags & FIELD_IS_POINTER)
                        sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_MODE_T;
                else if ((strcmp(field->type, "int") == 0 ||
                          strcmp(field->type, "unsigned int") == 0 ||
                          strcmp(field->type, "long") == 0) &&
                         (len = strlen(field->name)) >= 2 &&
                         strcmp(field->name + len - 2, "fd") == 0) {
                        /*
                         * /sys/kernel/tracing/events/syscalls/sys_enter*
                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
                         * 65 int
                         * 23 unsigned int
                         * 7 unsigned long
                         */
                        sc->arg_scnprintf[idx] = SCA_FD;
                }
                ++idx;
        }

        return 0;
}

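/*
 * Lazily grow trace->syscalls.table (indexed by syscall id) and fill in this
 * syscall's name, formatting hints and tracepoint format, falling back to the
 * fmt->alias when the tracepoint for the primary name doesn't exist.
 */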
static int trace__read_syscall_info(struct trace *trace, int id)
{
        char tp_name[128];
        struct syscall *sc;
        const char *name = syscalltbl__name(trace->sctbl, id);

        if (name == NULL)
                return -1;

        if (id > trace->syscalls.max) {
                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

                if (nsyscalls == NULL)
                        return -1;

                if (trace->syscalls.max != -1) {
                        memset(nsyscalls + trace->syscalls.max + 1, 0,
                               (id - trace->syscalls.max) * sizeof(*sc));
                } else {
                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
                }

                trace->syscalls.table = nsyscalls;
                trace->syscalls.max   = id;
        }

        sc = trace->syscalls.table + id;
        sc->name = name;

        sc->fmt  = syscall_fmt__find(sc->name);

        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
        sc->tp_format = trace_event__tp_format("syscalls", tp_name);

        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
        }

        if (IS_ERR(sc->tp_format))
                return -1;

        sc->args = sc->tp_format->format.fields;
        sc->nr_args = sc->tp_format->format.nr_fields;
        /*
         * We need to check and discard the first variable, '__syscall_nr' or
         * 'nr', which holds the syscall number and is redundant here. Note
         * that it is not present on older kernels.
         */
        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
                sc->args = sc->args->next;
                --sc->nr_args;
        }

        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

        return syscall__set_arg_fmts(sc);
}

static int trace__validate_ev_qualifier(struct trace *trace)
{
        int err = 0, i;
        struct str_node *pos;

        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
                                                 sizeof(trace->ev_qualifier_ids.entries[0]));

        if (trace->ev_qualifier_ids.entries == NULL) {
                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
                       trace->output);
                err = -EINVAL;
                goto out;
        }

        i = 0;

        strlist__for_each_entry(pos, trace->ev_qualifier) {
                const char *sc = pos->s;
                int id = syscalltbl__id(trace->sctbl, sc);

                if (id < 0) {
                        if (err == 0) {
                                fputs("Error:\tInvalid syscall ", trace->output);
                                err = -EINVAL;
                        } else {
                                fputs(", ", trace->output);
                        }

                        fputs(sc, trace->output);
                }

                trace->ev_qualifier_ids.entries[i++] = id;
        }

        if (err < 0) {
                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
                      "\nHint:\tand: 'man syscalls'\n", trace->output);
                zfree(&trace->ev_qualifier_ids.entries);
                trace->ev_qualifier_ids.nr = 0;
        }
out:
        return err;
}

/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is not guaranteed to be 8-byte aligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 */

static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                                      unsigned char *args, struct trace *trace,
                                      struct thread *thread)
{
        size_t printed = 0;
        unsigned char *p;
        unsigned long val;

        if (sc->args != NULL) {
                struct format_field *field;
                u8 bit = 1;
                struct syscall_arg arg = {
                        .idx    = 0,
                        .mask   = 0,
                        .trace  = trace,
                        .thread = thread,
                };

                for (field = sc->args; field;
                     field = field->next, ++arg.idx, bit <<= 1) {
                        if (arg.mask & bit)
                                continue;

                        /* special care for unaligned accesses */
                        p = args + sizeof(unsigned long) * arg.idx;
                        memcpy(&val, p, sizeof(val));

                        /*
                         * Suppress this argument if its value is zero and
                         * we don't have a string associated with it in a
                         * strarray.
                         */
1348                         if (val == 0 &&
1349                             !(sc->arg_scnprintf &&
1350                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1351                               sc->arg_parm[arg.idx]))
1352                                 continue;
1353
1354                         printed += scnprintf(bf + printed, size - printed,
1355                                              "%s%s: ", printed ? ", " : "", field->name);
1356                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1357                                 arg.val = val;
1358                                 if (sc->arg_parm)
1359                                         arg.parm = sc->arg_parm[arg.idx];
1360                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1361                                                                       size - printed, &arg);
1362                         } else {
1363                                 printed += scnprintf(bf + printed, size - printed,
1364                                                      "%ld", val);
1365                         }
1366                 }
1367         } else if (IS_ERR(sc->tp_format)) {
1368                 /*
1369                  * If we managed to read the tracepoint /format file, then we
1370                  * may end up not having any args, like with gettid(), so only
1371                  * print the raw args when we didn't manage to read it.
1372                  */
1373                 int i = 0;
1374
1375                 while (i < 6) {
1376                         /* special care for unaligned accesses */
1377                         p = args + sizeof(unsigned long) * i;
1378                         memcpy(&val, p, sizeof(val));
1379                         printed += scnprintf(bf + printed, size - printed,
1380                                              "%sarg%d: %ld",
1381                                              printed ? ", " : "", i, val);
1382                         ++i;
1383                 }
1384         }
1385
1386         return printed;
1387 }
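
/*
 * Editor's note: a minimal sketch (not part of the original source) of the
 * unaligned-read pattern used in syscall__scnprintf_args() above, where
 * 'args' points at the raw tracepoint payload and 'idx' is the argument
 * index; copying through memcpy() keeps the load safe on any alignment:
 *
 *	unsigned long val;
 *	unsigned char *p = args + sizeof(unsigned long) * idx;
 *
 *	memcpy(&val, p, sizeof(val));
 */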
1388
1389 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1390                                   union perf_event *event,
1391                                   struct perf_sample *sample);
1392
1393 static struct syscall *trace__syscall_info(struct trace *trace,
1394                                            struct perf_evsel *evsel, int id)
1395 {
1396
1397         if (id < 0) {
1398
1399                 /*
1400                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1401                  * before that, leaving at a higher verbosity level till that is
1402                  * explained. Reproduced with plain ftrace with:
1403                  *
1404                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1405                  * grep "NR -1 " /t/trace_pipe
1406                  *
1407                  * After generating some load on the machine.
1408                  */
1409                 if (verbose > 1) {
1410                         static u64 n;
1411                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1412                                 id, perf_evsel__name(evsel), ++n);
1413                 }
1414                 return NULL;
1415         }
1416
1417         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1418             trace__read_syscall_info(trace, id))
1419                 goto out_cant_read;
1420
1421         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1422                 goto out_cant_read;
1423
1424         return &trace->syscalls.table[id];
1425
1426 out_cant_read:
1427         if (verbose > 0) {
1428                 fprintf(trace->output, "Problems reading syscall %d", id);
1429                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1430                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1431                 fputs(" information\n", trace->output);
1432         }
1433         return NULL;
1434 }
1435
1436 static void thread__update_stats(struct thread_trace *ttrace,
1437                                  int id, struct perf_sample *sample)
1438 {
1439         struct int_node *inode;
1440         struct stats *stats;
1441         u64 duration = 0;
1442
1443         inode = intlist__findnew(ttrace->syscall_stats, id);
1444         if (inode == NULL)
1445                 return;
1446
1447         stats = inode->priv;
1448         if (stats == NULL) {
1449                 stats = malloc(sizeof(struct stats));
1450                 if (stats == NULL)
1451                         return;
1452                 init_stats(stats);
1453                 inode->priv = stats;
1454         }
1455
1456         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1457                 duration = sample->time - ttrace->entry_time;
1458
1459         update_stats(stats, duration);
1460 }
1461
1462 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1463 {
1464         struct thread_trace *ttrace;
1465         u64 duration;
1466         size_t printed;
1467
1468         if (trace->current == NULL)
1469                 return 0;
1470
1471         ttrace = thread__priv(trace->current);
1472
1473         if (!ttrace->entry_pending)
1474                 return 0;
1475
1476         duration = sample->time - ttrace->entry_time;
1477
1478         printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1479         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1480         ttrace->entry_pending = false;
1481
1482         return printed;
1483 }
1484
1485 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1486                             union perf_event *event __maybe_unused,
1487                             struct perf_sample *sample)
1488 {
1489         char *msg;
1490         void *args;
1491         size_t printed = 0;
1492         struct thread *thread;
1493         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1494         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1495         struct thread_trace *ttrace;
1496
1497         if (sc == NULL)
1498                 return -1;
1499
1500         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1501         ttrace = thread__trace(thread, trace->output);
1502         if (ttrace == NULL)
1503                 goto out_put;
1504
1505         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1506
1507         if (ttrace->entry_str == NULL) {
1508                 ttrace->entry_str = malloc(trace__entry_str_size);
1509                 if (!ttrace->entry_str)
1510                         goto out_put;
1511         }
1512
1513         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1514                 trace__printf_interrupted_entry(trace, sample);
1515
1516         ttrace->entry_time = sample->time;
1517         msg = ttrace->entry_str;
1518         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1519
1520         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1521                                            args, trace, thread);
1522
1523         if (sc->is_exit) {
1524                 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1525                         trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1526                         fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1527                 }
1528         } else {
1529                 ttrace->entry_pending = true;
1530                 /* See trace__vfs_getname & trace__sys_exit */
1531                 ttrace->filename.pending_open = false;
1532         }
1533
1534         if (trace->current != thread) {
1535                 thread__put(trace->current);
1536                 trace->current = thread__get(thread);
1537         }
1538         err = 0;
1539 out_put:
1540         thread__put(thread);
1541         return err;
1542 }
1543
1544 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1545                                     struct perf_sample *sample,
1546                                     struct callchain_cursor *cursor)
1547 {
1548         struct addr_location al;
1549
1550         if (machine__resolve(trace->host, &al, sample) < 0 ||
1551             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1552                 return -1;
1553
1554         return 0;
1555 }
1556
1557 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1558 {
1559         /* TODO: user-configurable print_opts */
1560         const unsigned int print_opts = EVSEL__PRINT_SYM |
1561                                         EVSEL__PRINT_DSO |
1562                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1563
1564         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1565 }
1566
1567 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1568                            union perf_event *event __maybe_unused,
1569                            struct perf_sample *sample)
1570 {
1571         long ret;
1572         u64 duration = 0;
1573         bool duration_calculated = false;
1574         struct thread *thread;
1575         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1576         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1577         struct thread_trace *ttrace;
1578
1579         if (sc == NULL)
1580                 return -1;
1581
1582         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1583         ttrace = thread__trace(thread, trace->output);
1584         if (ttrace == NULL)
1585                 goto out_put;
1586
1587         if (trace->summary)
1588                 thread__update_stats(ttrace, id, sample);
1589
1590         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1591
1592         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1593                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1594                 ttrace->filename.pending_open = false;
1595                 ++trace->stats.vfs_getname;
1596         }
1597
1598         if (ttrace->entry_time) {
1599                 duration = sample->time - ttrace->entry_time;
1600                 if (trace__filter_duration(trace, duration))
1601                         goto out;
1602                 duration_calculated = true;
1603         } else if (trace->duration_filter)
1604                 goto out;
1605
1606         if (sample->callchain) {
1607                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1608                 if (callchain_ret == 0) {
1609                         if (callchain_cursor.nr < trace->min_stack)
1610                                 goto out;
1611                         callchain_ret = 1;
1612                 }
1613         }
1614
1615         if (trace->summary_only)
1616                 goto out;
1617
1618         trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1619
1620         if (ttrace->entry_pending) {
1621                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1622         } else {
1623                 fprintf(trace->output, " ... [");
1624                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1625                 fprintf(trace->output, "]: %s()", sc->name);
1626         }
1627
1628         if (sc->fmt == NULL) {
1629 signed_print:
1630                 fprintf(trace->output, ") = %ld", ret);
1631         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1632                 char bf[STRERR_BUFSIZE];
1633                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1634                            *e = audit_errno_to_name(-ret);
1635
1636                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1637         } else if (ret == 0 && sc->fmt->timeout)
1638                 fprintf(trace->output, ") = 0 Timeout");
1639         else if (sc->fmt->hexret)
1640                 fprintf(trace->output, ") = %#lx", ret);
1641         else if (sc->fmt->errpid) {
1642                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1643
1644                 if (child != NULL) {
1645                         fprintf(trace->output, ") = %ld", ret);
1646                         if (child->comm_set)
1647                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1648                         thread__put(child);
1649                 }
1650         } else
1651                 goto signed_print;
1652
1653         fputc('\n', trace->output);
1654
1655         if (callchain_ret > 0)
1656                 trace__fprintf_callchain(trace, sample);
1657         else if (callchain_ret < 0)
1658                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1659 out:
1660         ttrace->entry_pending = false;
1661         err = 0;
1662 out_put:
1663         thread__put(thread);
1664         return err;
1665 }
1666
1667 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1668                               union perf_event *event __maybe_unused,
1669                               struct perf_sample *sample)
1670 {
1671         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1672         struct thread_trace *ttrace;
1673         size_t filename_len, entry_str_len, to_move;
1674         ssize_t remaining_space;
1675         char *pos;
1676         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1677
1678         if (!thread)
1679                 goto out;
1680
1681         ttrace = thread__priv(thread);
1682         if (!ttrace)
1683                 goto out_put;
1684
1685         filename_len = strlen(filename);
1686         if (filename_len == 0)
1687                 goto out_put;
1688
1689         if (ttrace->filename.namelen < filename_len) {
1690                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1691
1692                 if (f == NULL)
1693                         goto out_put;
1694
1695                 ttrace->filename.namelen = filename_len;
1696                 ttrace->filename.name = f;
1697         }
1698
1699         strcpy(ttrace->filename.name, filename);
1700         ttrace->filename.pending_open = true;
1701
1702         if (!ttrace->filename.ptr)
1703                 goto out_put;
1704
1705         entry_str_len = strlen(ttrace->entry_str);
1706         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1707         if (remaining_space <= 0)
1708                 goto out_put;
1709
1710         if (filename_len > (size_t)remaining_space) {
1711                 filename += filename_len - remaining_space;
1712                 filename_len = remaining_space;
1713         }
1714
1715         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1716         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1717         memmove(pos + filename_len, pos, to_move);
1718         memcpy(pos, filename, filename_len);
1719
1720         ttrace->filename.ptr = 0;
1721         ttrace->filename.entry_str_pos = 0;
1722 out_put:
1723         thread__put(thread);
1724 out:
1725         return 0;
1726 }
1727
1728 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1729                                      union perf_event *event __maybe_unused,
1730                                      struct perf_sample *sample)
1731 {
1732         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1733         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1734         struct thread *thread = machine__findnew_thread(trace->host,
1735                                                         sample->pid,
1736                                                         sample->tid);
1737         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1738
1739         if (ttrace == NULL)
1740                 goto out_dump;
1741
1742         ttrace->runtime_ms += runtime_ms;
1743         trace->runtime_ms += runtime_ms;
1744 out_put:
1745         thread__put(thread);
1746         return 0;
1747
1748 out_dump:
1749         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1750                evsel->name,
1751                perf_evsel__strval(evsel, sample, "comm"),
1752                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1753                runtime,
1754                perf_evsel__intval(evsel, sample, "vruntime"));
1755         goto out_put;
1756 }
1757
1758 static void bpf_output__printer(enum binary_printer_ops op,
1759                                 unsigned int val, void *extra)
1760 {
1761         FILE *output = extra;
1762         unsigned char ch = (unsigned char)val;
1763
1764         switch (op) {
1765         case BINARY_PRINT_CHAR_DATA:
1766                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1767                 break;
1768         case BINARY_PRINT_DATA_BEGIN:
1769         case BINARY_PRINT_LINE_BEGIN:
1770         case BINARY_PRINT_ADDR:
1771         case BINARY_PRINT_NUM_DATA:
1772         case BINARY_PRINT_NUM_PAD:
1773         case BINARY_PRINT_SEP:
1774         case BINARY_PRINT_CHAR_PAD:
1775         case BINARY_PRINT_LINE_END:
1776         case BINARY_PRINT_DATA_END:
1777         default:
1778                 break;
1779         }
1780 }
1781
1782 static void bpf_output__fprintf(struct trace *trace,
1783                                 struct perf_sample *sample)
1784 {
1785         print_binary(sample->raw_data, sample->raw_size, 8,
1786                      bpf_output__printer, trace->output);
1787 }
1788
1789 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1790                                 union perf_event *event __maybe_unused,
1791                                 struct perf_sample *sample)
1792 {
1793         int callchain_ret = 0;
1794
1795         if (sample->callchain) {
1796                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1797                 if (callchain_ret == 0) {
1798                         if (callchain_cursor.nr < trace->min_stack)
1799                                 goto out;
1800                         callchain_ret = 1;
1801                 }
1802         }
1803
1804         trace__printf_interrupted_entry(trace, sample);
1805         trace__fprintf_tstamp(trace, sample->time, trace->output);
1806
1807         if (trace->trace_syscalls)
1808                 fprintf(trace->output, "(         ): ");
1809
1810         fprintf(trace->output, "%s:", evsel->name);
1811
1812         if (perf_evsel__is_bpf_output(evsel)) {
1813                 bpf_output__fprintf(trace, sample);
1814         } else if (evsel->tp_format) {
1815                 event_format__fprintf(evsel->tp_format, sample->cpu,
1816                                       sample->raw_data, sample->raw_size,
1817                                       trace->output);
1818         }
1819
1820         fprintf(trace->output, ")\n");
1821
1822         if (callchain_ret > 0)
1823                 trace__fprintf_callchain(trace, sample);
1824         else if (callchain_ret < 0)
1825                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1826 out:
1827         return 0;
1828 }
1829
1830 static void print_location(FILE *f, struct perf_sample *sample,
1831                            struct addr_location *al,
1832                            bool print_dso, bool print_sym)
1833 {
1834
1835         if ((verbose > 0 || print_dso) && al->map)
1836                 fprintf(f, "%s@", al->map->dso->long_name);
1837
1838         if ((verbose > 0 || print_sym) && al->sym)
1839                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1840                         al->addr - al->sym->start);
1841         else if (al->map)
1842                 fprintf(f, "0x%" PRIx64, al->addr);
1843         else
1844                 fprintf(f, "0x%" PRIx64, sample->addr);
1845 }
1846
1847 static int trace__pgfault(struct trace *trace,
1848                           struct perf_evsel *evsel,
1849                           union perf_event *event __maybe_unused,
1850                           struct perf_sample *sample)
1851 {
1852         struct thread *thread;
1853         struct addr_location al;
1854         char map_type = 'd';
1855         struct thread_trace *ttrace;
1856         int err = -1;
1857         int callchain_ret = 0;
1858
1859         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1860
1861         if (sample->callchain) {
1862                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1863                 if (callchain_ret == 0) {
1864                         if (callchain_cursor.nr < trace->min_stack)
1865                                 goto out_put;
1866                         callchain_ret = 1;
1867                 }
1868         }
1869
1870         ttrace = thread__trace(thread, trace->output);
1871         if (ttrace == NULL)
1872                 goto out_put;
1873
1874         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1875                 ttrace->pfmaj++;
1876         else
1877                 ttrace->pfmin++;
1878
1879         if (trace->summary_only)
1880                 goto out;
1881
1882         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1883                               sample->ip, &al);
1884
1885         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1886
1887         fprintf(trace->output, "%sfault [",
1888                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1889                 "maj" : "min");
1890
1891         print_location(trace->output, sample, &al, false, true);
1892
1893         fprintf(trace->output, "] => ");
1894
1895         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1896                                    sample->addr, &al);
1897
1898         if (!al.map) {
1899                 thread__find_addr_location(thread, sample->cpumode,
1900                                            MAP__FUNCTION, sample->addr, &al);
1901
1902                 if (al.map)
1903                         map_type = 'x';
1904                 else
1905                         map_type = '?';
1906         }
1907
1908         print_location(trace->output, sample, &al, true, false);
1909
1910         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1911
1912         if (callchain_ret > 0)
1913                 trace__fprintf_callchain(trace, sample);
1914         else if (callchain_ret < 0)
1915                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1916 out:
1917         err = 0;
1918 out_put:
1919         thread__put(thread);
1920         return err;
1921 }
1922
1923 static void trace__set_base_time(struct trace *trace,
1924                                  struct perf_evsel *evsel,
1925                                  struct perf_sample *sample)
1926 {
1927         /*
1928          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1929          * and don't use sample->time unconditionally; we may end up having
1930          * some other event in the future without PERF_SAMPLE_TIME for a good
1931          * reason, i.e. we may not be interested in its timestamps, just in
1932          * the fact that it took place, picking up some piece of information
1933          * when it appears in our event stream (vfs_getname comes to mind).
1934          */
1935         if (trace->base_time == 0 && !trace->full_time &&
1936             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1937                 trace->base_time = sample->time;
1938 }
1939
1940 static int trace__process_sample(struct perf_tool *tool,
1941                                  union perf_event *event,
1942                                  struct perf_sample *sample,
1943                                  struct perf_evsel *evsel,
1944                                  struct machine *machine __maybe_unused)
1945 {
1946         struct trace *trace = container_of(tool, struct trace, tool);
1947         struct thread *thread;
1948         int err = 0;
1949
1950         tracepoint_handler handler = evsel->handler;
1951
1952         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1953         if (thread && thread__is_filtered(thread))
1954                 goto out;
1955
1956         trace__set_base_time(trace, evsel, sample);
1957
1958         if (handler) {
1959                 ++trace->nr_events;
1960                 handler(trace, evsel, event, sample);
1961         }
1962 out:
1963         thread__put(thread);
1964         return err;
1965 }
1966
1967 static int trace__record(struct trace *trace, int argc, const char **argv)
1968 {
1969         unsigned int rec_argc, i, j;
1970         const char **rec_argv;
1971         const char * const record_args[] = {
1972                 "record",
1973                 "-R",
1974                 "-m", "1024",
1975                 "-c", "1",
1976         };
1977
1978         const char * const sc_args[] = { "-e", };
1979         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1980         const char * const majpf_args[] = { "-e", "major-faults" };
1981         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1982         const char * const minpf_args[] = { "-e", "minor-faults" };
1983         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1984
1985         /* +1 is for the event string below */
1986         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1987                 majpf_args_nr + minpf_args_nr + argc;
1988         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1989
1990         if (rec_argv == NULL)
1991                 return -ENOMEM;
1992
1993         j = 0;
1994         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1995                 rec_argv[j++] = record_args[i];
1996
1997         if (trace->trace_syscalls) {
1998                 for (i = 0; i < sc_args_nr; i++)
1999                         rec_argv[j++] = sc_args[i];
2000
2001                 /* event string may be different for older kernels - e.g., RHEL6 */
2002                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2003                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2004                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2005                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2006                 else {
2007                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2008                         return -1;
2009                 }
2010         }
2011
2012         if (trace->trace_pgfaults & TRACE_PFMAJ)
2013                 for (i = 0; i < majpf_args_nr; i++)
2014                         rec_argv[j++] = majpf_args[i];
2015
2016         if (trace->trace_pgfaults & TRACE_PFMIN)
2017                 for (i = 0; i < minpf_args_nr; i++)
2018                         rec_argv[j++] = minpf_args[i];
2019
2020         for (i = 0; i < (unsigned int)argc; i++)
2021                 rec_argv[j++] = argv[i];
2022
2023         return cmd_record(j, rec_argv);
2024 }
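
/*
 * Editor's note (illustrative, not in the original source): with syscall and
 * page fault tracing enabled, the argv assembled above is equivalent to
 * running something like:
 *
 *	perf record -R -m 1024 -c 1 \
 *	     -e raw_syscalls:sys_enter,raw_syscalls:sys_exit \
 *	     -e major-faults -e minor-faults <workload>
 */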
2025
2026 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2027
2028 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2029 {
2030         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2031
2032         if (IS_ERR(evsel))
2033                 return false;
2034
2035         if (perf_evsel__field(evsel, "pathname") == NULL) {
2036                 perf_evsel__delete(evsel);
2037                 return false;
2038         }
2039
2040         evsel->handler = trace__vfs_getname;
2041         perf_evlist__add(evlist, evsel);
2042         return true;
2043 }
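
/*
 * Editor's note: probe:vfs_getname is a dynamic probe, so the lookup above
 * only succeeds if it was set up beforehand, e.g. with something like the
 * following (assumed example; the exact line number and variable name vary
 * between kernel versions):
 *
 *	perf probe 'vfs_getname=getname_flags:72 pathname=result->name:string'
 */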
2044
2045 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2046 {
2047         struct perf_evsel *evsel;
2048         struct perf_event_attr attr = {
2049                 .type = PERF_TYPE_SOFTWARE,
2050                 .mmap_data = 1,
2051         };
2052
2053         attr.config = config;
2054         attr.sample_period = 1;
2055
2056         event_attr_init(&attr);
2057
2058         evsel = perf_evsel__new(&attr);
2059         if (evsel)
2060                 evsel->handler = trace__pgfault;
2061
2062         return evsel;
2063 }
2064
2065 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2066 {
2067         const u32 type = event->header.type;
2068         struct perf_evsel *evsel;
2069
2070         if (type != PERF_RECORD_SAMPLE) {
2071                 trace__process_event(trace, trace->host, event, sample);
2072                 return;
2073         }
2074
2075         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2076         if (evsel == NULL) {
2077                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2078                 return;
2079         }
2080
2081         trace__set_base_time(trace, evsel, sample);
2082
2083         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2084             sample->raw_data == NULL) {
2085                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2086                        perf_evsel__name(evsel), sample->tid,
2087                        sample->cpu, sample->raw_size);
2088         } else {
2089                 tracepoint_handler handler = evsel->handler;
2090                 handler(trace, evsel, event, sample);
2091         }
2092 }
2093
2094 static int trace__add_syscall_newtp(struct trace *trace)
2095 {
2096         int ret = -1;
2097         struct perf_evlist *evlist = trace->evlist;
2098         struct perf_evsel *sys_enter, *sys_exit;
2099
2100         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2101         if (sys_enter == NULL)
2102                 goto out;
2103
2104         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2105                 goto out_delete_sys_enter;
2106
2107         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2108         if (sys_exit == NULL)
2109                 goto out_delete_sys_enter;
2110
2111         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2112                 goto out_delete_sys_exit;
2113
2114         perf_evlist__add(evlist, sys_enter);
2115         perf_evlist__add(evlist, sys_exit);
2116
2117         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2118                 /*
2119                  * We're interested only in the user space callchain
2120                  * leading to the syscall, allow overriding that for
2121                  * debugging reasons using --kernel_syscall_callchains
2122                  */
2123                 sys_exit->attr.exclude_callchain_kernel = 1;
2124         }
2125
2126         trace->syscalls.events.sys_enter = sys_enter;
2127         trace->syscalls.events.sys_exit  = sys_exit;
2128
2129         ret = 0;
2130 out:
2131         return ret;
2132
2133 out_delete_sys_exit:
2134         perf_evsel__delete_priv(sys_exit);
2135 out_delete_sys_enter:
2136         perf_evsel__delete_priv(sys_enter);
2137         goto out;
2138 }
2139
2140 static int trace__set_ev_qualifier_filter(struct trace *trace)
2141 {
2142         int err = -1;
2143         struct perf_evsel *sys_exit;
2144         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2145                                                 trace->ev_qualifier_ids.nr,
2146                                                 trace->ev_qualifier_ids.entries);
2147
2148         if (filter == NULL)
2149                 goto out_enomem;
2150
2151         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2152                                           filter)) {
2153                 sys_exit = trace->syscalls.events.sys_exit;
2154                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2155         }
2156
2157         free(filter);
2158 out:
2159         return err;
2160 out_enomem:
2161         errno = ENOMEM;
2162         goto out;
2163 }
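
/*
 * Editor's note (assumed example, not in the original source): for
 * 'perf trace -e open,close' on x86_64 the ids collected in
 * trace->ev_qualifier_ids would be {2, 3}, and asprintf_expr_inout_ints()
 * builds a tracepoint filter expression roughly of the form
 * "id == 2 || id == 3" (or "id != 2 && id != 3" when the list was negated
 * with '!'), which is then appended to both the sys_enter and sys_exit
 * evsels.
 */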
2164
2165 static int trace__run(struct trace *trace, int argc, const char **argv)
2166 {
2167         struct perf_evlist *evlist = trace->evlist;
2168         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2169         int err = -1, i;
2170         unsigned long before;
2171         const bool forks = argc > 0;
2172         bool draining = false;
2173
2174         trace->live = true;
2175
2176         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2177                 goto out_error_raw_syscalls;
2178
2179         if (trace->trace_syscalls)
2180                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2181
2182         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2183                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2184                 if (pgfault_maj == NULL)
2185                         goto out_error_mem;
2186                 perf_evlist__add(evlist, pgfault_maj);
2187         }
2188
2189         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2190                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2191                 if (pgfault_min == NULL)
2192                         goto out_error_mem;
2193                 perf_evlist__add(evlist, pgfault_min);
2194         }
2195
2196         if (trace->sched &&
2197             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2198                                    trace__sched_stat_runtime))
2199                 goto out_error_sched_stat_runtime;
2200
2201         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2202         if (err < 0) {
2203                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2204                 goto out_delete_evlist;
2205         }
2206
2207         err = trace__symbols_init(trace, evlist);
2208         if (err < 0) {
2209                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2210                 goto out_delete_evlist;
2211         }
2212
2213         perf_evlist__config(evlist, &trace->opts, NULL);
2214
2215         if (callchain_param.enabled) {
2216                 bool use_identifier = false;
2217
2218                 if (trace->syscalls.events.sys_exit) {
2219                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2220                                                      &trace->opts, &callchain_param);
2221                         use_identifier = true;
2222                 }
2223
2224                 if (pgfault_maj) {
2225                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2226                         use_identifier = true;
2227                 }
2228
2229                 if (pgfault_min) {
2230                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2231                         use_identifier = true;
2232                 }
2233
2234                 if (use_identifier) {
2235                        /*
2236                         * Now we have evsels with different sample_ids, use
2237                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2238                         * from a fixed position in each ring buffer record.
2239                         *
2240                         * As of the changeset introducing this comment, this
2241                         * isn't strictly needed, as the fields that can come before
2242                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2243                         * some of those for things like copying the payload of
2244                         * pointer syscall arguments, and for vfs_getname we don't
2245                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2246                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2247                         */
2248                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2249                         perf_evlist__reset_sample_bit(evlist, ID);
2250                 }
2251         }
2252
2253         signal(SIGCHLD, sig_handler);
2254         signal(SIGINT, sig_handler);
2255
2256         if (forks) {
2257                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2258                                                     argv, false, NULL);
2259                 if (err < 0) {
2260                         fprintf(trace->output, "Couldn't run the workload!\n");
2261                         goto out_delete_evlist;
2262                 }
2263         }
2264
2265         err = perf_evlist__open(evlist);
2266         if (err < 0)
2267                 goto out_error_open;
2268
2269         err = bpf__apply_obj_config();
2270         if (err) {
2271                 char errbuf[BUFSIZ];
2272
2273                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2274                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2275                          errbuf);
2276                 goto out_error_open;
2277         }
2278
2279         /*
2280          * Better not use !target__has_task() here because we need to cover the
2281          * case where no threads were specified in the command line, but a
2282          * workload was, and in that case we will fill in the thread_map when
2283          * we fork the workload in perf_evlist__prepare_workload.
2284          */
2285         if (trace->filter_pids.nr > 0)
2286                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2287         else if (thread_map__pid(evlist->threads, 0) == -1)
2288                 err = perf_evlist__set_filter_pid(evlist, getpid());
2289
2290         if (err < 0)
2291                 goto out_error_mem;
2292
2293         if (trace->ev_qualifier_ids.nr > 0) {
2294                 err = trace__set_ev_qualifier_filter(trace);
2295                 if (err < 0)
2296                         goto out_errno;
2297
2298                 pr_debug("event qualifier tracepoint filter: %s\n",
2299                          trace->syscalls.events.sys_exit->filter);
2300         }
2301
2302         err = perf_evlist__apply_filters(evlist, &evsel);
2303         if (err < 0)
2304                 goto out_error_apply_filters;
2305
2306         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2307         if (err < 0)
2308                 goto out_error_mmap;
2309
2310         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2311                 perf_evlist__enable(evlist);
2312
2313         if (forks)
2314                 perf_evlist__start_workload(evlist);
2315
2316         if (trace->opts.initial_delay) {
2317                 usleep(trace->opts.initial_delay * 1000);
2318                 perf_evlist__enable(evlist);
2319         }
2320
2321         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2322                                   evlist->threads->nr > 1 ||
2323                                   perf_evlist__first(evlist)->attr.inherit;
2324 again:
2325         before = trace->nr_events;
2326
2327         for (i = 0; i < evlist->nr_mmaps; i++) {
2328                 union perf_event *event;
2329
2330                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2331                         struct perf_sample sample;
2332
2333                         ++trace->nr_events;
2334
2335                         err = perf_evlist__parse_sample(evlist, event, &sample);
2336                         if (err) {
2337                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2338                                 goto next_event;
2339                         }
2340
2341                         trace__handle_event(trace, event, &sample);
2342 next_event:
2343                         perf_evlist__mmap_consume(evlist, i);
2344
2345                         if (interrupted)
2346                                 goto out_disable;
2347
2348                         if (done && !draining) {
2349                                 perf_evlist__disable(evlist);
2350                                 draining = true;
2351                         }
2352                 }
2353         }
2354
2355         if (trace->nr_events == before) {
2356                 int timeout = done ? 100 : -1;
2357
2358                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2359                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2360                                 draining = true;
2361
2362                         goto again;
2363                 }
2364         } else {
2365                 goto again;
2366         }
2367
2368 out_disable:
2369         thread__zput(trace->current);
2370
2371         perf_evlist__disable(evlist);
2372
2373         if (!err) {
2374                 if (trace->summary)
2375                         trace__fprintf_thread_summary(trace, trace->output);
2376
2377                 if (trace->show_tool_stats) {
2378                         fprintf(trace->output, "Stats:\n "
2379                                                " vfs_getname : %" PRIu64 "\n"
2380                                                " proc_getname: %" PRIu64 "\n",
2381                                 trace->stats.vfs_getname,
2382                                 trace->stats.proc_getname);
2383                 }
2384         }
2385
2386 out_delete_evlist:
2387         perf_evlist__delete(evlist);
2388         trace->evlist = NULL;
2389         trace->live = false;
2390         return err;
2391 {
2392         char errbuf[BUFSIZ];
2393
2394 out_error_sched_stat_runtime:
2395         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2396         goto out_error;
2397
2398 out_error_raw_syscalls:
2399         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2400         goto out_error;
2401
2402 out_error_mmap:
2403         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2404         goto out_error;
2405
2406 out_error_open:
2407         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2408
2409 out_error:
2410         fprintf(trace->output, "%s\n", errbuf);
2411         goto out_delete_evlist;
2412
2413 out_error_apply_filters:
2414         fprintf(trace->output,
2415                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2416                 evsel->filter, perf_evsel__name(evsel), errno,
2417                 str_error_r(errno, errbuf, sizeof(errbuf)));
2418         goto out_delete_evlist;
2419 }
2420 out_error_mem:
2421         fprintf(trace->output, "Not enough memory to run!\n");
2422         goto out_delete_evlist;
2423
2424 out_errno:
2425         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2426         goto out_delete_evlist;
2427 }
2428
2429 static int trace__replay(struct trace *trace)
2430 {
2431         const struct perf_evsel_str_handler handlers[] = {
2432                 { "probe:vfs_getname",       trace__vfs_getname, },
2433         };
2434         struct perf_data_file file = {
2435                 .path  = input_name,
2436                 .mode  = PERF_DATA_MODE_READ,
2437                 .force = trace->force,
2438         };
2439         struct perf_session *session;
2440         struct perf_evsel *evsel;
2441         int err = -1;
2442
2443         trace->tool.sample        = trace__process_sample;
2444         trace->tool.mmap          = perf_event__process_mmap;
2445         trace->tool.mmap2         = perf_event__process_mmap2;
2446         trace->tool.comm          = perf_event__process_comm;
2447         trace->tool.exit          = perf_event__process_exit;
2448         trace->tool.fork          = perf_event__process_fork;
2449         trace->tool.attr          = perf_event__process_attr;
2450         trace->tool.tracing_data  = perf_event__process_tracing_data;
2451         trace->tool.build_id      = perf_event__process_build_id;
2452         trace->tool.namespaces    = perf_event__process_namespaces;
2453
2454         trace->tool.ordered_events = true;
2455         trace->tool.ordering_requires_timestamps = true;
2456
2457         /* add tid to output */
2458         trace->multiple_threads = true;
2459
2460         session = perf_session__new(&file, false, &trace->tool);
2461         if (session == NULL)
2462                 return -1;
2463
2464         if (trace->opts.target.pid)
2465                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2466
2467         if (trace->opts.target.tid)
2468                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2469
2470         if (symbol__init(&session->header.env) < 0)
2471                 goto out;
2472
2473         trace->host = &session->machines.host;
2474
2475         err = perf_session__set_tracepoints_handlers(session, handlers);
2476         if (err)
2477                 goto out;
2478
2479         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2480                                                      "raw_syscalls:sys_enter");
2481         /* older kernels have syscalls tp versus raw_syscalls */
2482         if (evsel == NULL)
2483                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2484                                                              "syscalls:sys_enter");
2485
2486         if (evsel &&
2487             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2488             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2489                 pr_err("Error initializing the raw_syscalls:sys_enter event\n");
2490                 goto out;
2491         }
2492
2493         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2494                                                      "raw_syscalls:sys_exit");
2495         if (evsel == NULL)
2496                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2497                                                              "syscalls:sys_exit");
2498         if (evsel &&
2499             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2500             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2501                 pr_err("Error initializing the raw_syscalls:sys_exit event\n");
2502                 goto out;
2503         }
2504
2505         evlist__for_each_entry(session->evlist, evsel) {
2506                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2507                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2508                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2509                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2510                         evsel->handler = trace__pgfault;
2511         }
2512
2513         setup_pager();
2514
2515         err = perf_session__process_events(session);
2516         if (err)
2517                 pr_err("Failed to process events, error %d\n", err);
2518
2519         else if (trace->summary)
2520                 trace__fprintf_thread_summary(trace, trace->output);
2521
2522 out:
2523         perf_session__delete(session);
2524
2525         return err;
2526 }
2527
2528 static size_t trace__fprintf_threads_header(FILE *fp)
2529 {
2530         size_t printed;
2531
2532         printed  = fprintf(fp, "\n Summary of events:\n\n");
2533
2534         return printed;
2535 }
2536
2537 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2538         struct stats    *stats;
2539         double          msecs;
2540         int             syscall;
2541 )
2542 {
2543         struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2544         struct stats *stats = source->priv;
2545
2546         entry->syscall = source->i;
2547         entry->stats   = stats;
2548         entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2549 }
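
/*
 * Editor's note: DEFINE_RESORT_RB() (see rb_resort.h) generates the
 * DECLARE_RESORT_RB_INTLIST()/resort_rb__for_each_entry() helpers used in
 * thread__dump_stats() below to walk a copy of the per-thread syscall_stats
 * intlist re-sorted by the 'msecs' key computed here, instead of by syscall
 * id.
 */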
2550
2551 static size_t thread__dump_stats(struct thread_trace *ttrace,
2552                                  struct trace *trace, FILE *fp)
2553 {
2554         size_t printed = 0;
2555         struct syscall *sc;
2556         struct rb_node *nd;
2557         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2558
2559         if (syscall_stats == NULL)
2560                 return 0;
2561
2562         printed += fprintf(fp, "\n");
2563
2564         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2565         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2566         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2567
2568         resort_rb__for_each_entry(nd, syscall_stats) {
2569                 struct stats *stats = syscall_stats_entry->stats;
2570                 if (stats) {
2571                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2572                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2573                         double avg = avg_stats(stats);
2574                         double pct;
2575                         u64 n = (u64) stats->n;
2576
2577                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2578                         avg /= NSEC_PER_MSEC;
2579
2580                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2581                         printed += fprintf(fp, "   %-15s", sc->name);
2582                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2583                                            n, syscall_stats_entry->msecs, min, avg);
2584                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2585                 }
2586         }
2587
2588         resort_rb__delete(syscall_stats);
2589         printed += fprintf(fp, "\n\n");
2590
2591         return printed;
2592 }
2593
2594 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2595 {
2596         size_t printed = 0;
2597         struct thread_trace *ttrace = thread__priv(thread);
2598         double ratio;
2599
2600         if (ttrace == NULL)
2601                 return 0;
2602
2603         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2604
2605         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2606         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2607         printed += fprintf(fp, "%.1f%%", ratio);
2608         if (ttrace->pfmaj)
2609                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2610         if (ttrace->pfmin)
2611                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2612         if (trace->sched)
2613                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2614         else if (fputc('\n', fp) != EOF)
2615                 ++printed;
2616
2617         printed += thread__dump_stats(ttrace, trace, fp);
2618
2619         return printed;
2620 }
2621
2622 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2623 {
2624         return ttrace ? ttrace->nr_events : 0;
2625 }
2626
2627 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2628         struct thread *thread;
2629 )
2630 {
2631         entry->thread = rb_entry(nd, struct thread, rb_node);
2632 }
2633
2634 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2635 {
2636         DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2637         size_t printed = trace__fprintf_threads_header(fp);
2638         struct rb_node *nd;
2639
2640         if (threads == NULL) {
2641                 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2642                 return 0;
2643         }
2644
2645         resort_rb__for_each_entry(nd, threads)
2646                 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2647
2648         resort_rb__delete(threads);
2649
2650         return printed;
2651 }
2652
2653 static int trace__set_duration(const struct option *opt, const char *str,
2654                                int unset __maybe_unused)
2655 {
2656         struct trace *trace = opt->value;
2657
2658         trace->duration_filter = atof(str);
2659         return 0;
2660 }
2661
2662 static int trace__set_filter_pids(const struct option *opt, const char *str,
2663                                   int unset __maybe_unused)
2664 {
2665         int ret = -1;
2666         size_t i;
2667         struct trace *trace = opt->value;
2668         /*
2669          * FIXME: introduce a intarray class, plain parse csv and create a
2670          * { int nr, int entries[] } struct...
2671          */
2672         struct intlist *list = intlist__new(str);
2673
2674         if (list == NULL)
2675                 return -1;
2676
2677         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2678         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2679
2680         if (trace->filter_pids.entries == NULL)
2681                 goto out;
2682
2683         trace->filter_pids.entries[0] = getpid();
2684
2685         for (i = 1; i < trace->filter_pids.nr; ++i)
2686                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2687
2688         intlist__delete(list);
2689         ret = 0;
2690 out:
2691         return ret;
2692 }
2693
2694 static int trace__open_output(struct trace *trace, const char *filename)
2695 {
2696         struct stat st;
2697
2698         if (!stat(filename, &st) && st.st_size) {
2699                 char oldname[PATH_MAX];
2700
2701                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2702                 unlink(oldname);
2703                 rename(filename, oldname);
2704         }
2705
2706         trace->output = fopen(filename, "w");
2707
2708         return trace->output == NULL ? -errno : 0;
2709 }
2710
2711 static int parse_pagefaults(const struct option *opt, const char *str,
2712                             int unset __maybe_unused)
2713 {
2714         int *trace_pgfaults = opt->value;
2715
2716         if (strcmp(str, "all") == 0)
2717                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2718         else if (strcmp(str, "maj") == 0)
2719                 *trace_pgfaults |= TRACE_PFMAJ;
2720         else if (strcmp(str, "min") == 0)
2721                 *trace_pgfaults |= TRACE_PFMIN;
2722         else
2723                 return -1;
2724
2725         return 0;
2726 }
2727
2728 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2729 {
2730         struct perf_evsel *evsel;
2731
2732         evlist__for_each_entry(evlist, evsel)
2733                 evsel->handler = handler;
2734 }
2735
2736 /*
2737  * XXX: Hackish, just splitting the combined -e/--event list into syscalls
2738  * (raw_syscalls:sys_{enter,exit}) and other events (tracepoints, HW, SW, etc.)
2739  * so the existing facilities are used unchanged (trace->ev_qualifier + parse_options()).
2740  *
2741  * It'd be better to introduce a parse_options() variant that would return a
2742  * list with the terms it didn't match to an event...
2743  */
2744 static int trace__parse_events_option(const struct option *opt, const char *str,
2745                                       int unset __maybe_unused)
2746 {
2747         struct trace *trace = (struct trace *)opt->value;
2748         const char *s = str;
2749         char *sep = NULL, *lists[2] = { NULL, NULL, };
2750         int len = strlen(str), err = -1, list;
2751         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2752         char group_name[PATH_MAX];
2753
2754         if (strace_groups_dir == NULL)
2755                 return -1;
2756
2757         if (*s == '!') {
2758                 ++s;
2759                 trace->not_ev_qualifier = true;
2760         }
2761
2762         while (1) {
2763                 if ((sep = strchr(s, ',')) != NULL)
2764                         *sep = '\0';
2765
2766                 list = 0;
2767                 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2768                         list = 1;
2769                 } else {
2770                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2771                         if (access(group_name, R_OK) == 0)
2772                                 list = 1;
2773                 }
2774
2775                 if (lists[list]) {
2776                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2777                 } else {
2778                         lists[list] = malloc(len + 1); /* +1 for the NUL terminator */
2779                         if (lists[list] == NULL)
2780                                 goto out;
2781                         strcpy(lists[list], s);
2782                 }
2783
2784                 if (!sep)
2785                         break;
2786
2787                 *sep = ',';
2788                 s = sep + 1;
2789         }
2790
2791         if (lists[1] != NULL) {
2792                 struct strlist_config slist_config = {
2793                         .dirname = strace_groups_dir,
2794                 };
2795
2796                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2797                 if (trace->ev_qualifier == NULL) {
2798                         fputs("Not enough memory to parse event qualifier\n", trace->output);
2799                         goto out;
2800                 }
2801
2802                 if (trace__validate_ev_qualifier(trace))
2803                         goto out;
2804         }
2805
2806         err = 0;
2807
2808         if (lists[0]) {
2809                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2810                                                "event selector. use 'perf list' to list available events",
2811                                                parse_events_option);
2812                 err = parse_events_option(&o, lists[0], 0);
2813         }
2814 out:
2815         if (sep)
2816                 *sep = ',';
2817
2818         return err;
2819 }
2820
2821 int cmd_trace(int argc, const char **argv)
2822 {
2823         const char *trace_usage[] = {
2824                 "perf trace [<options>] [<command>]",
2825                 "perf trace [<options>] -- <command> [<options>]",
2826                 "perf trace record [<options>] [<command>]",
2827                 "perf trace record [<options>] -- <command> [<options>]",
2828                 NULL
2829         };
2830         struct trace trace = {
2831                 .syscalls = {
2832                         .max = -1,
2833                 },
2834                 .opts = {
2835                         .target = {
2836                                 .uid       = UINT_MAX,
2837                                 .uses_mmap = true,
2838                         },
2839                         .user_freq     = UINT_MAX,
2840                         .user_interval = ULLONG_MAX,
2841                         .no_buffering  = true,
2842                         .mmap_pages    = UINT_MAX,
2843                         .proc_map_timeout  = 500,
2844                 },
2845                 .output = stderr,
2846                 .show_comm = true,
2847                 .trace_syscalls = true,
2848                 .kernel_syscallchains = false,
2849                 .max_stack = UINT_MAX,
2850         };
2851         const char *output_name = NULL;
2852         const struct option trace_options[] = {
2853         OPT_CALLBACK('e', "event", &trace, "event",
2854                      "event/syscall selector. use 'perf list' to list available events",
2855                      trace__parse_events_option),
2856         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2857                     "show the thread COMM next to its id"),
2858         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2859         OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2860                      trace__parse_events_option),
2861         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2862         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2863         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2864                     "trace events on existing process id"),
2865         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2866                     "trace events on existing thread id"),
2867         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2868                      "pids to filter (by the kernel)", trace__set_filter_pids),
2869         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2870                     "system-wide collection from all CPUs"),
2871         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2872                     "list of cpus to monitor"),
2873         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2874                     "child tasks do not inherit counters"),
2875         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2876                      "number of mmap data pages",
2877                      perf_evlist__parse_mmap_pages),
2878         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2879                    "user to profile"),
2880         OPT_CALLBACK(0, "duration", &trace, "float",
2881                      "show only events with duration > N.M ms",
2882                      trace__set_duration),
2883         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2884         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2885         OPT_BOOLEAN('T', "time", &trace.full_time,
2886                     "Show full timestamp, not time relative to first start"),
2887         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2888                     "Show only syscall summary with statistics"),
2889         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2890                     "Show all syscalls and summary with statistics"),
2891         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2892                      "Trace pagefaults", parse_pagefaults, "maj"),
2893         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2894         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2895         OPT_CALLBACK(0, "call-graph", &trace.opts,
2896                      "record_mode[,record_size]", record_callchain_help,
2897                      &record_parse_callchain_opt),
2898         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2899                     "Show the kernel callchains on the syscall exit path"),
2900         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2901                      "Set the minimum stack depth when parsing the callchain, "
2902                      "anything below the specified depth will be ignored."),
2903         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2904                      "Set the maximum stack depth when parsing the callchain, "
2905                      "anything beyond the specified depth will be ignored. "
2906                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2907         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2908                         "per thread proc mmap processing timeout in ms"),
2909         OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2910                      "ms to wait before starting measurement after program "
2911                      "start"),
2912         OPT_END()
2913         };
2914         bool __maybe_unused max_stack_user_set = true;
2915         bool mmap_pages_user_set = true;
2916         const char * const trace_subcommands[] = { "record", NULL };
2917         int err;
2918         char bf[BUFSIZ];
2919
2920         signal(SIGSEGV, sighandler_dump_stack);
2921         signal(SIGFPE, sighandler_dump_stack);
2922
2923         trace.evlist = perf_evlist__new();
2924         trace.sctbl = syscalltbl__new();
2925
2926         if (trace.evlist == NULL || trace.sctbl == NULL) {
2927                 pr_err("Not enough memory to run!\n");
2928                 err = -ENOMEM;
2929                 goto out;
2930         }
2931
2932         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2933                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2934
2935         err = bpf__setup_stdout(trace.evlist);
2936         if (err) {
2937                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2938                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2939                 goto out;
2940         }
2941
2942         err = -1;
2943
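        /*
         * Page fault tracing wants the faulting address and a timestamp in
         * each sample.
         */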
2944         if (trace.trace_pgfaults) {
2945                 trace.opts.sample_address = true;
2946                 trace.opts.sample_time = true;
2947         }
2948
2949         if (trace.opts.mmap_pages == UINT_MAX)
2950                 mmap_pages_user_set = false;
2951
2952         if (trace.max_stack == UINT_MAX) {
2953                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2954                 max_stack_user_set = false;
2955         }
2956
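        /*
         * If the user tweaked the stack depth limits but didn't explicitly
         * enable callchains, and syscalls are being traced, default to DWARF
         * unwinding when it is supported.
         */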
2957 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2958         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2959                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2960 #endif
2961
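        /*
         * Callchains need bigger ring buffers: when running as root and the
         * user didn't set --mmap-pages, use four times the perf_event_mlock_kb
         * limit, in pages.
         */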
2962         if (callchain_param.enabled) {
2963                 if (!mmap_pages_user_set && geteuid() == 0)
2964                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2965
2966                 symbol_conf.use_callchain = true;
2967         }
2968
2969         if (trace.evlist->nr_entries > 0)
2970                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2971
2972         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2973                 return trace__record(&trace, argc-1, &argv[1]);
2974
2975         /* summary_only implies summary option, but don't overwrite summary if set */
2976         if (trace.summary_only)
2977                 trace.summary = trace.summary_only;
2978
2979         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2980             trace.evlist->nr_entries == 0 /* Was --events used? */) {
2981                 pr_err("Please specify something to trace.\n");
2982                 return -1;
2983         }
2984
2985         if (!trace.trace_syscalls && trace.ev_qualifier) {
2986                 pr_err("The -e option can't be used with --no-syscalls.\n");
2987                 goto out;
2988         }
2989
2990         if (output_name != NULL) {
2991                 err = trace__open_output(&trace, output_name);
2992                 if (err < 0) {
2993                         perror("failed to create output file");
2994                         goto out;
2995                 }
2996         }
2997
2998         trace.open_id = syscalltbl__id(trace.sctbl, "open");
2999
3000         err = target__validate(&trace.opts.target);
3001         if (err) {
3002                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3003                 fprintf(trace.output, "%s", bf);
3004                 goto out_close;
3005         }
3006
3007         err = target__parse_uid(&trace.opts.target);
3008         if (err) {
3009                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3010                 fprintf(trace.output, "%s", bf);
3011                 goto out_close;
3012         }
3013
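        /*
         * No workload on the command line and no pid/tid/cpu/uid target:
         * fall back to system wide tracing.
         */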
3014         if (!argc && target__none(&trace.opts.target))
3015                 trace.opts.target.system_wide = true;
3016
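        /* With -i/--input, replay events from a perf.data file; otherwise trace live. */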
3017         if (input_name)
3018                 err = trace__replay(&trace);
3019         else
3020                 err = trace__run(&trace, argc, argv);
3021
3022 out_close:
3023         if (output_name != NULL)
3024                 fclose(trace.output);
3025 out:
3026         return err;
3027 }