/*
 * Source: tools/perf/builtin-trace.c (karo-tx-linux.git mirror),
 * at the commit "perf tools: Add include <linux/kernel.h> where
 * ARRAY_SIZE() is used".
 */
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace/beauty/beauty.h"
35 #include "trace-event.h"
36 #include "util/parse-events.h"
37 #include "util/bpf-loader.h"
38 #include "callchain.h"
39 #include "syscalltbl.h"
40 #include "rb_resort.h"
41
42 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
43 #include <stdlib.h>
44 #include <string.h>
45 #include <linux/err.h>
46 #include <linux/filter.h>
47 #include <linux/audit.h>
48 #include <linux/kernel.h>
49 #include <linux/random.h>
50 #include <linux/stringify.h>
51 #include <linux/time64.h>
52
/* Fallback for libcs whose fcntl.h does not define O_CLOEXEC. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif
56
/*
 * State for one 'perf trace' session: the perf_tool callbacks, the
 * syscall table and the raw_syscalls evsels, the evlist/machine being
 * monitored, the -e/--pid filters, accumulated stats, and the many
 * behavior flags set from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* upper bound for 'table' indexing */
		struct syscall	*table;
		struct {	/* the sys_enter/sys_exit tracepoint evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;	/* where formatted events are written */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e: names of syscalls of interest */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids to be filtered */
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {	/* tool self-statistics, shown with --tool-stats */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier lists syscalls to *skip* */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;	/* NOTE(review): presumably the cached id of open(2) — confirm at use sites */
};
108
/*
 * Accessor for one tracepoint field: 'offset' locates the field inside
 * the sample's raw payload; the union holds the fetcher used to read
 * it, either as a widened integer or as a pointer into the payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/*
 * Generate tp_field__u{8,16,32,64}(): read an unsigned integer of the
 * given width from sample->raw_data at field->offset.  The value is
 * copied out with memcpy since the payload may not be suitably aligned
 * for a direct load.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Byte-swapping variants, selected by tp_field__init_uint() when the
 * recorded data's endianness differs from the host's.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
141
142 static int tp_field__init_uint(struct tp_field *field,
143                                struct format_field *format_field,
144                                bool needs_swap)
145 {
146         field->offset = format_field->offset;
147
148         switch (format_field->size) {
149         case 1:
150                 field->integer = tp_field__u8;
151                 break;
152         case 2:
153                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
154                 break;
155         case 4:
156                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
157                 break;
158         case 8:
159                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
160                 break;
161         default:
162                 return -1;
163         }
164
165         return 0;
166 }
167
168 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
169 {
170         return sample->raw_data + field->offset;
171 }
172
173 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
174 {
175         field->offset = format_field->offset;
176         field->pointer = tp_field__ptr;
177         return 0;
178 }
179
/*
 * Fields of interest in the raw_syscalls tracepoints: 'id' is the
 * syscall number; the union overlays the remaining field — 'args' for
 * sys_enter, 'ret' for sys_exit.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
186
187 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
188                                           struct tp_field *field,
189                                           const char *name)
190 {
191         struct format_field *format_field = perf_evsel__field(evsel, name);
192
193         if (format_field == NULL)
194                 return -1;
195
196         return tp_field__init_uint(field, format_field, evsel->needs_swap);
197 }
198
/*
 * Bind the syscall_tp member 'name' (id/args/ret) of evsel->priv to the
 * tracepoint format field of the same name, as an integer.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
202
/*
 * Look up the named format field on the evsel's tracepoint and bind it
 * as a pointer tp_field.  Returns -1 when the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
214
/*
 * Bind the syscall_tp member 'name' of evsel->priv to the tracepoint
 * format field of the same name, fetched as a pointer into the payload.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
218
/* Free the evsel's trace-private payload, then delete the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
224
225 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
226 {
227         evsel->priv = malloc(sizeof(struct syscall_tp));
228         if (evsel->priv != NULL) {
229                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
230                         goto out_delete;
231
232                 evsel->handler = handler;
233                 return 0;
234         }
235
236         return -ENOMEM;
237
238 out_delete:
239         zfree(&evsel->priv);
240         return -ENOENT;
241 }
242
243 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
244 {
245         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
246
247         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
248         if (IS_ERR(evsel))
249                 evsel = perf_evsel__newtp("syscalls", direction);
250
251         if (IS_ERR(evsel))
252                 return NULL;
253
254         if (perf_evsel__init_syscall_tp(evsel, handler))
255                 goto out_delete;
256
257         return evsel;
258
259 out_delete:
260         perf_evsel__delete_priv(evsel);
261         return NULL;
262 }
263
/* Fetch syscall_tp member 'name' from a sample, as integer or pointer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

/*
 * Table of names for enum-like syscall arguments, indexed by
 * (value - offset).
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* As DEFINE_STRARRAY, for tables whose first entry maps to value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset     = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
288
289 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
290                                                 const char *intfmt,
291                                                 struct syscall_arg *arg)
292 {
293         struct strarray *sa = arg->parm;
294         int idx = arg->val - sa->offset;
295
296         if (idx < 0 || idx >= sa->nr_entries)
297                 return scnprintf(bf, size, intfmt, arg->val);
298
299         return scnprintf(bf, size, "%s", sa->entries[idx]);
300 }
301
/* Default strarray printer: out-of-range values print as decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
/* strarray printer whose out-of-range fallback is hex. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */

/* Forward declaration; the fd beautifier is defined elsewhere in this file. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

/* Fallback for libcs whose headers lack the *at() sentinel. */
#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif
332
333 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
334                                            struct syscall_arg *arg)
335 {
336         int fd = arg->val;
337
338         if (fd == AT_FDCWD)
339                 return scnprintf(bf, size, "CWD");
340
341         return syscall_arg__scnprintf_fd(bf, size, arg);
342 }
343
344 #define SCA_FDAT syscall_arg__scnprintf_fd_at
345
/* Forward declaration; close(2)'s fd beautifier is defined elsewhere in this file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

/* Print an argument in hex — used for addresses and opaque values. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
358
359 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
360                                          struct syscall_arg *arg)
361 {
362         return scnprintf(bf, size, "%d", arg->val);
363 }
364
365 #define SCA_INT syscall_arg__scnprintf_int
366
/* bpf(2) command names, indexed by enum bpf_cmd value. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) op names; EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which' names (ITIMER_*). */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) operation names (KEYCTL_*). */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence names; the last two only if the libc defines them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) command names (F_*). */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* {get,set}rlimit(2)/prlimit64(2) resource names (RLIMIT_*). */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how' names (SIG_*). */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) & friends' clockid names (CLOCK_*). */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) address family names (AF_*). */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
432
/*
 * Decode access(2)'s mode argument: F_OK (0) prints as "F", otherwise
 * a run of R/W/X letters for the set bits, with any leftover unknown
 * bits appended in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
459
/* Forward declaration; the filename beautifier is defined elsewhere in this file. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
464
/*
 * Decode pipe2(2)'s flags: known O_* bits print by name, '|'-separated,
 * with any remaining unknown bits appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
487
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * Names for the terminal ioctl cmd range starting at TCGETS (0x5401);
 * the designated indices ([0x27], [0x50], [0x60]) jump over holes in
 * the cmd number space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

/* Fallbacks for libcs that don't yet expose getrandom(2)'s flags. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif
521
/*
 * Decode getrandom(2)'s flags: known GRND_* bits print by name,
 * '|'-separated, with any remaining unknown bits appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

/*
 * Shorthand for syscall_fmts[] entries: beautify argument index 'arg'
 * through the given strarray.  'name' is not expanded — it documents
 * the argument at the use site only.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm      = { [arg] = &strarray__##array, }
548
549 #include "trace/beauty/eventfd.c"
550 #include "trace/beauty/flock.c"
551 #include "trace/beauty/futex_op.c"
552 #include "trace/beauty/mmap.c"
553 #include "trace/beauty/mode_t.c"
554 #include "trace/beauty/msg_flags.c"
555 #include "trace/beauty/open_flags.c"
556 #include "trace/beauty/perf_event_open.c"
557 #include "trace/beauty/pid.c"
558 #include "trace/beauty/sched_policy.c"
559 #include "trace/beauty/seccomp.c"
560 #include "trace/beauty/signum.c"
561 #include "trace/beauty/socket_type.c"
562 #include "trace/beauty/waitid_options.c"
563
564 static struct syscall_fmt {
565         const char *name;
566         const char *alias;
567         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
568         void       *arg_parm[6];
569         bool       errmsg;
570         bool       errpid;
571         bool       timeout;
572         bool       hexret;
573 } syscall_fmts[] = {
574         { .name     = "access",     .errmsg = true,
575           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
576         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
577         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
578         { .name     = "brk",        .hexret = true,
579           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
580         { .name     = "chdir",      .errmsg = true, },
581         { .name     = "chmod",      .errmsg = true, },
582         { .name     = "chroot",     .errmsg = true, },
583         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
584         { .name     = "clone",      .errpid = true, },
585         { .name     = "close",      .errmsg = true,
586           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
587         { .name     = "connect",    .errmsg = true, },
588         { .name     = "creat",      .errmsg = true, },
589         { .name     = "dup",        .errmsg = true, },
590         { .name     = "dup2",       .errmsg = true, },
591         { .name     = "dup3",       .errmsg = true, },
592         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
593         { .name     = "eventfd2",   .errmsg = true,
594           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
595         { .name     = "faccessat",  .errmsg = true, },
596         { .name     = "fadvise64",  .errmsg = true, },
597         { .name     = "fallocate",  .errmsg = true, },
598         { .name     = "fchdir",     .errmsg = true, },
599         { .name     = "fchmod",     .errmsg = true, },
600         { .name     = "fchmodat",   .errmsg = true,
601           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
602         { .name     = "fchown",     .errmsg = true, },
603         { .name     = "fchownat",   .errmsg = true,
604           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
605         { .name     = "fcntl",      .errmsg = true,
606           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
607           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
608         { .name     = "fdatasync",  .errmsg = true, },
609         { .name     = "flock",      .errmsg = true,
610           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
611         { .name     = "fsetxattr",  .errmsg = true, },
612         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
613         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
614         { .name     = "fstatfs",    .errmsg = true, },
615         { .name     = "fsync",    .errmsg = true, },
616         { .name     = "ftruncate", .errmsg = true, },
617         { .name     = "futex",      .errmsg = true,
618           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
619         { .name     = "futimesat", .errmsg = true,
620           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
621         { .name     = "getdents",   .errmsg = true, },
622         { .name     = "getdents64", .errmsg = true, },
623         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
624         { .name     = "getpid",     .errpid = true, },
625         { .name     = "getpgid",    .errpid = true, },
626         { .name     = "getppid",    .errpid = true, },
627         { .name     = "getrandom",  .errmsg = true,
628           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
629         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
630         { .name     = "getxattr",   .errmsg = true, },
631         { .name     = "inotify_add_watch",          .errmsg = true, },
632         { .name     = "ioctl",      .errmsg = true,
633           .arg_scnprintf = {
634 #if defined(__i386__) || defined(__x86_64__)
635 /*
636  * FIXME: Make this available to all arches.
637  */
638                              [1] = SCA_STRHEXARRAY, /* cmd */
639                              [2] = SCA_HEX, /* arg */ },
640           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
641 #else
642                              [2] = SCA_HEX, /* arg */ }, },
643 #endif
644         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
645         { .name     = "kill",       .errmsg = true,
646           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
647         { .name     = "lchown",    .errmsg = true, },
648         { .name     = "lgetxattr",  .errmsg = true, },
649         { .name     = "linkat",     .errmsg = true,
650           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
651         { .name     = "listxattr",  .errmsg = true, },
652         { .name     = "llistxattr", .errmsg = true, },
653         { .name     = "lremovexattr",  .errmsg = true, },
654         { .name     = "lseek",      .errmsg = true,
655           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
656           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
657         { .name     = "lsetxattr",  .errmsg = true, },
658         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
659         { .name     = "lsxattr",    .errmsg = true, },
660         { .name     = "madvise",    .errmsg = true,
661           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
662                              [2] = SCA_MADV_BHV, /* behavior */ }, },
663         { .name     = "mkdir",    .errmsg = true, },
664         { .name     = "mkdirat",    .errmsg = true,
665           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
666         { .name     = "mknod",      .errmsg = true, },
667         { .name     = "mknodat",    .errmsg = true,
668           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
669         { .name     = "mlock",      .errmsg = true,
670           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
671         { .name     = "mlockall",   .errmsg = true,
672           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
673         { .name     = "mmap",       .hexret = true,
674           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
675                              [2] = SCA_MMAP_PROT, /* prot */
676                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
677         { .name     = "mprotect",   .errmsg = true,
678           .arg_scnprintf = { [0] = SCA_HEX, /* start */
679                              [2] = SCA_MMAP_PROT, /* prot */ }, },
680         { .name     = "mq_unlink", .errmsg = true,
681           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
682         { .name     = "mremap",     .hexret = true,
683           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
684                              [3] = SCA_MREMAP_FLAGS, /* flags */
685                              [4] = SCA_HEX, /* new_addr */ }, },
686         { .name     = "munlock",    .errmsg = true,
687           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
688         { .name     = "munmap",     .errmsg = true,
689           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
690         { .name     = "name_to_handle_at", .errmsg = true,
691           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
692         { .name     = "newfstatat", .errmsg = true,
693           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
694         { .name     = "open",       .errmsg = true,
695           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
696         { .name     = "open_by_handle_at", .errmsg = true,
697           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
698                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
699         { .name     = "openat",     .errmsg = true,
700           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
701                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
702         { .name     = "perf_event_open", .errmsg = true,
703           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
704                              [3] = SCA_FD,  /* group_fd */
705                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
706         { .name     = "pipe2",      .errmsg = true,
707           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
708         { .name     = "poll",       .errmsg = true, .timeout = true, },
709         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
710         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
711         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
712         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
713         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
714         { .name     = "pwritev",    .errmsg = true, },
715         { .name     = "read",       .errmsg = true, },
716         { .name     = "readlink",   .errmsg = true, },
717         { .name     = "readlinkat", .errmsg = true,
718           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
719         { .name     = "readv",      .errmsg = true, },
720         { .name     = "recvfrom",   .errmsg = true,
721           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
722         { .name     = "recvmmsg",   .errmsg = true,
723           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
724         { .name     = "recvmsg",    .errmsg = true,
725           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
726         { .name     = "removexattr", .errmsg = true, },
727         { .name     = "renameat",   .errmsg = true,
728           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
729         { .name     = "rmdir",    .errmsg = true, },
730         { .name     = "rt_sigaction", .errmsg = true,
731           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
732         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
733         { .name     = "rt_sigqueueinfo", .errmsg = true,
734           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
735         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
736           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
737         { .name     = "sched_getattr",        .errmsg = true, },
738         { .name     = "sched_setattr",        .errmsg = true, },
739         { .name     = "sched_setscheduler",   .errmsg = true,
740           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
741         { .name     = "seccomp", .errmsg = true,
742           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
743                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
744         { .name     = "select",     .errmsg = true, .timeout = true, },
745         { .name     = "sendmmsg",    .errmsg = true,
746           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
747         { .name     = "sendmsg",    .errmsg = true,
748           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
749         { .name     = "sendto",     .errmsg = true,
750           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
751         { .name     = "set_tid_address", .errpid = true, },
752         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
753         { .name     = "setpgid",    .errmsg = true, },
754         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
755         { .name     = "setxattr",   .errmsg = true, },
756         { .name     = "shutdown",   .errmsg = true, },
757         { .name     = "socket",     .errmsg = true,
758           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
759                              [1] = SCA_SK_TYPE, /* type */ },
760           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
761         { .name     = "socketpair", .errmsg = true,
762           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
763                              [1] = SCA_SK_TYPE, /* type */ },
764           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
765         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
766         { .name     = "statfs",     .errmsg = true, },
767         { .name     = "statx",      .errmsg = true,
768           .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
769                              [2] = SCA_STATX_FLAGS, /* flags */
770                              [3] = SCA_STATX_MASK, /* mask */ }, },
771         { .name     = "swapoff",    .errmsg = true,
772           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
773         { .name     = "swapon",     .errmsg = true,
774           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
775         { .name     = "symlinkat",  .errmsg = true,
776           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
777         { .name     = "tgkill",     .errmsg = true,
778           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
779         { .name     = "tkill",      .errmsg = true,
780           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
781         { .name     = "truncate",   .errmsg = true, },
782         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
783         { .name     = "unlinkat",   .errmsg = true,
784           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
785         { .name     = "utime",  .errmsg = true, },
786         { .name     = "utimensat",  .errmsg = true,
787           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
788         { .name     = "utimes",  .errmsg = true, },
789         { .name     = "vmsplice",  .errmsg = true, },
790         { .name     = "wait4",      .errpid = true,
791           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
792         { .name     = "waitid",     .errpid = true,
793           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
794         { .name     = "write",      .errmsg = true, },
795         { .name     = "writev",     .errmsg = true, },
796 };
797
798 static int syscall_fmt__cmp(const void *name, const void *fmtp)
799 {
800         const struct syscall_fmt *fmt = fmtp;
801         return strcmp(name, fmt->name);
802 }
803
804 static struct syscall_fmt *syscall_fmt__find(const char *name)
805 {
806         const int nmemb = ARRAY_SIZE(syscall_fmts);
807         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
808 }
809
/*
 * Per-syscall descriptor, lazily filled in by trace__read_syscall_info()
 * from the syscalls:sys_enter_* tracepoint format and the syscall_fmts[]
 * table.
 */
struct syscall {
        struct event_format *tp_format;  /* parsed sys_enter tracepoint format */
        int                 nr_args;     /* arg count, minus the leading syscall-nr field */
        struct format_field *args;       /* first real argument field */
        const char          *name;       /* canonical name from the syscall table */
        bool                is_exit;     /* exit/exit_group: no sys_exit will follow */
        struct syscall_fmt  *fmt;        /* formatting overrides, may be NULL */
        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); /* per-arg beautifiers */
        void                **arg_parm;  /* per-arg opaque parm handed to the beautifiers */
};
820
821 /*
822  * We need to have this 'calculated' boolean because in some cases we really
823  * don't know what is the duration of a syscall, for instance, when we start
824  * a session and some threads are waiting for a syscall to finish, say 'poll',
825  * in which case all we can do is to print "( ? ) for duration and for the
826  * start timestamp.
827  */
828 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
829 {
830         double duration = (double)t / NSEC_PER_MSEC;
831         size_t printed = fprintf(fp, "(");
832
833         if (!calculated)
834                 printed += fprintf(fp, "     ?   ");
835         else if (duration >= 1.0)
836                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
837         else if (duration >= 0.01)
838                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
839         else
840                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
841         return printed + fprintf(fp, "): ");
842 }
843
844 /**
845  * filename.ptr: The filename char pointer that will be vfs_getname'd
846  * filename.entry_str_pos: Where to insert the string translated from
847  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
848  */
struct thread_trace {
        u64               entry_time;    /* timestamp of the last sys_enter */
        bool              entry_pending; /* sys_enter formatted, awaiting its sys_exit */
        unsigned long     nr_events;     /* samples seen for this thread */
        unsigned long     pfmaj, pfmin;  /* major/minor page fault counts */
        char              *entry_str;    /* buffer where the sys_enter line is formatted */
        double            runtime_ms;    /* accumulated runtime, in ms (presumably from sched events — verify) */
        struct {
                unsigned long ptr;           /* see comment above */
                short int     entry_str_pos; /* see comment above */
                bool          pending_open;  /* open-like syscall awaiting vfs_getname */
                unsigned int  namelen;
                char          *name;         /* last vfs_getname'd filename */
        } filename;
        struct {
                int       max;     /* highest fd with a cached path, -1 when empty */
                char      **table; /* fd -> pathname cache, grown on demand */
        } paths;

        struct intlist *syscall_stats; /* per-syscall-id stats, priv is a struct stats */
};
870
871 static struct thread_trace *thread_trace__new(void)
872 {
873         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
874
875         if (ttrace)
876                 ttrace->paths.max = -1;
877
878         ttrace->syscall_stats = intlist__new(NULL);
879
880         return ttrace;
881 }
882
883 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
884 {
885         struct thread_trace *ttrace;
886
887         if (thread == NULL)
888                 goto fail;
889
890         if (thread__priv(thread) == NULL)
891                 thread__set_priv(thread, thread_trace__new());
892
893         if (thread__priv(thread) == NULL)
894                 goto fail;
895
896         ttrace = thread__priv(thread);
897         ++ttrace->nr_events;
898
899         return ttrace;
900 fail:
901         color_fprintf(fp, PERF_COLOR_RED,
902                       "WARNING: not enough memory, dropping samples!\n");
903         return NULL;
904 }
905
/* Flag bits selecting which page fault kinds to trace */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/* Size of the per-thread buffer where the sys_enter line is formatted */
static const size_t trace__entry_str_size = 2048;
910
911 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
912 {
913         struct thread_trace *ttrace = thread__priv(thread);
914
915         if (fd > ttrace->paths.max) {
916                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
917
918                 if (npath == NULL)
919                         return -1;
920
921                 if (ttrace->paths.max != -1) {
922                         memset(npath + ttrace->paths.max + 1, 0,
923                                (fd - ttrace->paths.max) * sizeof(char *));
924                 } else {
925                         memset(npath, 0, (fd + 1) * sizeof(char *));
926                 }
927
928                 ttrace->paths.table = npath;
929                 ttrace->paths.max   = fd;
930         }
931
932         ttrace->paths.table[fd] = strdup(pathname);
933
934         return ttrace->paths.table[fd] != NULL ? 0 : -1;
935 }
936
/*
 * Resolve 'fd' to a pathname via /proc and cache it with
 * trace__set_fd_pathname(). Returns 0 on success, -1 on failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
        char linkname[PATH_MAX], pathname[PATH_MAX];
        struct stat st;
        int ret;

        /*
         * Thread group leaders have their fd table at /proc/pid/fd, other
         * threads at /proc/pid/task/tid/fd.
         */
        if (thread->pid_ == thread->tid) {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/fd/%d", thread->pid_, fd);
        } else {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
        }

        /* Bail out if the link target wouldn't fit in pathname[] */
        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
                return -1;

        ret = readlink(linkname, pathname, sizeof(pathname));

        /*
         * The link may have changed between lstat() and readlink(); treat a
         * target longer than what lstat() reported as a failure.
         */
        if (ret < 0 || ret > st.st_size)
                return -1;

        pathname[ret] = '\0'; /* readlink() doesn't NUL terminate */
        return trace__set_fd_pathname(thread, fd, pathname);
}
962
963 static const char *thread__fd_path(struct thread *thread, int fd,
964                                    struct trace *trace)
965 {
966         struct thread_trace *ttrace = thread__priv(thread);
967
968         if (ttrace == NULL)
969                 return NULL;
970
971         if (fd < 0)
972                 return NULL;
973
974         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
975                 if (!trace->live)
976                         return NULL;
977                 ++trace->stats.proc_getname;
978                 if (thread__read_fd_path(thread, fd))
979                         return NULL;
980         }
981
982         return ttrace->paths.table[fd];
983 }
984
985 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
986                                         struct syscall_arg *arg)
987 {
988         int fd = arg->val;
989         size_t printed = scnprintf(bf, size, "%d", fd);
990         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
991
992         if (path)
993                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
994
995         return printed;
996 }
997
998 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
999                                               struct syscall_arg *arg)
1000 {
1001         int fd = arg->val;
1002         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1003         struct thread_trace *ttrace = thread__priv(arg->thread);
1004
1005         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1006                 zfree(&ttrace->paths.table[fd]);
1007
1008         return printed;
1009 }
1010
1011 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1012                                      unsigned long ptr)
1013 {
1014         struct thread_trace *ttrace = thread__priv(thread);
1015
1016         ttrace->filename.ptr = ptr;
1017         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1018 }
1019
1020 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1021                                               struct syscall_arg *arg)
1022 {
1023         unsigned long ptr = arg->val;
1024
1025         if (!arg->trace->vfs_getname)
1026                 return scnprintf(bf, size, "%#x", ptr);
1027
1028         thread__set_filename_pos(arg->thread, bf, ptr);
1029         return 0;
1030 }
1031
1032 static bool trace__filter_duration(struct trace *trace, double t)
1033 {
1034         return t < (trace->duration_filter * NSEC_PER_MSEC);
1035 }
1036
1037 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1038 {
1039         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1040
1041         return fprintf(fp, "%10.3f ", ts);
1042 }
1043
1044 /*
1045  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1046  * using ttrace->entry_time for a thread that receives a sys_exit without
1047  * first having received a sys_enter ("poll" issued before tracing session
1048  * starts, lost sys_enter exit due to ring buffer overflow).
1049  */
1050 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1051 {
1052         if (tstamp > 0)
1053                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1054
1055         return fprintf(fp, "         ? ");
1056 }
1057
/*
 * Set from the signal handler and polled by the main loop, so they must
 * be volatile to keep the compiler from caching them in registers
 * (the original plain bools made the main-loop reads formally unreliable).
 */
static volatile bool done = false;
static volatile bool interrupted = false;

/* SIGINT/SIGTERM handler: request loop termination; SIGINT also marks
 * the session as interrupted (affects the summary printed at exit). */
static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}
1066
1067 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1068                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1069 {
1070         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1071         printed += fprintf_duration(duration, duration_calculated, fp);
1072
1073         if (trace->multiple_threads) {
1074                 if (trace->show_comm)
1075                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1076                 printed += fprintf(fp, "%d ", thread->tid);
1077         }
1078
1079         return printed;
1080 }
1081
1082 static int trace__process_event(struct trace *trace, struct machine *machine,
1083                                 union perf_event *event, struct perf_sample *sample)
1084 {
1085         int ret = 0;
1086
1087         switch (event->header.type) {
1088         case PERF_RECORD_LOST:
1089                 color_fprintf(trace->output, PERF_COLOR_RED,
1090                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1091                 ret = machine__process_lost_event(machine, event, sample);
1092                 break;
1093         default:
1094                 ret = machine__process_event(machine, event, sample);
1095                 break;
1096         }
1097
1098         return ret;
1099 }
1100
1101 static int trace__tool_process(struct perf_tool *tool,
1102                                union perf_event *event,
1103                                struct perf_sample *sample,
1104                                struct machine *machine)
1105 {
1106         struct trace *trace = container_of(tool, struct trace, tool);
1107         return trace__process_event(trace, machine, event, sample);
1108 }
1109
1110 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1111 {
1112         struct machine *machine = vmachine;
1113
1114         if (machine->kptr_restrict_warned)
1115                 return NULL;
1116
1117         if (symbol_conf.kptr_restrict) {
1118                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1119                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1120                            "Kernel samples will not be resolved.\n");
1121                 machine->kptr_restrict_warned = true;
1122                 return NULL;
1123         }
1124
1125         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1126 }
1127
1128 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1129 {
1130         int err = symbol__init(NULL);
1131
1132         if (err)
1133                 return err;
1134
1135         trace->host = machine__new_host();
1136         if (trace->host == NULL)
1137                 return -ENOMEM;
1138
1139         if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1140                 return -errno;
1141
1142         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1143                                             evlist->threads, trace__tool_process, false,
1144                                             trace->opts.proc_map_timeout);
1145         if (err)
1146                 symbol__exit();
1147
1148         return err;
1149 }
1150
/*
 * Pick, for each argument of the syscall, the scnprintf beautifier to
 * use when printing its value: an explicit entry in syscall_fmts[] wins,
 * otherwise heuristics over the tracepoint field type/name apply.
 * Returns 0 on success, -1 on allocation failure.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
        struct format_field *field;
        int idx = 0, len;

        sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
        if (sc->arg_scnprintf == NULL)
                return -1;

        if (sc->fmt)
                sc->arg_parm = sc->fmt->arg_parm;

        for (field = sc->args; field; field = field->next) {
                /* Explicit per-syscall override from syscall_fmts[] */
                if (sc->fmt && sc->fmt->arg_scnprintf[idx])
                        sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
                /* "const char *" filename-ish args get vfs_getname treatment */
                else if (strcmp(field->type, "const char *") == 0 &&
                         (strcmp(field->name, "filename") == 0 ||
                          strcmp(field->name, "path") == 0 ||
                          strcmp(field->name, "pathname") == 0))
                        sc->arg_scnprintf[idx] = SCA_FILENAME;
                else if (field->flags & FIELD_IS_POINTER)
                        sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_MODE_T;
                /* Integer args whose name ends in "fd" are file descriptors */
                else if ((strcmp(field->type, "int") == 0 ||
                          strcmp(field->type, "unsigned int") == 0 ||
                          strcmp(field->type, "long") == 0) &&
                         (len = strlen(field->name)) >= 2 &&
                         strcmp(field->name + len - 2, "fd") == 0) {
                        /*
                         * /sys/kernel/tracing/events/syscalls/sys_enter*
                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
                         * 65 int
                         * 23 unsigned int
                         * 7 unsigned long
                         */
                        sc->arg_scnprintf[idx] = SCA_FD;
                }
                ++idx;
        }

        return 0;
}
1196
/*
 * Lazily fill in trace->syscalls.table[id]: resolve the syscall name,
 * find its pretty-printing overrides and parse its sys_enter tracepoint
 * format so its arguments can be beautified. Returns 0 on success,
 * -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
        char tp_name[128];
        struct syscall *sc;
        const char *name = syscalltbl__name(trace->sctbl, id);

        if (name == NULL)
                return -1;

        /* Grow the table up to 'id', zeroing the newly added slots */
        if (id > trace->syscalls.max) {
                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

                if (nsyscalls == NULL)
                        return -1;

                if (trace->syscalls.max != -1) {
                        memset(nsyscalls + trace->syscalls.max + 1, 0,
                               (id - trace->syscalls.max) * sizeof(*sc));
                } else {
                        /* First allocation: zero the whole table */
                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
                }

                trace->syscalls.table = nsyscalls;
                trace->syscalls.max   = id;
        }

        sc = trace->syscalls.table + id;
        sc->name = name;

        sc->fmt  = syscall_fmt__find(sc->name);

        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
        sc->tp_format = trace_event__tp_format("syscalls", tp_name);

        /* No tracepoint under the canonical name: retry under the alias */
        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
        }

        if (IS_ERR(sc->tp_format))
                return -1;

        sc->args = sc->tp_format->format.fields;
        sc->nr_args = sc->tp_format->format.nr_fields;
        /*
         * We need to check and discard the first variable '__syscall_nr'
         * or 'nr' that means the syscall number. It is needless here.
         * Note: that field does not exist on older kernels, hence the check.
         */
        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
                sc->args = sc->args->next;
                --sc->nr_args;
        }

        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

        return syscall__set_arg_fmts(sc);
}
1255
/*
 * Translate the user supplied list of syscall names
 * (trace->ev_qualifier) into syscall ids in trace->ev_qualifier_ids.
 * All invalid names are accumulated into a single error message before
 * failing with -EINVAL.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
        int err = 0, i;
        struct str_node *pos;

        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
                                                 sizeof(trace->ev_qualifier_ids.entries[0]));

        if (trace->ev_qualifier_ids.entries == NULL) {
                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
                       trace->output);
                err = -EINVAL;
                goto out;
        }

        i = 0;

        strlist__for_each_entry(pos, trace->ev_qualifier) {
                const char *sc = pos->s;
                int id = syscalltbl__id(trace->sctbl, sc);

                if (id < 0) {
                        /* Keep iterating so every bad name gets reported at once */
                        if (err == 0) {
                                fputs("Error:\tInvalid syscall ", trace->output);
                                err = -EINVAL;
                        } else {
                                fputs(", ", trace->output);
                        }

                        fputs(sc, trace->output);
                }

                trace->ev_qualifier_ids.entries[i++] = id;
        }

        if (err < 0) {
                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
                      "\nHint:\tand: 'man syscalls'\n", trace->output);
                /* Discard the partially filled id list on error */
                zfree(&trace->ev_qualifier_ids.entries);
                trace->ev_qualifier_ids.nr = 0;
        }
out:
        return err;
}
1301
1302 /*
1303  * args is to be interpreted as a series of longs but we need to handle
1304  * 8-byte unaligned accesses. args points to raw_data within the event
1305  * and raw_data is guaranteed to be 8-byte unaligned because it is
1306  * preceded by raw_size which is a u32. So we need to copy args to a temp
1307  * variable to read it. Most notably this avoids extended load instructions
1308  * on unaligned addresses
1309  */
1310
/*
 * Format all arguments of the syscall into 'bf', using the per-arg
 * beautifiers selected by syscall__set_arg_fmts() when available and
 * plain decimal otherwise. Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                                      unsigned char *args, struct trace *trace,
                                      struct thread *thread)
{
        size_t printed = 0;
        unsigned char *p;
        unsigned long val;

        if (sc->args != NULL) {
                struct format_field *field;
                u8 bit = 1;
                struct syscall_arg arg = {
                        .idx    = 0,
                        .mask   = 0,
                        .trace  = trace,
                        .thread = thread,
                };

                for (field = sc->args; field;
                     field = field->next, ++arg.idx, bit <<= 1) {
                        /* A beautifier may mask out args it already consumed */
                        if (arg.mask & bit)
                                continue;

                        /* special care for unaligned accesses */
                        p = args + sizeof(unsigned long) * arg.idx;
                        memcpy(&val, p, sizeof(val));

                        /*
                         * Suppress this argument if its value is zero and
                         * we don't have a string associated in an
                         * strarray for it.
                         */
                        if (val == 0 &&
                            !(sc->arg_scnprintf &&
                              sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
                              sc->arg_parm[arg.idx]))
                                continue;

                        printed += scnprintf(bf + printed, size - printed,
                                             "%s%s: ", printed ? ", " : "", field->name);
                        if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
                                arg.val = val;
                                if (sc->arg_parm)
                                        arg.parm = sc->arg_parm[arg.idx];
                                printed += sc->arg_scnprintf[arg.idx](bf + printed,
                                                                      size - printed, &arg);
                        } else {
                                /* No beautifier for this arg: raw decimal */
                                printed += scnprintf(bf + printed, size - printed,
                                                     "%ld", val);
                        }
                }
        } else if (IS_ERR(sc->tp_format)) {
                /*
                 * If we managed to read the tracepoint /format file, then we
                 * may end up not having any args, like with gettid(), so only
                 * print the raw args when we didn't manage to read it.
                 */
                int i = 0;

                while (i < 6) {
                        /* special care for unaligned accesses */
                        p = args + sizeof(unsigned long) * i;
                        memcpy(&val, p, sizeof(val));
                        printed += scnprintf(bf + printed, size - printed,
                                             "%sarg%d: %ld",
                                             printed ? ", " : "", i, val);
                        ++i;
                }
        }

        return printed;
}
1383
/* Signature shared by all the tracepoint sample handlers in this tool */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1387
/*
 * Return the descriptor for syscall 'id', lazily reading its info on
 * first use. Returns NULL (with an optional diagnostic, depending on
 * verbosity) when the id is invalid or its info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
                                           struct perf_evsel *evsel, int id)
{

        if (id < 0) {

                /*
                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
                 * before that, leaving at a higher verbosity level till that is
                 * explained. Reproduced with plain ftrace with:
                 *
                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
                 * grep "NR -1 " /t/trace_pipe
                 *
                 * After generating some load on the machine.
                 */
                if (verbose > 1) {
                        static u64 n;
                        fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
                                id, perf_evsel__name(evsel), ++n);
                }
                return NULL;
        }

        /* First time we see this id: resolve its name/format/beautifiers */
        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
            trace__read_syscall_info(trace, id))
                goto out_cant_read;

        /* Re-check: trace__read_syscall_info() may have failed to fill it in */
        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
                goto out_cant_read;

        return &trace->syscalls.table[id];

out_cant_read:
        if (verbose > 0) {
                fprintf(trace->output, "Problems reading syscall %d", id);
                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
                fputs(" information\n", trace->output);
        }
        return NULL;
}
1430
1431 static void thread__update_stats(struct thread_trace *ttrace,
1432                                  int id, struct perf_sample *sample)
1433 {
1434         struct int_node *inode;
1435         struct stats *stats;
1436         u64 duration = 0;
1437
1438         inode = intlist__findnew(ttrace->syscall_stats, id);
1439         if (inode == NULL)
1440                 return;
1441
1442         stats = inode->priv;
1443         if (stats == NULL) {
1444                 stats = malloc(sizeof(struct stats));
1445                 if (stats == NULL)
1446                         return;
1447                 init_stats(stats);
1448                 inode->priv = stats;
1449         }
1450
1451         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1452                 duration = sample->time - ttrace->entry_time;
1453
1454         update_stats(stats, duration);
1455 }
1456
1457 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1458 {
1459         struct thread_trace *ttrace;
1460         u64 duration;
1461         size_t printed;
1462
1463         if (trace->current == NULL)
1464                 return 0;
1465
1466         ttrace = thread__priv(trace->current);
1467
1468         if (!ttrace->entry_pending)
1469                 return 0;
1470
1471         duration = sample->time - ttrace->entry_time;
1472
1473         printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1474         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1475         ttrace->entry_pending = false;
1476
1477         return printed;
1478 }
1479
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the thread's
 * entry_str. For syscalls that never return (exit, exit_group) the line
 * is printed immediately; otherwise it is left pending until the
 * matching sys_exit completes it.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
                            union perf_event *event __maybe_unused,
                            struct perf_sample *sample)
{
        char *msg;
        void *args;
        size_t printed = 0;
        struct thread *thread;
        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
        struct syscall *sc = trace__syscall_info(trace, evsel, id);
        struct thread_trace *ttrace;

        if (sc == NULL)
                return -1;

        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                goto out_put;

        args = perf_evsel__sc_tp_ptr(evsel, args, sample);

        /* Allocate the per-thread formatting buffer on first use */
        if (ttrace->entry_str == NULL) {
                ttrace->entry_str = malloc(trace__entry_str_size);
                if (!ttrace->entry_str)
                        goto out_put;
        }

        /* Flush any still-pending entry line from another thread as "...)" */
        if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
                trace__printf_interrupted_entry(trace, sample);

        ttrace->entry_time = sample->time;
        msg = ttrace->entry_str;
        printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

        printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
                                           args, trace, thread);

        if (sc->is_exit) {
                /* exit/exit_group: no sys_exit will come, print the line now */
                if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
                        trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
                        fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
                }
        } else {
                ttrace->entry_pending = true;
                /* See trace__vfs_getname & trace__sys_exit */
                ttrace->filename.pending_open = false;
        }

        /* Track the last thread seen, for trace__printf_interrupted_entry() */
        if (trace->current != thread) {
                thread__put(trace->current);
                trace->current = thread__get(thread);
        }
        err = 0;
out_put:
        thread__put(thread);
        return err;
}
1538
1539 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1540                                     struct perf_sample *sample,
1541                                     struct callchain_cursor *cursor)
1542 {
1543         struct addr_location al;
1544
1545         if (machine__resolve(trace->host, &al, sample) < 0 ||
1546             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1547                 return -1;
1548
1549         return 0;
1550 }
1551
1552 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1553 {
1554         /* TODO: user-configurable print_opts */
1555         const unsigned int print_opts = EVSEL__PRINT_SYM |
1556                                         EVSEL__PRINT_DSO |
1557                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1558
1559         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1560 }
1561
1562 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1563                            union perf_event *event __maybe_unused,
1564                            struct perf_sample *sample)
1565 {
1566         long ret;
1567         u64 duration = 0;
1568         bool duration_calculated = false;
1569         struct thread *thread;
1570         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1571         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1572         struct thread_trace *ttrace;
1573
1574         if (sc == NULL)
1575                 return -1;
1576
1577         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1578         ttrace = thread__trace(thread, trace->output);
1579         if (ttrace == NULL)
1580                 goto out_put;
1581
1582         if (trace->summary)
1583                 thread__update_stats(ttrace, id, sample);
1584
1585         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1586
1587         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1588                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1589                 ttrace->filename.pending_open = false;
1590                 ++trace->stats.vfs_getname;
1591         }
1592
1593         if (ttrace->entry_time) {
1594                 duration = sample->time - ttrace->entry_time;
1595                 if (trace__filter_duration(trace, duration))
1596                         goto out;
1597                 duration_calculated = true;
1598         } else if (trace->duration_filter)
1599                 goto out;
1600
1601         if (sample->callchain) {
1602                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1603                 if (callchain_ret == 0) {
1604                         if (callchain_cursor.nr < trace->min_stack)
1605                                 goto out;
1606                         callchain_ret = 1;
1607                 }
1608         }
1609
1610         if (trace->summary_only)
1611                 goto out;
1612
1613         trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1614
1615         if (ttrace->entry_pending) {
1616                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1617         } else {
1618                 fprintf(trace->output, " ... [");
1619                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1620                 fprintf(trace->output, "]: %s()", sc->name);
1621         }
1622
1623         if (sc->fmt == NULL) {
1624 signed_print:
1625                 fprintf(trace->output, ") = %ld", ret);
1626         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1627                 char bf[STRERR_BUFSIZE];
1628                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1629                            *e = audit_errno_to_name(-ret);
1630
1631                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1632         } else if (ret == 0 && sc->fmt->timeout)
1633                 fprintf(trace->output, ") = 0 Timeout");
1634         else if (sc->fmt->hexret)
1635                 fprintf(trace->output, ") = %#lx", ret);
1636         else if (sc->fmt->errpid) {
1637                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1638
1639                 if (child != NULL) {
1640                         fprintf(trace->output, ") = %ld", ret);
1641                         if (child->comm_set)
1642                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1643                         thread__put(child);
1644                 }
1645         } else
1646                 goto signed_print;
1647
1648         fputc('\n', trace->output);
1649
1650         if (callchain_ret > 0)
1651                 trace__fprintf_callchain(trace, sample);
1652         else if (callchain_ret < 0)
1653                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1654 out:
1655         ttrace->entry_pending = false;
1656         err = 0;
1657 out_put:
1658         thread__put(thread);
1659         return err;
1660 }
1661
1662 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1663                               union perf_event *event __maybe_unused,
1664                               struct perf_sample *sample)
1665 {
1666         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1667         struct thread_trace *ttrace;
1668         size_t filename_len, entry_str_len, to_move;
1669         ssize_t remaining_space;
1670         char *pos;
1671         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1672
1673         if (!thread)
1674                 goto out;
1675
1676         ttrace = thread__priv(thread);
1677         if (!ttrace)
1678                 goto out_put;
1679
1680         filename_len = strlen(filename);
1681         if (filename_len == 0)
1682                 goto out_put;
1683
1684         if (ttrace->filename.namelen < filename_len) {
1685                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1686
1687                 if (f == NULL)
1688                         goto out_put;
1689
1690                 ttrace->filename.namelen = filename_len;
1691                 ttrace->filename.name = f;
1692         }
1693
1694         strcpy(ttrace->filename.name, filename);
1695         ttrace->filename.pending_open = true;
1696
1697         if (!ttrace->filename.ptr)
1698                 goto out_put;
1699
1700         entry_str_len = strlen(ttrace->entry_str);
1701         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1702         if (remaining_space <= 0)
1703                 goto out_put;
1704
1705         if (filename_len > (size_t)remaining_space) {
1706                 filename += filename_len - remaining_space;
1707                 filename_len = remaining_space;
1708         }
1709
1710         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1711         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1712         memmove(pos + filename_len, pos, to_move);
1713         memcpy(pos, filename, filename_len);
1714
1715         ttrace->filename.ptr = 0;
1716         ttrace->filename.entry_str_pos = 0;
1717 out_put:
1718         thread__put(thread);
1719 out:
1720         return 0;
1721 }
1722
1723 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1724                                      union perf_event *event __maybe_unused,
1725                                      struct perf_sample *sample)
1726 {
1727         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1728         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1729         struct thread *thread = machine__findnew_thread(trace->host,
1730                                                         sample->pid,
1731                                                         sample->tid);
1732         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1733
1734         if (ttrace == NULL)
1735                 goto out_dump;
1736
1737         ttrace->runtime_ms += runtime_ms;
1738         trace->runtime_ms += runtime_ms;
1739 out_put:
1740         thread__put(thread);
1741         return 0;
1742
1743 out_dump:
1744         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1745                evsel->name,
1746                perf_evsel__strval(evsel, sample, "comm"),
1747                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1748                runtime,
1749                perf_evsel__intval(evsel, sample, "vruntime"));
1750         goto out_put;
1751 }
1752
1753 static void bpf_output__printer(enum binary_printer_ops op,
1754                                 unsigned int val, void *extra)
1755 {
1756         FILE *output = extra;
1757         unsigned char ch = (unsigned char)val;
1758
1759         switch (op) {
1760         case BINARY_PRINT_CHAR_DATA:
1761                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1762                 break;
1763         case BINARY_PRINT_DATA_BEGIN:
1764         case BINARY_PRINT_LINE_BEGIN:
1765         case BINARY_PRINT_ADDR:
1766         case BINARY_PRINT_NUM_DATA:
1767         case BINARY_PRINT_NUM_PAD:
1768         case BINARY_PRINT_SEP:
1769         case BINARY_PRINT_CHAR_PAD:
1770         case BINARY_PRINT_LINE_END:
1771         case BINARY_PRINT_DATA_END:
1772         default:
1773                 break;
1774         }
1775 }
1776
1777 static void bpf_output__fprintf(struct trace *trace,
1778                                 struct perf_sample *sample)
1779 {
1780         print_binary(sample->raw_data, sample->raw_size, 8,
1781                      bpf_output__printer, trace->output);
1782 }
1783
1784 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1785                                 union perf_event *event __maybe_unused,
1786                                 struct perf_sample *sample)
1787 {
1788         int callchain_ret = 0;
1789
1790         if (sample->callchain) {
1791                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1792                 if (callchain_ret == 0) {
1793                         if (callchain_cursor.nr < trace->min_stack)
1794                                 goto out;
1795                         callchain_ret = 1;
1796                 }
1797         }
1798
1799         trace__printf_interrupted_entry(trace, sample);
1800         trace__fprintf_tstamp(trace, sample->time, trace->output);
1801
1802         if (trace->trace_syscalls)
1803                 fprintf(trace->output, "(         ): ");
1804
1805         fprintf(trace->output, "%s:", evsel->name);
1806
1807         if (perf_evsel__is_bpf_output(evsel)) {
1808                 bpf_output__fprintf(trace, sample);
1809         } else if (evsel->tp_format) {
1810                 event_format__fprintf(evsel->tp_format, sample->cpu,
1811                                       sample->raw_data, sample->raw_size,
1812                                       trace->output);
1813         }
1814
1815         fprintf(trace->output, ")\n");
1816
1817         if (callchain_ret > 0)
1818                 trace__fprintf_callchain(trace, sample);
1819         else if (callchain_ret < 0)
1820                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1821 out:
1822         return 0;
1823 }
1824
1825 static void print_location(FILE *f, struct perf_sample *sample,
1826                            struct addr_location *al,
1827                            bool print_dso, bool print_sym)
1828 {
1829
1830         if ((verbose > 0 || print_dso) && al->map)
1831                 fprintf(f, "%s@", al->map->dso->long_name);
1832
1833         if ((verbose > 0 || print_sym) && al->sym)
1834                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1835                         al->addr - al->sym->start);
1836         else if (al->map)
1837                 fprintf(f, "0x%" PRIx64, al->addr);
1838         else
1839                 fprintf(f, "0x%" PRIx64, sample->addr);
1840 }
1841
1842 static int trace__pgfault(struct trace *trace,
1843                           struct perf_evsel *evsel,
1844                           union perf_event *event __maybe_unused,
1845                           struct perf_sample *sample)
1846 {
1847         struct thread *thread;
1848         struct addr_location al;
1849         char map_type = 'd';
1850         struct thread_trace *ttrace;
1851         int err = -1;
1852         int callchain_ret = 0;
1853
1854         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1855
1856         if (sample->callchain) {
1857                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1858                 if (callchain_ret == 0) {
1859                         if (callchain_cursor.nr < trace->min_stack)
1860                                 goto out_put;
1861                         callchain_ret = 1;
1862                 }
1863         }
1864
1865         ttrace = thread__trace(thread, trace->output);
1866         if (ttrace == NULL)
1867                 goto out_put;
1868
1869         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1870                 ttrace->pfmaj++;
1871         else
1872                 ttrace->pfmin++;
1873
1874         if (trace->summary_only)
1875                 goto out;
1876
1877         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1878                               sample->ip, &al);
1879
1880         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1881
1882         fprintf(trace->output, "%sfault [",
1883                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1884                 "maj" : "min");
1885
1886         print_location(trace->output, sample, &al, false, true);
1887
1888         fprintf(trace->output, "] => ");
1889
1890         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1891                                    sample->addr, &al);
1892
1893         if (!al.map) {
1894                 thread__find_addr_location(thread, sample->cpumode,
1895                                            MAP__FUNCTION, sample->addr, &al);
1896
1897                 if (al.map)
1898                         map_type = 'x';
1899                 else
1900                         map_type = '?';
1901         }
1902
1903         print_location(trace->output, sample, &al, true, false);
1904
1905         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1906
1907         if (callchain_ret > 0)
1908                 trace__fprintf_callchain(trace, sample);
1909         else if (callchain_ret < 0)
1910                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1911 out:
1912         err = 0;
1913 out_put:
1914         thread__put(thread);
1915         return err;
1916 }
1917
1918 static void trace__set_base_time(struct trace *trace,
1919                                  struct perf_evsel *evsel,
1920                                  struct perf_sample *sample)
1921 {
1922         /*
1923          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1924          * and don't use sample->time unconditionally, we may end up having
1925          * some other event in the future without PERF_SAMPLE_TIME for good
1926          * reason, i.e. we may not be interested in its timestamps, just in
1927          * it taking place, picking some piece of information when it
1928          * appears in our event stream (vfs_getname comes to mind).
1929          */
1930         if (trace->base_time == 0 && !trace->full_time &&
1931             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1932                 trace->base_time = sample->time;
1933 }
1934
1935 static int trace__process_sample(struct perf_tool *tool,
1936                                  union perf_event *event,
1937                                  struct perf_sample *sample,
1938                                  struct perf_evsel *evsel,
1939                                  struct machine *machine __maybe_unused)
1940 {
1941         struct trace *trace = container_of(tool, struct trace, tool);
1942         struct thread *thread;
1943         int err = 0;
1944
1945         tracepoint_handler handler = evsel->handler;
1946
1947         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1948         if (thread && thread__is_filtered(thread))
1949                 goto out;
1950
1951         trace__set_base_time(trace, evsel, sample);
1952
1953         if (handler) {
1954                 ++trace->nr_events;
1955                 handler(trace, evsel, event, sample);
1956         }
1957 out:
1958         thread__put(thread);
1959         return err;
1960 }
1961
1962 static int trace__record(struct trace *trace, int argc, const char **argv)
1963 {
1964         unsigned int rec_argc, i, j;
1965         const char **rec_argv;
1966         const char * const record_args[] = {
1967                 "record",
1968                 "-R",
1969                 "-m", "1024",
1970                 "-c", "1",
1971         };
1972
1973         const char * const sc_args[] = { "-e", };
1974         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1975         const char * const majpf_args[] = { "-e", "major-faults" };
1976         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1977         const char * const minpf_args[] = { "-e", "minor-faults" };
1978         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1979
1980         /* +1 is for the event string below */
1981         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1982                 majpf_args_nr + minpf_args_nr + argc;
1983         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1984
1985         if (rec_argv == NULL)
1986                 return -ENOMEM;
1987
1988         j = 0;
1989         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1990                 rec_argv[j++] = record_args[i];
1991
1992         if (trace->trace_syscalls) {
1993                 for (i = 0; i < sc_args_nr; i++)
1994                         rec_argv[j++] = sc_args[i];
1995
1996                 /* event string may be different for older kernels - e.g., RHEL6 */
1997                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
1998                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
1999                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2000                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2001                 else {
2002                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2003                         return -1;
2004                 }
2005         }
2006
2007         if (trace->trace_pgfaults & TRACE_PFMAJ)
2008                 for (i = 0; i < majpf_args_nr; i++)
2009                         rec_argv[j++] = majpf_args[i];
2010
2011         if (trace->trace_pgfaults & TRACE_PFMIN)
2012                 for (i = 0; i < minpf_args_nr; i++)
2013                         rec_argv[j++] = minpf_args[i];
2014
2015         for (i = 0; i < (unsigned int)argc; i++)
2016                 rec_argv[j++] = argv[i];
2017
2018         return cmd_record(j, rec_argv);
2019 }
2020
2021 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2022
2023 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2024 {
2025         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2026
2027         if (IS_ERR(evsel))
2028                 return false;
2029
2030         if (perf_evsel__field(evsel, "pathname") == NULL) {
2031                 perf_evsel__delete(evsel);
2032                 return false;
2033         }
2034
2035         evsel->handler = trace__vfs_getname;
2036         perf_evlist__add(evlist, evsel);
2037         return true;
2038 }
2039
2040 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2041 {
2042         struct perf_evsel *evsel;
2043         struct perf_event_attr attr = {
2044                 .type = PERF_TYPE_SOFTWARE,
2045                 .mmap_data = 1,
2046         };
2047
2048         attr.config = config;
2049         attr.sample_period = 1;
2050
2051         event_attr_init(&attr);
2052
2053         evsel = perf_evsel__new(&attr);
2054         if (evsel)
2055                 evsel->handler = trace__pgfault;
2056
2057         return evsel;
2058 }
2059
2060 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2061 {
2062         const u32 type = event->header.type;
2063         struct perf_evsel *evsel;
2064
2065         if (type != PERF_RECORD_SAMPLE) {
2066                 trace__process_event(trace, trace->host, event, sample);
2067                 return;
2068         }
2069
2070         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2071         if (evsel == NULL) {
2072                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2073                 return;
2074         }
2075
2076         trace__set_base_time(trace, evsel, sample);
2077
2078         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2079             sample->raw_data == NULL) {
2080                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2081                        perf_evsel__name(evsel), sample->tid,
2082                        sample->cpu, sample->raw_size);
2083         } else {
2084                 tracepoint_handler handler = evsel->handler;
2085                 handler(trace, evsel, event, sample);
2086         }
2087 }
2088
2089 static int trace__add_syscall_newtp(struct trace *trace)
2090 {
2091         int ret = -1;
2092         struct perf_evlist *evlist = trace->evlist;
2093         struct perf_evsel *sys_enter, *sys_exit;
2094
2095         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2096         if (sys_enter == NULL)
2097                 goto out;
2098
2099         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2100                 goto out_delete_sys_enter;
2101
2102         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2103         if (sys_exit == NULL)
2104                 goto out_delete_sys_enter;
2105
2106         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2107                 goto out_delete_sys_exit;
2108
2109         perf_evlist__add(evlist, sys_enter);
2110         perf_evlist__add(evlist, sys_exit);
2111
2112         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2113                 /*
2114                  * We're interested only in the user space callchain
2115                  * leading to the syscall, allow overriding that for
2116                  * debugging reasons using --kernel_syscall_callchains
2117                  */
2118                 sys_exit->attr.exclude_callchain_kernel = 1;
2119         }
2120
2121         trace->syscalls.events.sys_enter = sys_enter;
2122         trace->syscalls.events.sys_exit  = sys_exit;
2123
2124         ret = 0;
2125 out:
2126         return ret;
2127
2128 out_delete_sys_exit:
2129         perf_evsel__delete_priv(sys_exit);
2130 out_delete_sys_enter:
2131         perf_evsel__delete_priv(sys_enter);
2132         goto out;
2133 }
2134
2135 static int trace__set_ev_qualifier_filter(struct trace *trace)
2136 {
2137         int err = -1;
2138         struct perf_evsel *sys_exit;
2139         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2140                                                 trace->ev_qualifier_ids.nr,
2141                                                 trace->ev_qualifier_ids.entries);
2142
2143         if (filter == NULL)
2144                 goto out_enomem;
2145
2146         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2147                                           filter)) {
2148                 sys_exit = trace->syscalls.events.sys_exit;
2149                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2150         }
2151
2152         free(filter);
2153 out:
2154         return err;
2155 out_enomem:
2156         errno = ENOMEM;
2157         goto out;
2158 }
2159
2160 static int trace__run(struct trace *trace, int argc, const char **argv)
2161 {
2162         struct perf_evlist *evlist = trace->evlist;
2163         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2164         int err = -1, i;
2165         unsigned long before;
2166         const bool forks = argc > 0;
2167         bool draining = false;
2168
2169         trace->live = true;
2170
2171         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2172                 goto out_error_raw_syscalls;
2173
2174         if (trace->trace_syscalls)
2175                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2176
2177         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2178                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2179                 if (pgfault_maj == NULL)
2180                         goto out_error_mem;
2181                 perf_evlist__add(evlist, pgfault_maj);
2182         }
2183
2184         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2185                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2186                 if (pgfault_min == NULL)
2187                         goto out_error_mem;
2188                 perf_evlist__add(evlist, pgfault_min);
2189         }
2190
2191         if (trace->sched &&
2192             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2193                                    trace__sched_stat_runtime))
2194                 goto out_error_sched_stat_runtime;
2195
2196         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2197         if (err < 0) {
2198                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2199                 goto out_delete_evlist;
2200         }
2201
2202         err = trace__symbols_init(trace, evlist);
2203         if (err < 0) {
2204                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2205                 goto out_delete_evlist;
2206         }
2207
2208         perf_evlist__config(evlist, &trace->opts, NULL);
2209
2210         if (callchain_param.enabled) {
2211                 bool use_identifier = false;
2212
2213                 if (trace->syscalls.events.sys_exit) {
2214                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2215                                                      &trace->opts, &callchain_param);
2216                         use_identifier = true;
2217                 }
2218
2219                 if (pgfault_maj) {
2220                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2221                         use_identifier = true;
2222                 }
2223
2224                 if (pgfault_min) {
2225                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2226                         use_identifier = true;
2227                 }
2228
2229                 if (use_identifier) {
2230                        /*
2231                         * Now we have evsels with different sample_ids, use
2232                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2233                         * from a fixed position in each ring buffer record.
2234                         *
2235                         * As of this the changeset introducing this comment, this
2236                         * isn't strictly needed, as the fields that can come before
2237                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2238                         * some of those for things like copying the payload of
2239                         * pointer syscall arguments, and for vfs_getname we don't
2240                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2241                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2242                         */
2243                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2244                         perf_evlist__reset_sample_bit(evlist, ID);
2245                 }
2246         }
2247
2248         signal(SIGCHLD, sig_handler);
2249         signal(SIGINT, sig_handler);
2250
2251         if (forks) {
2252                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2253                                                     argv, false, NULL);
2254                 if (err < 0) {
2255                         fprintf(trace->output, "Couldn't run the workload!\n");
2256                         goto out_delete_evlist;
2257                 }
2258         }
2259
2260         err = perf_evlist__open(evlist);
2261         if (err < 0)
2262                 goto out_error_open;
2263
2264         err = bpf__apply_obj_config();
2265         if (err) {
2266                 char errbuf[BUFSIZ];
2267
2268                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2269                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2270                          errbuf);
2271                 goto out_error_open;
2272         }
2273
2274         /*
2275          * Better not use !target__has_task() here because we need to cover the
2276          * case where no threads were specified in the command line, but a
2277          * workload was, and in that case we will fill in the thread_map when
2278          * we fork the workload in perf_evlist__prepare_workload.
2279          */
2280         if (trace->filter_pids.nr > 0)
2281                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2282         else if (thread_map__pid(evlist->threads, 0) == -1)
2283                 err = perf_evlist__set_filter_pid(evlist, getpid());
2284
2285         if (err < 0)
2286                 goto out_error_mem;
2287
2288         if (trace->ev_qualifier_ids.nr > 0) {
2289                 err = trace__set_ev_qualifier_filter(trace);
2290                 if (err < 0)
2291                         goto out_errno;
2292
2293                 pr_debug("event qualifier tracepoint filter: %s\n",
2294                          trace->syscalls.events.sys_exit->filter);
2295         }
2296
2297         err = perf_evlist__apply_filters(evlist, &evsel);
2298         if (err < 0)
2299                 goto out_error_apply_filters;
2300
2301         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2302         if (err < 0)
2303                 goto out_error_mmap;
2304
2305         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2306                 perf_evlist__enable(evlist);
2307
2308         if (forks)
2309                 perf_evlist__start_workload(evlist);
2310
2311         if (trace->opts.initial_delay) {
2312                 usleep(trace->opts.initial_delay * 1000);
2313                 perf_evlist__enable(evlist);
2314         }
2315
2316         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2317                                   evlist->threads->nr > 1 ||
2318                                   perf_evlist__first(evlist)->attr.inherit;
2319 again:
2320         before = trace->nr_events;
2321
2322         for (i = 0; i < evlist->nr_mmaps; i++) {
2323                 union perf_event *event;
2324
2325                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2326                         struct perf_sample sample;
2327
2328                         ++trace->nr_events;
2329
2330                         err = perf_evlist__parse_sample(evlist, event, &sample);
2331                         if (err) {
2332                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2333                                 goto next_event;
2334                         }
2335
2336                         trace__handle_event(trace, event, &sample);
2337 next_event:
2338                         perf_evlist__mmap_consume(evlist, i);
2339
2340                         if (interrupted)
2341                                 goto out_disable;
2342
2343                         if (done && !draining) {
2344                                 perf_evlist__disable(evlist);
2345                                 draining = true;
2346                         }
2347                 }
2348         }
2349
2350         if (trace->nr_events == before) {
2351                 int timeout = done ? 100 : -1;
2352
2353                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2354                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2355                                 draining = true;
2356
2357                         goto again;
2358                 }
2359         } else {
2360                 goto again;
2361         }
2362
2363 out_disable:
2364         thread__zput(trace->current);
2365
2366         perf_evlist__disable(evlist);
2367
2368         if (!err) {
2369                 if (trace->summary)
2370                         trace__fprintf_thread_summary(trace, trace->output);
2371
2372                 if (trace->show_tool_stats) {
2373                         fprintf(trace->output, "Stats:\n "
2374                                                " vfs_getname : %" PRIu64 "\n"
2375                                                " proc_getname: %" PRIu64 "\n",
2376                                 trace->stats.vfs_getname,
2377                                 trace->stats.proc_getname);
2378                 }
2379         }
2380
2381 out_delete_evlist:
2382         perf_evlist__delete(evlist);
2383         trace->evlist = NULL;
2384         trace->live = false;
2385         return err;
2386 {
2387         char errbuf[BUFSIZ];
2388
2389 out_error_sched_stat_runtime:
2390         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2391         goto out_error;
2392
2393 out_error_raw_syscalls:
2394         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2395         goto out_error;
2396
2397 out_error_mmap:
2398         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2399         goto out_error;
2400
2401 out_error_open:
2402         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2403
2404 out_error:
2405         fprintf(trace->output, "%s\n", errbuf);
2406         goto out_delete_evlist;
2407
2408 out_error_apply_filters:
2409         fprintf(trace->output,
2410                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2411                 evsel->filter, perf_evsel__name(evsel), errno,
2412                 str_error_r(errno, errbuf, sizeof(errbuf)));
2413         goto out_delete_evlist;
2414 }
2415 out_error_mem:
2416         fprintf(trace->output, "Not enough memory to run!\n");
2417         goto out_delete_evlist;
2418
2419 out_errno:
2420         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2421         goto out_delete_evlist;
2422 }
2423
2424 static int trace__replay(struct trace *trace)
2425 {
2426         const struct perf_evsel_str_handler handlers[] = {
2427                 { "probe:vfs_getname",       trace__vfs_getname, },
2428         };
2429         struct perf_data_file file = {
2430                 .path  = input_name,
2431                 .mode  = PERF_DATA_MODE_READ,
2432                 .force = trace->force,
2433         };
2434         struct perf_session *session;
2435         struct perf_evsel *evsel;
2436         int err = -1;
2437
2438         trace->tool.sample        = trace__process_sample;
2439         trace->tool.mmap          = perf_event__process_mmap;
2440         trace->tool.mmap2         = perf_event__process_mmap2;
2441         trace->tool.comm          = perf_event__process_comm;
2442         trace->tool.exit          = perf_event__process_exit;
2443         trace->tool.fork          = perf_event__process_fork;
2444         trace->tool.attr          = perf_event__process_attr;
2445         trace->tool.tracing_data  = perf_event__process_tracing_data;
2446         trace->tool.build_id      = perf_event__process_build_id;
2447         trace->tool.namespaces    = perf_event__process_namespaces;
2448
2449         trace->tool.ordered_events = true;
2450         trace->tool.ordering_requires_timestamps = true;
2451
2452         /* add tid to output */
2453         trace->multiple_threads = true;
2454
2455         session = perf_session__new(&file, false, &trace->tool);
2456         if (session == NULL)
2457                 return -1;
2458
2459         if (trace->opts.target.pid)
2460                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2461
2462         if (trace->opts.target.tid)
2463                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2464
2465         if (symbol__init(&session->header.env) < 0)
2466                 goto out;
2467
2468         trace->host = &session->machines.host;
2469
2470         err = perf_session__set_tracepoints_handlers(session, handlers);
2471         if (err)
2472                 goto out;
2473
2474         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2475                                                      "raw_syscalls:sys_enter");
2476         /* older kernels have syscalls tp versus raw_syscalls */
2477         if (evsel == NULL)
2478                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2479                                                              "syscalls:sys_enter");
2480
2481         if (evsel &&
2482             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2483             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2484                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2485                 goto out;
2486         }
2487
2488         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2489                                                      "raw_syscalls:sys_exit");
2490         if (evsel == NULL)
2491                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2492                                                              "syscalls:sys_exit");
2493         if (evsel &&
2494             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2495             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2496                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2497                 goto out;
2498         }
2499
2500         evlist__for_each_entry(session->evlist, evsel) {
2501                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2502                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2503                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2504                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2505                         evsel->handler = trace__pgfault;
2506         }
2507
2508         setup_pager();
2509
2510         err = perf_session__process_events(session);
2511         if (err)
2512                 pr_err("Failed to process events, error %d", err);
2513
2514         else if (trace->summary)
2515                 trace__fprintf_thread_summary(trace, trace->output);
2516
2517 out:
2518         perf_session__delete(session);
2519
2520         return err;
2521 }
2522
/* Print the banner that precedes the per-thread summary; returns chars written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2531
/*
 * Resort the per-thread syscall stats by total time (msecs), used by
 * thread__dump_stats().  The block after the macro is the per-entry
 * initializer: 'nd' is the source intlist rb_node, 'entry' the new
 * sorted-tree entry with the fields declared below.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;	/* may be NULL if never updated */

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = number of calls * average duration, converted to msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2545
/*
 * Print one thread's per-syscall statistics table, sorted by total time
 * spent in each syscall.  Returns the number of characters written, or 0
 * if the sorted copy of the stats could not be built.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* build a sorted (by msecs, see DEFINE_RESORT_RB above) copy of the stats */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats accumulate nsecs; the table displays msecs */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative standard deviation, as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2588
2589 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2590 {
2591         size_t printed = 0;
2592         struct thread_trace *ttrace = thread__priv(thread);
2593         double ratio;
2594
2595         if (ttrace == NULL)
2596                 return 0;
2597
2598         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2599
2600         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2601         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2602         printed += fprintf(fp, "%.1f%%", ratio);
2603         if (ttrace->pfmaj)
2604                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2605         if (ttrace->pfmin)
2606                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2607         if (trace->sched)
2608                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2609         else if (fputc('\n', fp) != EOF)
2610                 ++printed;
2611
2612         printed += thread__dump_stats(ttrace, trace, fp);
2613
2614         return printed;
2615 }
2616
2617 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2618 {
2619         return ttrace ? ttrace->nr_events : 0;
2620 }
2621
/*
 * Resort the machine's threads by per-thread event count for the summary.
 * thread->priv is the struct thread_trace pointer, which may be NULL for
 * threads that never hit one of our events (thread__nr_events handles that).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2628
/*
 * Print the end-of-run summary: the header followed by one section per
 * thread, ordered by event count.  Returns the number of characters
 * written, or 0 if the sorted thread list could not be built (note the
 * header has already been printed in that case).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	/* sorted-by-nr_events copy of the host machine's thread tree */
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2647
2648 static int trace__set_duration(const struct option *opt, const char *str,
2649                                int unset __maybe_unused)
2650 {
2651         struct trace *trace = opt->value;
2652
2653         trace->duration_filter = atof(str);
2654         return 0;
2655 }
2656
2657 static int trace__set_filter_pids(const struct option *opt, const char *str,
2658                                   int unset __maybe_unused)
2659 {
2660         int ret = -1;
2661         size_t i;
2662         struct trace *trace = opt->value;
2663         /*
2664          * FIXME: introduce a intarray class, plain parse csv and create a
2665          * { int nr, int entries[] } struct...
2666          */
2667         struct intlist *list = intlist__new(str);
2668
2669         if (list == NULL)
2670                 return -1;
2671
2672         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2673         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2674
2675         if (trace->filter_pids.entries == NULL)
2676                 goto out;
2677
2678         trace->filter_pids.entries[0] = getpid();
2679
2680         for (i = 1; i < trace->filter_pids.nr; ++i)
2681                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2682
2683         intlist__delete(list);
2684         ret = 0;
2685 out:
2686         return ret;
2687 }
2688
2689 static int trace__open_output(struct trace *trace, const char *filename)
2690 {
2691         struct stat st;
2692
2693         if (!stat(filename, &st) && st.st_size) {
2694                 char oldname[PATH_MAX];
2695
2696                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2697                 unlink(oldname);
2698                 rename(filename, oldname);
2699         }
2700
2701         trace->output = fopen(filename, "w");
2702
2703         return trace->output == NULL ? -errno : 0;
2704 }
2705
2706 static int parse_pagefaults(const struct option *opt, const char *str,
2707                             int unset __maybe_unused)
2708 {
2709         int *trace_pgfaults = opt->value;
2710
2711         if (strcmp(str, "all") == 0)
2712                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2713         else if (strcmp(str, "maj") == 0)
2714                 *trace_pgfaults |= TRACE_PFMAJ;
2715         else if (strcmp(str, "min") == 0)
2716                 *trace_pgfaults |= TRACE_PFMIN;
2717         else
2718                 return -1;
2719
2720         return 0;
2721 }
2722
/* Set the same sample handler on every event in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2730
2731 /*
2732  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2733  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2734  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2735  *
2736  * It'd be better to introduce a parse_options() variant that would return a
2737  * list with the terms it didn't match to an event...
2738  */
2739 static int trace__parse_events_option(const struct option *opt, const char *str,
2740                                       int unset __maybe_unused)
2741 {
2742         struct trace *trace = (struct trace *)opt->value;
2743         const char *s = str;
2744         char *sep = NULL, *lists[2] = { NULL, NULL, };
2745         int len = strlen(str), err = -1, list;
2746         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2747         char group_name[PATH_MAX];
2748
2749         if (strace_groups_dir == NULL)
2750                 return -1;
2751
2752         if (*s == '!') {
2753                 ++s;
2754                 trace->not_ev_qualifier = true;
2755         }
2756
2757         while (1) {
2758                 if ((sep = strchr(s, ',')) != NULL)
2759                         *sep = '\0';
2760
2761                 list = 0;
2762                 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2763                         list = 1;
2764                 } else {
2765                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2766                         if (access(group_name, R_OK) == 0)
2767                                 list = 1;
2768                 }
2769
2770                 if (lists[list]) {
2771                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2772                 } else {
2773                         lists[list] = malloc(len);
2774                         if (lists[list] == NULL)
2775                                 goto out;
2776                         strcpy(lists[list], s);
2777                 }
2778
2779                 if (!sep)
2780                         break;
2781
2782                 *sep = ',';
2783                 s = sep + 1;
2784         }
2785
2786         if (lists[1] != NULL) {
2787                 struct strlist_config slist_config = {
2788                         .dirname = strace_groups_dir,
2789                 };
2790
2791                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2792                 if (trace->ev_qualifier == NULL) {
2793                         fputs("Not enough memory to parse event qualifier", trace->output);
2794                         goto out;
2795                 }
2796
2797                 if (trace__validate_ev_qualifier(trace))
2798                         goto out;
2799         }
2800
2801         err = 0;
2802
2803         if (lists[0]) {
2804                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2805                                                "event selector. use 'perf list' to list available events",
2806                                                parse_events_option);
2807                 err = parse_events_option(&o, lists[0], 0);
2808         }
2809 out:
2810         if (sep)
2811                 *sep = ',';
2812
2813         return err;
2814 }
2815
/*
 * 'perf trace' entry point: parse options, fill in defaults, then either
 * delegate to 'perf trace record', replay a perf.data file (-i), or run a
 * live trace session.  Returns 0 on success, negative on error.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid       = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* dump a backtrace instead of dying silently on internal crashes */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* page fault tracing needs sample addresses and timestamps */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinel means the user didn't pass -m */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* stack depth options imply callchains: default to DWARF unwinding */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* presumably to reduce sample loss with the larger callchain
		 * records; only root can mmap beyond the mlock limit */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* 'perf trace record ...' delegates to 'perf record' */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* cache the id of the open syscall; NOTE(review): presumably used by
	 * the fd/filename beautifiers elsewhere in this file -- verify */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* no workload and no target given: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}