/*
 * Source: karo-tx-linux.git — tools/perf/builtin-trace.c
 * (state as of: Merge tag 'v4.11-rc6' into perf/core, to pick up fixes)
 */
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace/beauty/beauty.h"
35 #include "trace-event.h"
36 #include "util/parse-events.h"
37 #include "util/bpf-loader.h"
38 #include "callchain.h"
39 #include "syscalltbl.h"
40 #include "rb_resort.h"
41
42 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
43 #include <stdlib.h>
44 #include <string.h>
45 #include <linux/err.h>
46 #include <linux/filter.h>
47 #include <linux/audit.h>
48 #include <linux/random.h>
49 #include <linux/stringify.h>
50 #include <linux/time64.h>
51
52 #ifndef O_CLOEXEC
53 # define O_CLOEXEC              02000000
54 #endif
55
/*
 * All state for one 'perf trace' session: tool callbacks, the syscall
 * table, the evsel list, per-run counters and every knob that can be
 * set from the command line (filters, stack limits, output flags, ...).
 */
struct trace {
        struct perf_tool        tool;
        struct syscalltbl       *sctbl;         /* syscall id <-> name mapping */
        struct {
                int             max;            /* highest syscall id in 'table' */
                struct syscall  *table;         /* indexed by syscall id */
                struct {
                        struct perf_evsel *sys_enter,   /* sys_enter tracepoint evsel */
                                          *sys_exit;    /* sys_exit tracepoint evsel */
                }               events;
        } syscalls;
        struct record_opts      opts;
        struct perf_evlist      *evlist;
        struct machine          *host;
        struct thread           *current;       /* thread with a syscall in flight */
        u64                     base_time;      /* reference timestamp for relative output */
        FILE                    *output;
        unsigned long           nr_events;
        struct strlist          *ev_qualifier;  /* syscall names from -e */
        struct {
                size_t          nr;
                int             *entries;       /* syscall ids resolved from ev_qualifier */
        }                       ev_qualifier_ids;
        struct {
                size_t          nr;
                pid_t           *entries;       /* pids excluded from tracing */
        }                       filter_pids;
        double                  duration_filter;
        double                  runtime_ms;
        struct {
                u64             vfs_getname,    /* tool self-stats, see show_tool_stats */
                                proc_getname;
        } stats;
        unsigned int            max_stack;
        unsigned int            min_stack;
        bool                    not_ev_qualifier;       /* ev_qualifier is an exclude list */
        bool                    live;
        bool                    full_time;
        bool                    sched;
        bool                    multiple_threads;
        bool                    summary;
        bool                    summary_only;
        bool                    show_comm;
        bool                    show_tool_stats;
        bool                    trace_syscalls;
        bool                    kernel_syscallchains;
        bool                    force;
        bool                    vfs_getname;
        int                     trace_pgfaults;
        int                     open_id;        /* cached id of the "open" syscall */
};
107
/*
 * Accessor for one tracepoint field: 'offset' locates it inside the raw
 * sample payload; exactly one of the union members decodes it, either as
 * an integer (possibly byte-swapped) or as a pointer into the payload.
 */
struct tp_field {
        int offset;
        union {
                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
        };
};
115
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned integer
 * of the given width from the raw sample data at the field's offset.
 * memcpy avoids unaligned-access problems on strict-alignment arches.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
128
/*
 * Same as TP_UINT_FIELD() but for samples recorded with the opposite
 * byte order: the value is bswapped after being copied out.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
140
141 static int tp_field__init_uint(struct tp_field *field,
142                                struct format_field *format_field,
143                                bool needs_swap)
144 {
145         field->offset = format_field->offset;
146
147         switch (format_field->size) {
148         case 1:
149                 field->integer = tp_field__u8;
150                 break;
151         case 2:
152                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153                 break;
154         case 4:
155                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156                 break;
157         case 8:
158                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159                 break;
160         default:
161                 return -1;
162         }
163
164         return 0;
165 }
166
/* Return a pointer to the field's bytes, in place inside the raw sample. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
        return sample->raw_data + field->offset;
}
171
/* Bind the in-place pointer reader to 'field'.  Always succeeds (returns 0). */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
        field->offset = format_field->offset;
        field->pointer = tp_field__ptr;
        return 0;
}
178
/*
 * Per-evsel accessors for the raw_syscalls tracepoints: 'id' is common
 * to both; the union holds 'args' for sys_enter and 'ret' for sys_exit
 * (each evsel only ever uses one of them).
 */
struct syscall_tp {
        struct tp_field id;
        union {
                struct tp_field args, ret;
        };
};
185
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187                                           struct tp_field *field,
188                                           const char *name)
189 {
190         struct format_field *format_field = perf_evsel__field(evsel, name);
191
192         if (format_field == NULL)
193                 return -1;
194
195         return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197
/* Init the syscall_tp member 'name' (e.g. id) as an integer accessor. */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201
202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
203                                          struct tp_field *field,
204                                          const char *name)
205 {
206         struct format_field *format_field = perf_evsel__field(evsel, name);
207
208         if (format_field == NULL)
209                 return -1;
210
211         return tp_field__init_ptr(field, format_field);
212 }
213
/* Init the syscall_tp member 'name' (e.g. args) as a pointer accessor. */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217
/* Free the evsel's private syscall_tp state, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
        zfree(&evsel->priv);
        perf_evsel__delete(evsel);
}
223
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226         evsel->priv = malloc(sizeof(struct syscall_tp));
227         if (evsel->priv != NULL) {
228                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229                         goto out_delete;
230
231                 evsel->handler = handler;
232                 return 0;
233         }
234
235         return -ENOMEM;
236
237 out_delete:
238         zfree(&evsel->priv);
239         return -ENOENT;
240 }
241
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
243 {
244         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
245
246         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
247         if (IS_ERR(evsel))
248                 evsel = perf_evsel__newtp("syscalls", direction);
249
250         if (IS_ERR(evsel))
251                 return NULL;
252
253         if (perf_evsel__init_syscall_tp(evsel, handler))
254                 goto out_delete;
255
256         return evsel;
257
258 out_delete:
259         perf_evsel__delete_priv(evsel);
260         return NULL;
261 }
262
/* Decode syscall_tp member 'name' from 'sample' as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.integer(&fields->name, sample); })

/* Decode syscall_tp member 'name' from 'sample' as a raw pointer. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })
270
/*
 * A table mapping small integers to symbolic names: value V is printed
 * as entries[V - offset].  'offset' lets tables start at a nonzero
 * first value (e.g. the tioctls table starting at TCGETS).
 */
struct strarray {
        int         offset;
        int         nr_entries;
        const char **entries;
};

/* Define strarray__<array> over an existing string table, first value 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}

/* Same, but the table's first entry corresponds to value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
        .offset     = off, \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}
287
288 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
289                                                 const char *intfmt,
290                                                 struct syscall_arg *arg)
291 {
292         struct strarray *sa = arg->parm;
293         int idx = arg->val - sa->offset;
294
295         if (idx < 0 || idx >= sa->nr_entries)
296                 return scnprintf(bf, size, intfmt, arg->val);
297
298         return scnprintf(bf, size, "%s", sa->entries[idx]);
299 }
300
/* strarray beautifier with decimal fallback for out-of-table values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
308
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
/* strarray beautifier with hex fallback; used for ioctl cmd values. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
                                                 struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
322
/* Forward declaration: the fd beautifier is defined later in this file. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
                                        struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

#ifndef AT_FDCWD
#define AT_FDCWD        -100
#endif
331
332 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
333                                            struct syscall_arg *arg)
334 {
335         int fd = arg->val;
336
337         if (fd == AT_FDCWD)
338                 return scnprintf(bf, size, "CWD");
339
340         return syscall_arg__scnprintf_fd(bf, size, arg);
341 }
342
343 #define SCA_FDAT syscall_arg__scnprintf_fd_at
344
/* Forward declaration: close() gets its own fd beautifier (defined later). */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

/* Print a syscall argument as a hexadecimal value (addresses, raw flags). */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
                                         struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
357
358 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
359                                          struct syscall_arg *arg)
360 {
361         return scnprintf(bf, size, "%d", arg->val);
362 }
363
364 #define SCA_INT syscall_arg__scnprintf_int
365
/*
 * String tables consumed by the SCA_STRARRAY beautifier: for a syscall
 * argument value V, entries[V - offset] is the symbolic name printed.
 */
static const char *bpf_cmd[] = {
        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
        "MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_ADD starts at 1, hence the offset */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* SEEK_DATA/SEEK_HOLE only exist on newer systems, hence the #ifdefs */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
        "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
        "F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *rlimit_resources[] = {
        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
        "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
        "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
        "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
431
432 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
433                                                  struct syscall_arg *arg)
434 {
435         size_t printed = 0;
436         int mode = arg->val;
437
438         if (mode == F_OK) /* 0 */
439                 return scnprintf(bf, size, "F");
440 #define P_MODE(n) \
441         if (mode & n##_OK) { \
442                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
443                 mode &= ~n##_OK; \
444         }
445
446         P_MODE(R);
447         P_MODE(W);
448         P_MODE(X);
449 #undef P_MODE
450
451         if (mode)
452                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
453
454         return printed;
455 }
456
457 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
458
/* Forward declaration: the filename beautifier is defined later in this file. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
463
464 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
465                                                 struct syscall_arg *arg)
466 {
467         int printed = 0, flags = arg->val;
468
469 #define P_FLAG(n) \
470         if (flags & O_##n) { \
471                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
472                 flags &= ~O_##n; \
473         }
474
475         P_FLAG(CLOEXEC);
476         P_FLAG(NONBLOCK);
477 #undef P_FLAG
478
479         if (flags)
480                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
481
482         return printed;
483 }
484
485 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
486
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
/* First tty ioctl number; the strarray offset below starts the table here. */
#define TCGETS          0x5401

/*
 * tty ioctl cmd -> name table.  Designated initializers ([0x27], [0x50],
 * [0x60]) skip the gaps in the ioctl number space.
 */
static const char *tioctls[] = {
        "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
        "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
        "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
        "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
        "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
        "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
        "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
        "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
        "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
        "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
        "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
        [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
        "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
        "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
        "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

/* getrandom(2) flags: provide the values when libc headers lack them. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK   0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM     0x0002
#endif
520
521 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
522                                                    struct syscall_arg *arg)
523 {
524         int printed = 0, flags = arg->val;
525
526 #define P_FLAG(n) \
527         if (flags & GRND_##n) { \
528                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
529                 flags &= ~GRND_##n; \
530         }
531
532         P_FLAG(RANDOM);
533         P_FLAG(NONBLOCK);
534 #undef P_FLAG
535
536         if (flags)
537                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
538
539         return printed;
540 }
541
542 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543
/*
 * Shorthand for syscall_fmt entries: hook argument index 'arg' up to the
 * strarray beautifier with strarray__<array> as its lookup table.
 */
#define STRARRAY(arg, name, array) \
          .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
          .arg_parm      = { [arg] = &strarray__##array, }
547
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562
563 static struct syscall_fmt {
564         const char *name;
565         const char *alias;
566         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
567         void       *arg_parm[6];
568         bool       errmsg;
569         bool       errpid;
570         bool       timeout;
571         bool       hexret;
572 } syscall_fmts[] = {
573         { .name     = "access",     .errmsg = true,
574           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
575         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
576         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
577         { .name     = "brk",        .hexret = true,
578           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
579         { .name     = "chdir",      .errmsg = true, },
580         { .name     = "chmod",      .errmsg = true, },
581         { .name     = "chroot",     .errmsg = true, },
582         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
583         { .name     = "clone",      .errpid = true, },
584         { .name     = "close",      .errmsg = true,
585           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
586         { .name     = "connect",    .errmsg = true, },
587         { .name     = "creat",      .errmsg = true, },
588         { .name     = "dup",        .errmsg = true, },
589         { .name     = "dup2",       .errmsg = true, },
590         { .name     = "dup3",       .errmsg = true, },
591         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
592         { .name     = "eventfd2",   .errmsg = true,
593           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
594         { .name     = "faccessat",  .errmsg = true, },
595         { .name     = "fadvise64",  .errmsg = true, },
596         { .name     = "fallocate",  .errmsg = true, },
597         { .name     = "fchdir",     .errmsg = true, },
598         { .name     = "fchmod",     .errmsg = true, },
599         { .name     = "fchmodat",   .errmsg = true,
600           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
601         { .name     = "fchown",     .errmsg = true, },
602         { .name     = "fchownat",   .errmsg = true,
603           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
604         { .name     = "fcntl",      .errmsg = true,
605           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
606           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
607         { .name     = "fdatasync",  .errmsg = true, },
608         { .name     = "flock",      .errmsg = true,
609           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
610         { .name     = "fsetxattr",  .errmsg = true, },
611         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
612         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
613         { .name     = "fstatfs",    .errmsg = true, },
614         { .name     = "fsync",    .errmsg = true, },
615         { .name     = "ftruncate", .errmsg = true, },
616         { .name     = "futex",      .errmsg = true,
617           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
618         { .name     = "futimesat", .errmsg = true,
619           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
620         { .name     = "getdents",   .errmsg = true, },
621         { .name     = "getdents64", .errmsg = true, },
622         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
623         { .name     = "getpid",     .errpid = true, },
624         { .name     = "getpgid",    .errpid = true, },
625         { .name     = "getppid",    .errpid = true, },
626         { .name     = "getrandom",  .errmsg = true,
627           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
628         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
629         { .name     = "getxattr",   .errmsg = true, },
630         { .name     = "inotify_add_watch",          .errmsg = true, },
631         { .name     = "ioctl",      .errmsg = true,
632           .arg_scnprintf = {
633 #if defined(__i386__) || defined(__x86_64__)
634 /*
635  * FIXME: Make this available to all arches.
636  */
637                              [1] = SCA_STRHEXARRAY, /* cmd */
638                              [2] = SCA_HEX, /* arg */ },
639           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
640 #else
641                              [2] = SCA_HEX, /* arg */ }, },
642 #endif
643         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
644         { .name     = "kill",       .errmsg = true,
645           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
646         { .name     = "lchown",    .errmsg = true, },
647         { .name     = "lgetxattr",  .errmsg = true, },
648         { .name     = "linkat",     .errmsg = true,
649           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
650         { .name     = "listxattr",  .errmsg = true, },
651         { .name     = "llistxattr", .errmsg = true, },
652         { .name     = "lremovexattr",  .errmsg = true, },
653         { .name     = "lseek",      .errmsg = true,
654           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
655           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
656         { .name     = "lsetxattr",  .errmsg = true, },
657         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
658         { .name     = "lsxattr",    .errmsg = true, },
659         { .name     = "madvise",    .errmsg = true,
660           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
661                              [2] = SCA_MADV_BHV, /* behavior */ }, },
662         { .name     = "mkdir",    .errmsg = true, },
663         { .name     = "mkdirat",    .errmsg = true,
664           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
665         { .name     = "mknod",      .errmsg = true, },
666         { .name     = "mknodat",    .errmsg = true,
667           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
668         { .name     = "mlock",      .errmsg = true,
669           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
670         { .name     = "mlockall",   .errmsg = true,
671           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
672         { .name     = "mmap",       .hexret = true,
673           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
674                              [2] = SCA_MMAP_PROT, /* prot */
675                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
676         { .name     = "mprotect",   .errmsg = true,
677           .arg_scnprintf = { [0] = SCA_HEX, /* start */
678                              [2] = SCA_MMAP_PROT, /* prot */ }, },
679         { .name     = "mq_unlink", .errmsg = true,
680           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
681         { .name     = "mremap",     .hexret = true,
682           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
683                              [3] = SCA_MREMAP_FLAGS, /* flags */
684                              [4] = SCA_HEX, /* new_addr */ }, },
685         { .name     = "munlock",    .errmsg = true,
686           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
687         { .name     = "munmap",     .errmsg = true,
688           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
689         { .name     = "name_to_handle_at", .errmsg = true,
690           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
691         { .name     = "newfstatat", .errmsg = true,
692           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
693         { .name     = "open",       .errmsg = true,
694           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
695         { .name     = "open_by_handle_at", .errmsg = true,
696           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
697                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
698         { .name     = "openat",     .errmsg = true,
699           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
700                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
701         { .name     = "perf_event_open", .errmsg = true,
702           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
703                              [3] = SCA_FD,  /* group_fd */
704                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
705         { .name     = "pipe2",      .errmsg = true,
706           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
707         { .name     = "poll",       .errmsg = true, .timeout = true, },
708         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
709         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
710         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
711         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
712         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
713         { .name     = "pwritev",    .errmsg = true, },
714         { .name     = "read",       .errmsg = true, },
715         { .name     = "readlink",   .errmsg = true, },
716         { .name     = "readlinkat", .errmsg = true,
717           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
718         { .name     = "readv",      .errmsg = true, },
719         { .name     = "recvfrom",   .errmsg = true,
720           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
721         { .name     = "recvmmsg",   .errmsg = true,
722           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
723         { .name     = "recvmsg",    .errmsg = true,
724           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
725         { .name     = "removexattr", .errmsg = true, },
726         { .name     = "renameat",   .errmsg = true,
727           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
728         { .name     = "rmdir",    .errmsg = true, },
729         { .name     = "rt_sigaction", .errmsg = true,
730           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
731         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
732         { .name     = "rt_sigqueueinfo", .errmsg = true,
733           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
734         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
735           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
736         { .name     = "sched_getattr",        .errmsg = true, },
737         { .name     = "sched_setattr",        .errmsg = true, },
738         { .name     = "sched_setscheduler",   .errmsg = true,
739           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
740         { .name     = "seccomp", .errmsg = true,
741           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
742                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
743         { .name     = "select",     .errmsg = true, .timeout = true, },
744         { .name     = "sendmmsg",    .errmsg = true,
745           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
746         { .name     = "sendmsg",    .errmsg = true,
747           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
748         { .name     = "sendto",     .errmsg = true,
749           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
750         { .name     = "set_tid_address", .errpid = true, },
751         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
752         { .name     = "setpgid",    .errmsg = true, },
753         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
754         { .name     = "setxattr",   .errmsg = true, },
755         { .name     = "shutdown",   .errmsg = true, },
756         { .name     = "socket",     .errmsg = true,
757           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
758                              [1] = SCA_SK_TYPE, /* type */ },
759           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
760         { .name     = "socketpair", .errmsg = true,
761           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
762                              [1] = SCA_SK_TYPE, /* type */ },
763           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
764         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
765         { .name     = "statfs",     .errmsg = true, },
766         { .name     = "statx",      .errmsg = true,
767           .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
768                              [2] = SCA_STATX_FLAGS, /* flags */
769                              [3] = SCA_STATX_MASK, /* mask */ }, },
770         { .name     = "swapoff",    .errmsg = true,
771           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
772         { .name     = "swapon",     .errmsg = true,
773           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
774         { .name     = "symlinkat",  .errmsg = true,
775           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
776         { .name     = "tgkill",     .errmsg = true,
777           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
778         { .name     = "tkill",      .errmsg = true,
779           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
780         { .name     = "truncate",   .errmsg = true, },
781         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
782         { .name     = "unlinkat",   .errmsg = true,
783           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
784         { .name     = "utime",  .errmsg = true, },
785         { .name     = "utimensat",  .errmsg = true,
786           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
787         { .name     = "utimes",  .errmsg = true, },
788         { .name     = "vmsplice",  .errmsg = true, },
789         { .name     = "wait4",      .errpid = true,
790           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
791         { .name     = "waitid",     .errpid = true,
792           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
793         { .name     = "write",      .errmsg = true, },
794         { .name     = "writev",     .errmsg = true, },
795 };
796
797 static int syscall_fmt__cmp(const void *name, const void *fmtp)
798 {
799         const struct syscall_fmt *fmt = fmtp;
800         return strcmp(name, fmt->name);
801 }
802
803 static struct syscall_fmt *syscall_fmt__find(const char *name)
804 {
805         const int nmemb = ARRAY_SIZE(syscall_fmts);
806         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
807 }
808
/* Per-syscall state built lazily by trace__read_syscall_info(). */
struct syscall {
        struct event_format *tp_format;  /* parsed sys_enter_<name> tracepoint format */
        int                 nr_args;     /* arg count, after dropping __syscall_nr/nr */
        struct format_field *args;       /* first real arg field (syscall nr skipped) */
        const char          *name;       /* canonical name from the syscall table */
        bool                is_exit;     /* exit/exit_group: no matching sys_exit event */
        struct syscall_fmt  *fmt;        /* optional per-syscall formatting overrides */
        /* per-arg pretty printer, indexed by arg position; NULL slot -> plain "%ld" */
        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        void                **arg_parm;  /* per-arg opaque parameter handed to the printer */
};
819
820 /*
821  * We need to have this 'calculated' boolean because in some cases we really
822  * don't know what is the duration of a syscall, for instance, when we start
823  * a session and some threads are waiting for a syscall to finish, say 'poll',
824  * in which case all we can do is to print "( ? ) for duration and for the
825  * start timestamp.
826  */
827 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
828 {
829         double duration = (double)t / NSEC_PER_MSEC;
830         size_t printed = fprintf(fp, "(");
831
832         if (!calculated)
833                 printed += fprintf(fp, "     ?   ");
834         else if (duration >= 1.0)
835                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
836         else if (duration >= 0.01)
837                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
838         else
839                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
840         return printed + fprintf(fp, "): ");
841 }
842
843 /**
844  * filename.ptr: The filename char pointer that will be vfs_getname'd
845  * filename.entry_str_pos: Where to insert the string translated from
846  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
847  */
/* Per-thread tracing state, hung off thread->priv (see thread__trace()). */
struct thread_trace {
        u64               entry_time;    /* sample->time of the last sys_enter */
        bool              entry_pending; /* entry formatted but its exit not seen yet */
        unsigned long     nr_events;     /* events handled for this thread */
        unsigned long     pfmaj, pfmin;  /* major/minor page faults — presumably; confirm at update site */
        char              *entry_str;    /* formatted sys_enter line, trace__entry_str_size bytes */
        double            runtime_ms;    /* accumulated runtime — TODO confirm units at update site */
        struct {
                unsigned long ptr;           /* filename pointer arg awaiting vfs_getname */
                short int     entry_str_pos; /* where in entry_str to splice the resolved name */
                bool          pending_open;
                unsigned int  namelen;
                char          *name;         /* last resolved filename */
        } filename;
        struct {
                int       max;    /* highest fd with a slot; -1 = table not allocated */
                char      **table;/* fd -> strdup'ed path, grown on demand */
        } paths;

        struct intlist *syscall_stats;   /* syscall id -> struct stats (thread__update_stats) */
};
869
870 static struct thread_trace *thread_trace__new(void)
871 {
872         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
873
874         if (ttrace)
875                 ttrace->paths.max = -1;
876
877         ttrace->syscall_stats = intlist__new(NULL);
878
879         return ttrace;
880 }
881
882 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
883 {
884         struct thread_trace *ttrace;
885
886         if (thread == NULL)
887                 goto fail;
888
889         if (thread__priv(thread) == NULL)
890                 thread__set_priv(thread, thread_trace__new());
891
892         if (thread__priv(thread) == NULL)
893                 goto fail;
894
895         ttrace = thread__priv(thread);
896         ++ttrace->nr_events;
897
898         return ttrace;
899 fail:
900         color_fprintf(fp, PERF_COLOR_RED,
901                       "WARNING: not enough memory, dropping samples!\n");
902         return NULL;
903 }
904
905 #define TRACE_PFMAJ             (1 << 0)
906 #define TRACE_PFMIN             (1 << 1)
907
908 static const size_t trace__entry_str_size = 2048;
909
/*
 * Cache 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table (and zero-filling the newly exposed slots) when 'fd' is beyond the
 * current maximum.  Returns 0 on success, -1 on allocation failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
        struct thread_trace *ttrace = thread__priv(thread);

        if (fd > ttrace->paths.max) {
                char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

                if (npath == NULL)
                        return -1;

                if (ttrace->paths.max != -1) {
                        /* Zero only the slots added past the old maximum. */
                        memset(npath + ttrace->paths.max + 1, 0,
                               (fd - ttrace->paths.max) * sizeof(char *));
                } else {
                        /* First allocation: zero the whole table. */
                        memset(npath, 0, (fd + 1) * sizeof(char *));
                }

                ttrace->paths.table = npath;
                ttrace->paths.max   = fd;
        }

        /* Replaces any previous entry; old pointer is freed on close (see close_fd). */
        ttrace->paths.table[fd] = strdup(pathname);

        return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
935
/*
 * Resolve 'fd' to a filesystem path by reading the /proc/<pid>/fd/<fd>
 * symlink (or the per-task variant when tid != pid) and cache the result in
 * the thread's fd table.  Returns 0 on success, -1 on failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
        char linkname[PATH_MAX], pathname[PATH_MAX];
        struct stat st;
        int ret;

        if (thread->pid_ == thread->tid) {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/fd/%d", thread->pid_, fd);
        } else {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
        }

        /* For a symlink, st_size is the target length (per lstat(2)) — used to bound the buffer. */
        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
                return -1;

        ret = readlink(linkname, pathname, sizeof(pathname));

        /* The link target may have changed between lstat() and readlink(). */
        if (ret < 0 || ret > st.st_size)
                return -1;

        /* readlink() does not NUL-terminate. */
        pathname[ret] = '\0';
        return trace__set_fd_pathname(thread, fd, pathname);
}
961
962 static const char *thread__fd_path(struct thread *thread, int fd,
963                                    struct trace *trace)
964 {
965         struct thread_trace *ttrace = thread__priv(thread);
966
967         if (ttrace == NULL)
968                 return NULL;
969
970         if (fd < 0)
971                 return NULL;
972
973         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
974                 if (!trace->live)
975                         return NULL;
976                 ++trace->stats.proc_getname;
977                 if (thread__read_fd_path(thread, fd))
978                         return NULL;
979         }
980
981         return ttrace->paths.table[fd];
982 }
983
984 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
985                                         struct syscall_arg *arg)
986 {
987         int fd = arg->val;
988         size_t printed = scnprintf(bf, size, "%d", fd);
989         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
990
991         if (path)
992                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
993
994         return printed;
995 }
996
997 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
998                                               struct syscall_arg *arg)
999 {
1000         int fd = arg->val;
1001         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1002         struct thread_trace *ttrace = thread__priv(arg->thread);
1003
1004         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1005                 zfree(&ttrace->paths.table[fd]);
1006
1007         return printed;
1008 }
1009
/*
 * Remember which filename pointer argument is pending vfs_getname resolution
 * and where in the formatted entry string its translation should be spliced.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
                                     unsigned long ptr)
{
        struct thread_trace *ttrace = thread__priv(thread);

        ttrace->filename.ptr = ptr;
        /* bf points into entry_str; record the offset for later insertion. */
        ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1018
1019 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1020                                               struct syscall_arg *arg)
1021 {
1022         unsigned long ptr = arg->val;
1023
1024         if (!arg->trace->vfs_getname)
1025                 return scnprintf(bf, size, "%#x", ptr);
1026
1027         thread__set_filename_pos(arg->thread, bf, ptr);
1028         return 0;
1029 }
1030
1031 static bool trace__filter_duration(struct trace *trace, double t)
1032 {
1033         return t < (trace->duration_filter * NSEC_PER_MSEC);
1034 }
1035
1036 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1037 {
1038         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1039
1040         return fprintf(fp, "%10.3f ", ts);
1041 }
1042
1043 /*
1044  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1045  * using ttrace->entry_time for a thread that receives a sys_exit without
1046  * first having received a sys_enter ("poll" issued before tracing session
1047  * starts, lost sys_enter exit due to ring buffer overflow).
1048  */
1049 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1050 {
1051         if (tstamp > 0)
1052                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1053
1054         return fprintf(fp, "         ? ");
1055 }
1056
/*
 * Flags shared between the signal handler and the main event loop.
 *
 * Fix: declared 'volatile' so the compiler cannot cache them in a register
 * across the polling loop.  (Strictly, 'volatile sig_atomic_t' is the
 * portable type for signal-handler flags; 'bool' stores are single-byte
 * and effectively atomic on the platforms perf supports — keeping 'bool'
 * preserves the existing users' types.)
 */
static volatile bool done = false;
static volatile bool interrupted = false;

static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;   /* distinguish Ctrl-C from e.g. SIGTERM/SIGCHLD */
}
1065
/*
 * Print the common prefix of a trace line: timestamp, "(duration): " and —
 * when more than one thread is being traced — the comm/tid of the thread.
 * Returns the number of characters printed.
 */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
                                        u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
        size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
        printed += fprintf_duration(duration, duration_calculated, fp);

        if (trace->multiple_threads) {
                if (trace->show_comm)
                        printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
                printed += fprintf(fp, "%d ", thread->tid);
        }

        return printed;
}
1080
1081 static int trace__process_event(struct trace *trace, struct machine *machine,
1082                                 union perf_event *event, struct perf_sample *sample)
1083 {
1084         int ret = 0;
1085
1086         switch (event->header.type) {
1087         case PERF_RECORD_LOST:
1088                 color_fprintf(trace->output, PERF_COLOR_RED,
1089                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1090                 ret = machine__process_lost_event(machine, event, sample);
1091                 break;
1092         default:
1093                 ret = machine__process_event(machine, event, sample);
1094                 break;
1095         }
1096
1097         return ret;
1098 }
1099
1100 static int trace__tool_process(struct perf_tool *tool,
1101                                union perf_event *event,
1102                                struct perf_sample *sample,
1103                                struct machine *machine)
1104 {
1105         struct trace *trace = container_of(tool, struct trace, tool);
1106         return trace__process_event(trace, machine, event, sample);
1107 }
1108
/*
 * Kernel-address resolver wrapper: refuses to resolve (and warns once) when
 * kptr_restrict hides kernel addresses, otherwise delegates to the machine
 * layer.  Returns NULL when resolution is not possible.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
        struct machine *machine = vmachine;

        /* Warn only once per machine. */
        if (machine->kptr_restrict_warned)
                return NULL;

        if (symbol_conf.kptr_restrict) {
                pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
                           "Check /proc/sys/kernel/kptr_restrict.\n\n"
                           "Kernel samples will not be resolved.\n");
                machine->kptr_restrict_warned = true;
                return NULL;
        }

        return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1126
/*
 * Initialize symbol resolution for the live host and synthesize events for
 * already-running threads so their maps/comms are known before tracing
 * starts.  Returns 0 on success or a negative errno-style value.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
        int err = symbol__init(NULL);

        if (err)
                return err;

        trace->host = machine__new_host();
        if (trace->host == NULL)
                return -ENOMEM;

        /* NOTE(review): assumes errno was set by the failing resolver registration — confirm */
        if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
                return -errno;

        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
                                            evlist->threads, trace__tool_process, false,
                                            trace->opts.proc_map_timeout);
        if (err)
                symbol__exit();

        return err;
}
1149
/*
 * Build the per-argument pretty-printer table for a syscall: explicit
 * entries from syscall_fmts[] win, then heuristics based on the field's
 * type/name in the tracepoint format (filenames, pointers, pids, modes,
 * fds).  Slots left NULL fall back to plain "%ld" at print time.
 * Returns 0 on success, -1 on allocation failure.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
        struct format_field *field;
        int idx = 0, len;

        sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
        if (sc->arg_scnprintf == NULL)
                return -1;

        if (sc->fmt)
                sc->arg_parm = sc->fmt->arg_parm;

        for (field = sc->args; field; field = field->next) {
                /* An explicit formatter from the syscall_fmts[] table takes precedence. */
                if (sc->fmt && sc->fmt->arg_scnprintf[idx])
                        sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
                else if (strcmp(field->type, "const char *") == 0 &&
                         (strcmp(field->name, "filename") == 0 ||
                          strcmp(field->name, "path") == 0 ||
                          strcmp(field->name, "pathname") == 0))
                        sc->arg_scnprintf[idx] = SCA_FILENAME;
                else if (field->flags & FIELD_IS_POINTER)
                        sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_MODE_T;
                else if ((strcmp(field->type, "int") == 0 ||
                          strcmp(field->type, "unsigned int") == 0 ||
                          strcmp(field->type, "long") == 0) &&
                         (len = strlen(field->name)) >= 2 &&
                         strcmp(field->name + len - 2, "fd") == 0) {
                        /*
                         * Heuristic: integer fields whose name ends in "fd" are
                         * file descriptors.  Survey of the tracepoint formats:
                         *
                         * /sys/kernel/tracing/events/syscalls/sys_enter*
                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
                         * 65 int
                         * 23 unsigned int
                         * 7 unsigned long
                         */
                        sc->arg_scnprintf[idx] = SCA_FD;
                }
                ++idx;
        }

        return 0;
}
1195
/*
 * Lazily populate trace->syscalls.table[id]: grow the table if needed, look
 * up the tracepoint format for sys_enter_<name> (falling back to the
 * syscall_fmts[] alias, e.g. "stat" -> "newstat"), skip the leading syscall
 * number field, and set up the per-arg formatters.
 * Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
        char tp_name[128];
        struct syscall *sc;
        const char *name = syscalltbl__name(trace->sctbl, id);

        if (name == NULL)
                return -1;

        if (id > trace->syscalls.max) {
                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

                if (nsyscalls == NULL)
                        return -1;

                if (trace->syscalls.max != -1) {
                        /* Zero only the newly added entries past the old max. */
                        memset(nsyscalls + trace->syscalls.max + 1, 0,
                               (id - trace->syscalls.max) * sizeof(*sc));
                } else {
                        /* First allocation: zero the whole table. */
                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
                }

                trace->syscalls.table = nsyscalls;
                trace->syscalls.max   = id;
        }

        sc = trace->syscalls.table + id;
        sc->name = name;

        sc->fmt  = syscall_fmt__find(sc->name);

        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
        sc->tp_format = trace_event__tp_format("syscalls", tp_name);

        /* Some syscalls' tracepoints are named after an alias ("stat" -> "newstat"). */
        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
        }

        if (IS_ERR(sc->tp_format))
                return -1;

        sc->args = sc->tp_format->format.fields;
        sc->nr_args = sc->tp_format->format.nr_fields;
        /*
         * Discard the leading '__syscall_nr' (or 'nr') field, which carries
         * the syscall number and is redundant here.  Older kernels don't
         * have it, hence the name check rather than an unconditional skip.
         */
        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
                sc->args = sc->args->next;
                --sc->nr_args;
        }

        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

        return syscall__set_arg_fmts(sc);
}
1254
/*
 * Translate the user's -e/--expr syscall-name list (trace->ev_qualifier)
 * into an array of syscall ids.  All invalid names are reported together in
 * one error message; on any error the whole id array is discarded.
 * Returns 0 on success, -EINVAL on bad names or allocation failure.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
        int err = 0, i;
        struct str_node *pos;

        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
                                                 sizeof(trace->ev_qualifier_ids.entries[0]));

        if (trace->ev_qualifier_ids.entries == NULL) {
                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
                       trace->output);
                err = -EINVAL;
                goto out;
        }

        i = 0;

        strlist__for_each_entry(pos, trace->ev_qualifier) {
                const char *sc = pos->s;
                int id = syscalltbl__id(trace->sctbl, sc);

                if (id < 0) {
                        /* First bad name opens the error message; the rest are comma-appended. */
                        if (err == 0) {
                                fputs("Error:\tInvalid syscall ", trace->output);
                                err = -EINVAL;
                        } else {
                                fputs(", ", trace->output);
                        }

                        fputs(sc, trace->output);
                }

                /* Invalid ids (< 0) are stored too; the array is zfreed below on error. */
                trace->ev_qualifier_ids.entries[i++] = id;
        }

        if (err < 0) {
                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
                      "\nHint:\tand: 'man syscalls'\n", trace->output);
                zfree(&trace->ev_qualifier_ids.entries);
                trace->ev_qualifier_ids.nr = 0;
        }
out:
        return err;
}
1300
1301 /*
1302  * args is to be interpreted as a series of longs but we need to handle
1303  * 8-byte unaligned accesses. args points to raw_data within the event
1304  * and raw_data is guaranteed to be 8-byte unaligned because it is
1305  * preceded by raw_size which is a u32. So we need to copy args to a temp
1306  * variable to read it. Most notably this avoids extended load instructions
1307  * on unaligned addresses
1308  */
1309
/*
 * Format a syscall's arguments into 'bf' as "name: value, ...", using the
 * per-arg formatters set up by syscall__set_arg_fmts().  When the
 * tracepoint format could not be read, fall back to printing six raw longs.
 * Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                                      unsigned char *args, struct trace *trace,
                                      struct thread *thread)
{
        size_t printed = 0;
        unsigned char *p;
        unsigned long val;

        if (sc->args != NULL) {
                struct format_field *field;
                u8 bit = 1;
                struct syscall_arg arg = {
                        .idx    = 0,
                        .mask   = 0,
                        .trace  = trace,
                        .thread = thread,
                };

                for (field = sc->args; field;
                     field = field->next, ++arg.idx, bit <<= 1) {
                        /* A formatter may have consumed this arg already (arg.mask). */
                        if (arg.mask & bit)
                                continue;

                        /* special care for unaligned accesses */
                        p = args + sizeof(unsigned long) * arg.idx;
                        memcpy(&val, p, sizeof(val));

                        /*
                         * Suppress this argument if its value is zero and
                         * we don't have a string associated in an
                         * strarray for it.
                         */
                        if (val == 0 &&
                            !(sc->arg_scnprintf &&
                              sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
                              sc->arg_parm[arg.idx]))
                                continue;

                        printed += scnprintf(bf + printed, size - printed,
                                             "%s%s: ", printed ? ", " : "", field->name);
                        if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
                                arg.val = val;
                                if (sc->arg_parm)
                                        arg.parm = sc->arg_parm[arg.idx];
                                printed += sc->arg_scnprintf[arg.idx](bf + printed,
                                                                      size - printed, &arg);
                        } else {
                                /* No formatter registered: print the raw value. */
                                printed += scnprintf(bf + printed, size - printed,
                                                     "%ld", val);
                        }
                }
        } else if (IS_ERR(sc->tp_format)) {
                /*
                 * If we managed to read the tracepoint /format file, then we
                 * may end up not having any args, like with gettid(), so only
                 * print the raw args when we didn't manage to read it.
                 */
                int i = 0;

                while (i < 6) {
                        /* special care for unaligned accesses */
                        p = args + sizeof(unsigned long) * i;
                        memcpy(&val, p, sizeof(val));
                        printed += scnprintf(bf + printed, size - printed,
                                             "%sarg%d: %ld",
                                             printed ? ", " : "", i, val);
                        ++i;
                }
        }

        return printed;
}
1382
/* Signature shared by all tracepoint sample handlers (sys_enter, sys_exit, ...). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1386
/*
 * Return the (lazily initialized) syscall descriptor for 'id', or NULL when
 * the id is invalid or its tracepoint info cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
                                           struct perf_evsel *evsel, int id)
{

        if (id < 0) {

                /*
                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
                 * before that, leaving at a higher verbosity level till that is
                 * explained. Reproduced with plain ftrace with:
                 *
                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
                 * grep "NR -1 " /t/trace_pipe
                 *
                 * After generating some load on the machine.
                 */
                if (verbose > 1) {
                        static u64 n;
                        fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
                                id, perf_evsel__name(evsel), ++n);
                }
                return NULL;
        }

        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
            trace__read_syscall_info(trace, id))
                goto out_cant_read;

        /* Re-check: trace__read_syscall_info() may succeed partially. */
        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
                goto out_cant_read;

        return &trace->syscalls.table[id];

out_cant_read:
        if (verbose > 0) {
                fprintf(trace->output, "Problems reading syscall %d", id);
                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
                fputs(" information\n", trace->output);
        }
        return NULL;
}
1429
1430 static void thread__update_stats(struct thread_trace *ttrace,
1431                                  int id, struct perf_sample *sample)
1432 {
1433         struct int_node *inode;
1434         struct stats *stats;
1435         u64 duration = 0;
1436
1437         inode = intlist__findnew(ttrace->syscall_stats, id);
1438         if (inode == NULL)
1439                 return;
1440
1441         stats = inode->priv;
1442         if (stats == NULL) {
1443                 stats = malloc(sizeof(struct stats));
1444                 if (stats == NULL)
1445                         return;
1446                 init_stats(stats);
1447                 inode->priv = stats;
1448         }
1449
1450         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1451                 duration = sample->time - ttrace->entry_time;
1452
1453         update_stats(stats, duration);
1454 }
1455
/*
 * If the previously-current thread has a sys_enter printed but no sys_exit
 * seen yet (we were preempted by another thread's event), finish its line
 * with "...\n" so output stays readable.  Returns chars printed (0 if none).
 */
static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
{
        struct thread_trace *ttrace;
        u64 duration;
        size_t printed;

        if (trace->current == NULL)
                return 0;

        ttrace = thread__priv(trace->current);

        if (!ttrace->entry_pending)
                return 0;

        duration = sample->time - ttrace->entry_time;

        printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
        printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
        ttrace->entry_pending = false;

        return printed;
}
1478
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the thread's
 * entry_str.  For most syscalls printing is deferred until sys_exit so the
 * return value can complete the line; exit/exit_group (which never return)
 * are printed immediately.  Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
                            union perf_event *event __maybe_unused,
                            struct perf_sample *sample)
{
        char *msg;
        void *args;
        size_t printed = 0;
        struct thread *thread;
        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
        struct syscall *sc = trace__syscall_info(trace, evsel, id);
        struct thread_trace *ttrace;

        if (sc == NULL)
                return -1;

        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                goto out_put;

        args = perf_evsel__sc_tp_ptr(evsel, args, sample);

        /* Lazily allocate the per-thread formatting buffer. */
        if (ttrace->entry_str == NULL) {
                ttrace->entry_str = malloc(trace__entry_str_size);
                if (!ttrace->entry_str)
                        goto out_put;
        }

        /* Finish any half-printed line from the previously-current thread. */
        if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
                trace__printf_interrupted_entry(trace, sample);

        ttrace->entry_time = sample->time;
        msg = ttrace->entry_str;
        printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

        printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
                                           args, trace, thread);

        if (sc->is_exit) {
                /* exit/exit_group never return: print the whole line now. */
                if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
                        trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
                        fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
                }
        } else {
                ttrace->entry_pending = true;
                /* See trace__vfs_getname & trace__sys_exit */
                ttrace->filename.pending_open = false;
        }

        /* Track the current thread so interleaved output can be detected. */
        if (trace->current != thread) {
                thread__put(trace->current);
                trace->current = thread__get(thread);
        }
        err = 0;
out_put:
        thread__put(thread);
        return err;
}
1537
1538 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1539                                     struct perf_sample *sample,
1540                                     struct callchain_cursor *cursor)
1541 {
1542         struct addr_location al;
1543
1544         if (machine__resolve(trace->host, &al, sample) < 0 ||
1545             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1546                 return -1;
1547
1548         return 0;
1549 }
1550
1551 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1552 {
1553         /* TODO: user-configurable print_opts */
1554         const unsigned int print_opts = EVSEL__PRINT_SYM |
1555                                         EVSEL__PRINT_DSO |
1556                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1557
1558         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1559 }
1560
/*
 * raw_syscalls:sys_exit handler: pairs up with the entry recorded by
 * trace__sys_enter in the per-thread 'ttrace' state, computes the
 * syscall duration and pretty prints "name(args) = retval".
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
                           union perf_event *event __maybe_unused,
                           struct perf_sample *sample)
{
        long ret;
        u64 duration = 0;
        bool duration_calculated = false;
        struct thread *thread;
        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
        struct syscall *sc = trace__syscall_info(trace, evsel, id);
        struct thread_trace *ttrace;

        if (sc == NULL)
                return -1;

        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                goto out_put;

        if (trace->summary)
                thread__update_stats(ttrace, id, sample);

        ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

        /*
         * Successful open with a pathname captured by the vfs_getname
         * probe: associate that pathname with the returned fd (see
         * trace__vfs_getname, which sets pending_open).
         */
        if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
                trace__set_fd_pathname(thread, ret, ttrace->filename.name);
                ttrace->filename.pending_open = false;
                ++trace->stats.vfs_getname;
        }

        if (ttrace->entry_time) {
                duration = sample->time - ttrace->entry_time;
                if (trace__filter_duration(trace, duration))
                        goto out;
                duration_calculated = true;
        } else if (trace->duration_filter)
                goto out; /* no matching entry, can't apply the duration filter */

        if (sample->callchain) {
                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
                if (callchain_ret == 0) {
                        /* Too shallow for --min-stack: skip printing this one */
                        if (callchain_cursor.nr < trace->min_stack)
                                goto out;
                        callchain_ret = 1;
                }
        }

        if (trace->summary_only)
                goto out;

        trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

        if (ttrace->entry_pending) {
                fprintf(trace->output, "%-70s", ttrace->entry_str);
        } else {
                /* entry line was already emitted (interrupted syscall) */
                fprintf(trace->output, " ... [");
                color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
                fprintf(trace->output, "]: %s()", sc->name);
        }

        /* Format the return value according to the syscall's fmt, if any */
        if (sc->fmt == NULL) {
signed_print:
                fprintf(trace->output, ") = %ld", ret);
        } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
                char bf[STRERR_BUFSIZE];
                const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
                           *e = audit_errno_to_name(-ret);

                fprintf(trace->output, ") = -1 %s %s", e, emsg);
        } else if (ret == 0 && sc->fmt->timeout)
                fprintf(trace->output, ") = 0 Timeout");
        else if (sc->fmt->hexret)
                fprintf(trace->output, ") = %#lx", ret);
        else if (sc->fmt->errpid) {
                /* return value is a pid: try to show the child's comm too */
                struct thread *child = machine__find_thread(trace->host, ret, ret);

                if (child != NULL) {
                        fprintf(trace->output, ") = %ld", ret);
                        if (child->comm_set)
                                fprintf(trace->output, " (%s)", thread__comm_str(child));
                        thread__put(child);
                }
        } else
                goto signed_print;

        fputc('\n', trace->output);

        if (callchain_ret > 0)
                trace__fprintf_callchain(trace, sample);
        else if (callchain_ret < 0)
                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
        ttrace->entry_pending = false;
        err = 0;
out_put:
        thread__put(thread);
        return err;
}
1660
/*
 * probe:vfs_getname handler: stash the pathname so that the matching
 * sys_exit can associate it with the fd returned by open (see
 * trace__sys_exit), and splice it into the already-formatted entry
 * string in place of the raw pointer argument.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
                              union perf_event *event __maybe_unused,
                              struct perf_sample *sample)
{
        struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        struct thread_trace *ttrace;
        size_t filename_len, entry_str_len, to_move;
        ssize_t remaining_space;
        char *pos;
        const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

        if (!thread)
                goto out;

        ttrace = thread__priv(thread);
        if (!ttrace)
                goto out_put;

        filename_len = strlen(filename);
        if (filename_len == 0)
                goto out_put;

        /* Grow the per-thread copy of the filename if it doesn't fit */
        if (ttrace->filename.namelen < filename_len) {
                char *f = realloc(ttrace->filename.name, filename_len + 1);

                if (f == NULL)
                        goto out_put;

                ttrace->filename.namelen = filename_len;
                ttrace->filename.name = f;
        }

        strcpy(ttrace->filename.name, filename);
        ttrace->filename.pending_open = true;

        /* No pointer argument recorded in the entry string: nothing to splice */
        if (!ttrace->filename.ptr)
                goto out_put;

        /* NOTE(review): assumes entry_str was allocated by trace__sys_enter
         * whenever filename.ptr is set - verify against that path. */
        entry_str_len = strlen(ttrace->entry_str);
        remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
        if (remaining_space <= 0)
                goto out_put;

        /* Keep only the tail of the filename if it doesn't all fit */
        if (filename_len > (size_t)remaining_space) {
                filename += filename_len - remaining_space;
                filename_len = remaining_space;
        }

        /* Open a gap at entry_str_pos and copy the filename into it */
        to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
        pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
        memmove(pos + filename_len, pos, to_move);
        memcpy(pos, filename, filename_len);

        ttrace->filename.ptr = 0;
        ttrace->filename.entry_str_pos = 0;
out_put:
        thread__put(thread);
out:
        return 0;
}
1721
/*
 * sched:sched_stat_runtime handler: accumulate on-CPU time per thread
 * and for the whole session, consumed by the --summary output.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
                                     union perf_event *event __maybe_unused,
                                     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
        double runtime_ms = (double)runtime / NSEC_PER_MSEC;
        struct thread *thread = machine__findnew_thread(trace->host,
                                                        sample->pid,
                                                        sample->tid);
        struct thread_trace *ttrace = thread__trace(thread, trace->output);

        if (ttrace == NULL)
                goto out_dump;

        ttrace->runtime_ms += runtime_ms;
        trace->runtime_ms += runtime_ms;
out_put:
        thread__put(thread);
        return 0;

out_dump:
        /*
         * No per-thread state available: dump the raw tracepoint fields.
         * NOTE(review): the trailing ')' in the format string has no
         * matching '(' - looks like a typo, left as-is to preserve output.
         */
        fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
               evsel->name,
               perf_evsel__strval(evsel, sample, "comm"),
               (pid_t)perf_evsel__intval(evsel, sample, "pid"),
               runtime,
               perf_evsel__intval(evsel, sample, "vruntime"));
        goto out_put;
}
1751
1752 static void bpf_output__printer(enum binary_printer_ops op,
1753                                 unsigned int val, void *extra)
1754 {
1755         FILE *output = extra;
1756         unsigned char ch = (unsigned char)val;
1757
1758         switch (op) {
1759         case BINARY_PRINT_CHAR_DATA:
1760                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1761                 break;
1762         case BINARY_PRINT_DATA_BEGIN:
1763         case BINARY_PRINT_LINE_BEGIN:
1764         case BINARY_PRINT_ADDR:
1765         case BINARY_PRINT_NUM_DATA:
1766         case BINARY_PRINT_NUM_PAD:
1767         case BINARY_PRINT_SEP:
1768         case BINARY_PRINT_CHAR_PAD:
1769         case BINARY_PRINT_LINE_END:
1770         case BINARY_PRINT_DATA_END:
1771         default:
1772                 break;
1773         }
1774 }
1775
1776 static void bpf_output__fprintf(struct trace *trace,
1777                                 struct perf_sample *sample)
1778 {
1779         print_binary(sample->raw_data, sample->raw_size, 8,
1780                      bpf_output__printer, trace->output);
1781 }
1782
/*
 * Generic handler for non-syscall tracepoints and bpf-output events:
 * prints a timestamped line with the event name and its payload.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
                                union perf_event *event __maybe_unused,
                                struct perf_sample *sample)
{
        int callchain_ret = 0;

        if (sample->callchain) {
                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
                if (callchain_ret == 0) {
                        /* Too shallow for --min-stack: skip this event */
                        if (callchain_cursor.nr < trace->min_stack)
                                goto out;
                        callchain_ret = 1;
                }
        }

        trace__printf_interrupted_entry(trace, sample);
        trace__fprintf_tstamp(trace, sample->time, trace->output);

        /* Keep columns aligned with the syscall lines' duration field */
        if (trace->trace_syscalls)
                fprintf(trace->output, "(         ): ");

        fprintf(trace->output, "%s:", evsel->name);

        if (perf_evsel__is_bpf_output(evsel)) {
                bpf_output__fprintf(trace, sample);
        } else if (evsel->tp_format) {
                event_format__fprintf(evsel->tp_format, sample->cpu,
                                      sample->raw_data, sample->raw_size,
                                      trace->output);
        }

        fprintf(trace->output, ")\n");

        if (callchain_ret > 0)
                trace__fprintf_callchain(trace, sample);
        else if (callchain_ret < 0)
                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
        return 0;
}
1823
1824 static void print_location(FILE *f, struct perf_sample *sample,
1825                            struct addr_location *al,
1826                            bool print_dso, bool print_sym)
1827 {
1828
1829         if ((verbose > 0 || print_dso) && al->map)
1830                 fprintf(f, "%s@", al->map->dso->long_name);
1831
1832         if ((verbose > 0 || print_sym) && al->sym)
1833                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1834                         al->addr - al->sym->start);
1835         else if (al->map)
1836                 fprintf(f, "0x%" PRIx64, al->addr);
1837         else
1838                 fprintf(f, "0x%" PRIx64, sample->addr);
1839 }
1840
/*
 * Software page-fault event handler (major and minor): counts faults
 * per thread and prints "majfault/minfault [location] => target" lines,
 * resolving both the faulting IP and the address faulted on.
 */
static int trace__pgfault(struct trace *trace,
                          struct perf_evsel *evsel,
                          union perf_event *event __maybe_unused,
                          struct perf_sample *sample)
{
        struct thread *thread;
        struct addr_location al;
        char map_type = 'd'; /* 'd'ata; may become e'x'ecutable or '?' below */
        struct thread_trace *ttrace;
        int err = -1;
        int callchain_ret = 0;

        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

        if (sample->callchain) {
                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
                if (callchain_ret == 0) {
                        /* Too shallow for --min-stack: skip this event */
                        if (callchain_cursor.nr < trace->min_stack)
                                goto out_put;
                        callchain_ret = 1;
                }
        }

        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                goto out_put;

        if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
                ttrace->pfmaj++;
        else
                ttrace->pfmin++;

        if (trace->summary_only)
                goto out;

        /* Where did the fault happen: resolve the faulting instruction */
        thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
                              sample->ip, &al);

        trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

        fprintf(trace->output, "%sfault [",
                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
                "maj" : "min");

        print_location(trace->output, sample, &al, false, true);

        fprintf(trace->output, "] => ");

        /* What was faulted on: try data maps first... */
        thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
                                   sample->addr, &al);

        if (!al.map) {
                /* ...then executable maps, else mark it unknown ('?') */
                thread__find_addr_location(thread, sample->cpumode,
                                           MAP__FUNCTION, sample->addr, &al);

                if (al.map)
                        map_type = 'x';
                else
                        map_type = '?';
        }

        print_location(trace->output, sample, &al, true, false);

        fprintf(trace->output, " (%c%c)\n", map_type, al.level);

        if (callchain_ret > 0)
                trace__fprintf_callchain(trace, sample);
        else if (callchain_ret < 0)
                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
        err = 0;
out_put:
        thread__put(thread);
        return err;
}
1916
1917 static void trace__set_base_time(struct trace *trace,
1918                                  struct perf_evsel *evsel,
1919                                  struct perf_sample *sample)
1920 {
1921         /*
1922          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1923          * and don't use sample->time unconditionally, we may end up having
1924          * some other event in the future without PERF_SAMPLE_TIME for good
1925          * reason, i.e. we may not be interested in its timestamps, just in
1926          * it taking place, picking some piece of information when it
1927          * appears in our event stream (vfs_getname comes to mind).
1928          */
1929         if (trace->base_time == 0 && !trace->full_time &&
1930             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1931                 trace->base_time = sample->time;
1932 }
1933
1934 static int trace__process_sample(struct perf_tool *tool,
1935                                  union perf_event *event,
1936                                  struct perf_sample *sample,
1937                                  struct perf_evsel *evsel,
1938                                  struct machine *machine __maybe_unused)
1939 {
1940         struct trace *trace = container_of(tool, struct trace, tool);
1941         struct thread *thread;
1942         int err = 0;
1943
1944         tracepoint_handler handler = evsel->handler;
1945
1946         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1947         if (thread && thread__is_filtered(thread))
1948                 goto out;
1949
1950         trace__set_base_time(trace, evsel, sample);
1951
1952         if (handler) {
1953                 ++trace->nr_events;
1954                 handler(trace, evsel, event, sample);
1955         }
1956 out:
1957         thread__put(thread);
1958         return err;
1959 }
1960
1961 static int trace__record(struct trace *trace, int argc, const char **argv)
1962 {
1963         unsigned int rec_argc, i, j;
1964         const char **rec_argv;
1965         const char * const record_args[] = {
1966                 "record",
1967                 "-R",
1968                 "-m", "1024",
1969                 "-c", "1",
1970         };
1971
1972         const char * const sc_args[] = { "-e", };
1973         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1974         const char * const majpf_args[] = { "-e", "major-faults" };
1975         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1976         const char * const minpf_args[] = { "-e", "minor-faults" };
1977         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1978
1979         /* +1 is for the event string below */
1980         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1981                 majpf_args_nr + minpf_args_nr + argc;
1982         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1983
1984         if (rec_argv == NULL)
1985                 return -ENOMEM;
1986
1987         j = 0;
1988         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1989                 rec_argv[j++] = record_args[i];
1990
1991         if (trace->trace_syscalls) {
1992                 for (i = 0; i < sc_args_nr; i++)
1993                         rec_argv[j++] = sc_args[i];
1994
1995                 /* event string may be different for older kernels - e.g., RHEL6 */
1996                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
1997                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
1998                 else if (is_valid_tracepoint("syscalls:sys_enter"))
1999                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2000                 else {
2001                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2002                         return -1;
2003                 }
2004         }
2005
2006         if (trace->trace_pgfaults & TRACE_PFMAJ)
2007                 for (i = 0; i < majpf_args_nr; i++)
2008                         rec_argv[j++] = majpf_args[i];
2009
2010         if (trace->trace_pgfaults & TRACE_PFMIN)
2011                 for (i = 0; i < minpf_args_nr; i++)
2012                         rec_argv[j++] = minpf_args[i];
2013
2014         for (i = 0; i < (unsigned int)argc; i++)
2015                 rec_argv[j++] = argv[i];
2016
2017         return cmd_record(j, rec_argv);
2018 }
2019
2020 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2021
2022 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2023 {
2024         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2025
2026         if (IS_ERR(evsel))
2027                 return false;
2028
2029         if (perf_evsel__field(evsel, "pathname") == NULL) {
2030                 perf_evsel__delete(evsel);
2031                 return false;
2032         }
2033
2034         evsel->handler = trace__vfs_getname;
2035         perf_evlist__add(evlist, evsel);
2036         return true;
2037 }
2038
2039 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2040 {
2041         struct perf_evsel *evsel;
2042         struct perf_event_attr attr = {
2043                 .type = PERF_TYPE_SOFTWARE,
2044                 .mmap_data = 1,
2045         };
2046
2047         attr.config = config;
2048         attr.sample_period = 1;
2049
2050         event_attr_init(&attr);
2051
2052         evsel = perf_evsel__new(&attr);
2053         if (evsel)
2054                 evsel->handler = trace__pgfault;
2055
2056         return evsel;
2057 }
2058
/*
 * Dispatch one mmap'ed event: non-samples go to the generic event
 * machinery, samples are routed to the handler registered on the evsel
 * their sample id maps back to.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
        const u32 type = event->header.type;
        struct perf_evsel *evsel;

        if (type != PERF_RECORD_SAMPLE) {
                trace__process_event(trace, trace->host, event, sample);
                return;
        }

        evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
        if (evsel == NULL) {
                fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
                return;
        }

        trace__set_base_time(trace, evsel, sample);

        /* Tracepoint handlers dereference raw_data: refuse empty payloads */
        if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
            sample->raw_data == NULL) {
                fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
                       perf_evsel__name(evsel), sample->tid,
                       sample->cpu, sample->raw_size);
        } else {
                tracepoint_handler handler = evsel->handler;
                handler(trace, evsel, event, sample);
        }
}
2087
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint events, cache
 * the tracepoint field offsets their handlers need, and add both to
 * the evlist.  Returns 0 on success; on failure any event created so
 * far is deleted and -1 is returned.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
        int ret = -1;
        struct perf_evlist *evlist = trace->evlist;
        struct perf_evsel *sys_enter, *sys_exit;

        sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
        if (sys_enter == NULL)
                goto out;

        if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
                goto out_delete_sys_enter;

        sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
        if (sys_exit == NULL)
                goto out_delete_sys_enter;

        if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
                goto out_delete_sys_exit;

        perf_evlist__add(evlist, sys_enter);
        perf_evlist__add(evlist, sys_exit);

        if (callchain_param.enabled && !trace->kernel_syscallchains) {
                /*
                 * We're interested only in the user space callchain
                 * leading to the syscall, allow overriding that for
                 * debugging reasons using --kernel_syscall_callchains
                 */
                sys_exit->attr.exclude_callchain_kernel = 1;
        }

        trace->syscalls.events.sys_enter = sys_enter;
        trace->syscalls.events.sys_exit  = sys_exit;

        ret = 0;
out:
        return ret;

out_delete_sys_exit:
        perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
        perf_evsel__delete_priv(sys_enter);
        goto out;
}
2133
2134 static int trace__set_ev_qualifier_filter(struct trace *trace)
2135 {
2136         int err = -1;
2137         struct perf_evsel *sys_exit;
2138         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2139                                                 trace->ev_qualifier_ids.nr,
2140                                                 trace->ev_qualifier_ids.entries);
2141
2142         if (filter == NULL)
2143                 goto out_enomem;
2144
2145         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2146                                           filter)) {
2147                 sys_exit = trace->syscalls.events.sys_exit;
2148                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2149         }
2150
2151         free(filter);
2152 out:
2153         return err;
2154 out_enomem:
2155         errno = ENOMEM;
2156         goto out;
2157 }
2158
2159 static int trace__run(struct trace *trace, int argc, const char **argv)
2160 {
2161         struct perf_evlist *evlist = trace->evlist;
2162         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2163         int err = -1, i;
2164         unsigned long before;
2165         const bool forks = argc > 0;
2166         bool draining = false;
2167
2168         trace->live = true;
2169
2170         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2171                 goto out_error_raw_syscalls;
2172
2173         if (trace->trace_syscalls)
2174                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2175
2176         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2177                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2178                 if (pgfault_maj == NULL)
2179                         goto out_error_mem;
2180                 perf_evlist__add(evlist, pgfault_maj);
2181         }
2182
2183         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2184                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2185                 if (pgfault_min == NULL)
2186                         goto out_error_mem;
2187                 perf_evlist__add(evlist, pgfault_min);
2188         }
2189
2190         if (trace->sched &&
2191             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2192                                    trace__sched_stat_runtime))
2193                 goto out_error_sched_stat_runtime;
2194
2195         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2196         if (err < 0) {
2197                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2198                 goto out_delete_evlist;
2199         }
2200
2201         err = trace__symbols_init(trace, evlist);
2202         if (err < 0) {
2203                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2204                 goto out_delete_evlist;
2205         }
2206
2207         perf_evlist__config(evlist, &trace->opts, NULL);
2208
2209         if (callchain_param.enabled) {
2210                 bool use_identifier = false;
2211
2212                 if (trace->syscalls.events.sys_exit) {
2213                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2214                                                      &trace->opts, &callchain_param);
2215                         use_identifier = true;
2216                 }
2217
2218                 if (pgfault_maj) {
2219                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2220                         use_identifier = true;
2221                 }
2222
2223                 if (pgfault_min) {
2224                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2225                         use_identifier = true;
2226                 }
2227
2228                 if (use_identifier) {
2229                        /*
2230                         * Now we have evsels with different sample_ids, use
2231                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2232                         * from a fixed position in each ring buffer record.
2233                         *
2234                         * As of this the changeset introducing this comment, this
2235                         * isn't strictly needed, as the fields that can come before
2236                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2237                         * some of those for things like copying the payload of
2238                         * pointer syscall arguments, and for vfs_getname we don't
2239                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2240                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2241                         */
2242                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2243                         perf_evlist__reset_sample_bit(evlist, ID);
2244                 }
2245         }
2246
2247         signal(SIGCHLD, sig_handler);
2248         signal(SIGINT, sig_handler);
2249
2250         if (forks) {
2251                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2252                                                     argv, false, NULL);
2253                 if (err < 0) {
2254                         fprintf(trace->output, "Couldn't run the workload!\n");
2255                         goto out_delete_evlist;
2256                 }
2257         }
2258
2259         err = perf_evlist__open(evlist);
2260         if (err < 0)
2261                 goto out_error_open;
2262
2263         err = bpf__apply_obj_config();
2264         if (err) {
2265                 char errbuf[BUFSIZ];
2266
2267                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2268                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2269                          errbuf);
2270                 goto out_error_open;
2271         }
2272
2273         /*
2274          * Better not use !target__has_task() here because we need to cover the
2275          * case where no threads were specified in the command line, but a
2276          * workload was, and in that case we will fill in the thread_map when
2277          * we fork the workload in perf_evlist__prepare_workload.
2278          */
2279         if (trace->filter_pids.nr > 0)
2280                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2281         else if (thread_map__pid(evlist->threads, 0) == -1)
2282                 err = perf_evlist__set_filter_pid(evlist, getpid());
2283
2284         if (err < 0)
2285                 goto out_error_mem;
2286
2287         if (trace->ev_qualifier_ids.nr > 0) {
2288                 err = trace__set_ev_qualifier_filter(trace);
2289                 if (err < 0)
2290                         goto out_errno;
2291
2292                 pr_debug("event qualifier tracepoint filter: %s\n",
2293                          trace->syscalls.events.sys_exit->filter);
2294         }
2295
2296         err = perf_evlist__apply_filters(evlist, &evsel);
2297         if (err < 0)
2298                 goto out_error_apply_filters;
2299
2300         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2301         if (err < 0)
2302                 goto out_error_mmap;
2303
2304         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2305                 perf_evlist__enable(evlist);
2306
2307         if (forks)
2308                 perf_evlist__start_workload(evlist);
2309
2310         if (trace->opts.initial_delay) {
2311                 usleep(trace->opts.initial_delay * 1000);
2312                 perf_evlist__enable(evlist);
2313         }
2314
2315         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2316                                   evlist->threads->nr > 1 ||
2317                                   perf_evlist__first(evlist)->attr.inherit;
2318 again:
2319         before = trace->nr_events;
2320
2321         for (i = 0; i < evlist->nr_mmaps; i++) {
2322                 union perf_event *event;
2323
2324                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2325                         struct perf_sample sample;
2326
2327                         ++trace->nr_events;
2328
2329                         err = perf_evlist__parse_sample(evlist, event, &sample);
2330                         if (err) {
2331                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2332                                 goto next_event;
2333                         }
2334
2335                         trace__handle_event(trace, event, &sample);
2336 next_event:
2337                         perf_evlist__mmap_consume(evlist, i);
2338
2339                         if (interrupted)
2340                                 goto out_disable;
2341
2342                         if (done && !draining) {
2343                                 perf_evlist__disable(evlist);
2344                                 draining = true;
2345                         }
2346                 }
2347         }
2348
2349         if (trace->nr_events == before) {
2350                 int timeout = done ? 100 : -1;
2351
2352                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2353                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2354                                 draining = true;
2355
2356                         goto again;
2357                 }
2358         } else {
2359                 goto again;
2360         }
2361
2362 out_disable:
2363         thread__zput(trace->current);
2364
2365         perf_evlist__disable(evlist);
2366
2367         if (!err) {
2368                 if (trace->summary)
2369                         trace__fprintf_thread_summary(trace, trace->output);
2370
2371                 if (trace->show_tool_stats) {
2372                         fprintf(trace->output, "Stats:\n "
2373                                                " vfs_getname : %" PRIu64 "\n"
2374                                                " proc_getname: %" PRIu64 "\n",
2375                                 trace->stats.vfs_getname,
2376                                 trace->stats.proc_getname);
2377                 }
2378         }
2379
2380 out_delete_evlist:
2381         perf_evlist__delete(evlist);
2382         trace->evlist = NULL;
2383         trace->live = false;
2384         return err;
2385 {
2386         char errbuf[BUFSIZ];
2387
2388 out_error_sched_stat_runtime:
2389         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2390         goto out_error;
2391
2392 out_error_raw_syscalls:
2393         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2394         goto out_error;
2395
2396 out_error_mmap:
2397         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2398         goto out_error;
2399
2400 out_error_open:
2401         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2402
2403 out_error:
2404         fprintf(trace->output, "%s\n", errbuf);
2405         goto out_delete_evlist;
2406
2407 out_error_apply_filters:
2408         fprintf(trace->output,
2409                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2410                 evsel->filter, perf_evsel__name(evsel), errno,
2411                 str_error_r(errno, errbuf, sizeof(errbuf)));
2412         goto out_delete_evlist;
2413 }
2414 out_error_mem:
2415         fprintf(trace->output, "Not enough memory to run!\n");
2416         goto out_delete_evlist;
2417
2418 out_errno:
2419         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2420         goto out_delete_evlist;
2421 }
2422
2423 static int trace__replay(struct trace *trace)
2424 {
2425         const struct perf_evsel_str_handler handlers[] = {
2426                 { "probe:vfs_getname",       trace__vfs_getname, },
2427         };
2428         struct perf_data_file file = {
2429                 .path  = input_name,
2430                 .mode  = PERF_DATA_MODE_READ,
2431                 .force = trace->force,
2432         };
2433         struct perf_session *session;
2434         struct perf_evsel *evsel;
2435         int err = -1;
2436
2437         trace->tool.sample        = trace__process_sample;
2438         trace->tool.mmap          = perf_event__process_mmap;
2439         trace->tool.mmap2         = perf_event__process_mmap2;
2440         trace->tool.comm          = perf_event__process_comm;
2441         trace->tool.exit          = perf_event__process_exit;
2442         trace->tool.fork          = perf_event__process_fork;
2443         trace->tool.attr          = perf_event__process_attr;
2444         trace->tool.tracing_data  = perf_event__process_tracing_data;
2445         trace->tool.build_id      = perf_event__process_build_id;
2446         trace->tool.namespaces    = perf_event__process_namespaces;
2447
2448         trace->tool.ordered_events = true;
2449         trace->tool.ordering_requires_timestamps = true;
2450
2451         /* add tid to output */
2452         trace->multiple_threads = true;
2453
2454         session = perf_session__new(&file, false, &trace->tool);
2455         if (session == NULL)
2456                 return -1;
2457
2458         if (trace->opts.target.pid)
2459                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2460
2461         if (trace->opts.target.tid)
2462                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2463
2464         if (symbol__init(&session->header.env) < 0)
2465                 goto out;
2466
2467         trace->host = &session->machines.host;
2468
2469         err = perf_session__set_tracepoints_handlers(session, handlers);
2470         if (err)
2471                 goto out;
2472
2473         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2474                                                      "raw_syscalls:sys_enter");
2475         /* older kernels have syscalls tp versus raw_syscalls */
2476         if (evsel == NULL)
2477                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2478                                                              "syscalls:sys_enter");
2479
2480         if (evsel &&
2481             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2482             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2483                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2484                 goto out;
2485         }
2486
2487         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2488                                                      "raw_syscalls:sys_exit");
2489         if (evsel == NULL)
2490                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2491                                                              "syscalls:sys_exit");
2492         if (evsel &&
2493             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2494             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2495                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2496                 goto out;
2497         }
2498
2499         evlist__for_each_entry(session->evlist, evsel) {
2500                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2501                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2502                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2503                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2504                         evsel->handler = trace__pgfault;
2505         }
2506
2507         setup_pager();
2508
2509         err = perf_session__process_events(session);
2510         if (err)
2511                 pr_err("Failed to process events, error %d", err);
2512
2513         else if (trace->summary)
2514                 trace__fprintf_thread_summary(trace, trace->output);
2515
2516 out:
2517         perf_session__delete(session);
2518
2519         return err;
2520 }
2521
/* Emit the banner that precedes the per-thread summary table. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2530
/*
 * Resorted rb_tree of per-syscall stats, ordered by accumulated time
 * (msecs field, larger first per the 'a->msecs > b->msecs' comparator).
 * DEFINE_RESORT_RB() (rb_resort.h) generates the sorting boilerplate;
 * the braced body below runs once per source intlist node 'nd' to
 * populate the corresponding sorted 'entry'.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	/* source->i is the syscall id used as the intlist key */
	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = call count * average, scaled from ns to ms */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2544
2545 static size_t thread__dump_stats(struct thread_trace *ttrace,
2546                                  struct trace *trace, FILE *fp)
2547 {
2548         size_t printed = 0;
2549         struct syscall *sc;
2550         struct rb_node *nd;
2551         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2552
2553         if (syscall_stats == NULL)
2554                 return 0;
2555
2556         printed += fprintf(fp, "\n");
2557
2558         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2559         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2560         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2561
2562         resort_rb__for_each_entry(nd, syscall_stats) {
2563                 struct stats *stats = syscall_stats_entry->stats;
2564                 if (stats) {
2565                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2566                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2567                         double avg = avg_stats(stats);
2568                         double pct;
2569                         u64 n = (u64) stats->n;
2570
2571                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2572                         avg /= NSEC_PER_MSEC;
2573
2574                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2575                         printed += fprintf(fp, "   %-15s", sc->name);
2576                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2577                                            n, syscall_stats_entry->msecs, min, avg);
2578                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2579                 }
2580         }
2581
2582         resort_rb__delete(syscall_stats);
2583         printed += fprintf(fp, "\n\n");
2584
2585         return printed;
2586 }
2587
2588 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2589 {
2590         size_t printed = 0;
2591         struct thread_trace *ttrace = thread__priv(thread);
2592         double ratio;
2593
2594         if (ttrace == NULL)
2595                 return 0;
2596
2597         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2598
2599         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2600         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2601         printed += fprintf(fp, "%.1f%%", ratio);
2602         if (ttrace->pfmaj)
2603                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2604         if (ttrace->pfmin)
2605                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2606         if (trace->sched)
2607                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2608         else if (fputc('\n', fp) != EOF)
2609                 ++printed;
2610
2611         printed += thread__dump_stats(ttrace, trace, fp);
2612
2613         return printed;
2614 }
2615
2616 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2617 {
2618         return ttrace ? ttrace->nr_events : 0;
2619 }
2620
/*
 * Resorted rb_tree of the machine's threads, keyed by how many events
 * each generated (see thread__nr_events()).  DEFINE_RESORT_RB()
 * (rb_resort.h) generates the sorting machinery; the braced body runs
 * per source thread rb_tree node to fill in each sorted entry.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2627
/*
 * Print the end-of-run per-thread summary (-S/--with-summary): one block
 * per thread that generated events, sorted by event count via the
 * 'threads' resort_rb above.  Returns the number of characters printed,
 * or 0 (after reporting) if sorting failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	/* 'threads'/'threads_entry' names are fixed by the resort_rb macros */
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2646
2647 static int trace__set_duration(const struct option *opt, const char *str,
2648                                int unset __maybe_unused)
2649 {
2650         struct trace *trace = opt->value;
2651
2652         trace->duration_filter = atof(str);
2653         return 0;
2654 }
2655
2656 static int trace__set_filter_pids(const struct option *opt, const char *str,
2657                                   int unset __maybe_unused)
2658 {
2659         int ret = -1;
2660         size_t i;
2661         struct trace *trace = opt->value;
2662         /*
2663          * FIXME: introduce a intarray class, plain parse csv and create a
2664          * { int nr, int entries[] } struct...
2665          */
2666         struct intlist *list = intlist__new(str);
2667
2668         if (list == NULL)
2669                 return -1;
2670
2671         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2672         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2673
2674         if (trace->filter_pids.entries == NULL)
2675                 goto out;
2676
2677         trace->filter_pids.entries[0] = getpid();
2678
2679         for (i = 1; i < trace->filter_pids.nr; ++i)
2680                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2681
2682         intlist__delete(list);
2683         ret = 0;
2684 out:
2685         return ret;
2686 }
2687
2688 static int trace__open_output(struct trace *trace, const char *filename)
2689 {
2690         struct stat st;
2691
2692         if (!stat(filename, &st) && st.st_size) {
2693                 char oldname[PATH_MAX];
2694
2695                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2696                 unlink(oldname);
2697                 rename(filename, oldname);
2698         }
2699
2700         trace->output = fopen(filename, "w");
2701
2702         return trace->output == NULL ? -errno : 0;
2703 }
2704
2705 static int parse_pagefaults(const struct option *opt, const char *str,
2706                             int unset __maybe_unused)
2707 {
2708         int *trace_pgfaults = opt->value;
2709
2710         if (strcmp(str, "all") == 0)
2711                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2712         else if (strcmp(str, "maj") == 0)
2713                 *trace_pgfaults |= TRACE_PFMAJ;
2714         else if (strcmp(str, "min") == 0)
2715                 *trace_pgfaults |= TRACE_PFMIN;
2716         else
2717                 return -1;
2718
2719         return 0;
2720 }
2721
/* Set the same sample handler on every evsel in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2729
2730 /*
2731  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2732  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2733  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2734  *
2735  * It'd be better to introduce a parse_options() variant that would return a
2736  * list with the terms it didn't match to an event...
2737  */
2738 static int trace__parse_events_option(const struct option *opt, const char *str,
2739                                       int unset __maybe_unused)
2740 {
2741         struct trace *trace = (struct trace *)opt->value;
2742         const char *s = str;
2743         char *sep = NULL, *lists[2] = { NULL, NULL, };
2744         int len = strlen(str), err = -1, list;
2745         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2746         char group_name[PATH_MAX];
2747
2748         if (strace_groups_dir == NULL)
2749                 return -1;
2750
2751         if (*s == '!') {
2752                 ++s;
2753                 trace->not_ev_qualifier = true;
2754         }
2755
2756         while (1) {
2757                 if ((sep = strchr(s, ',')) != NULL)
2758                         *sep = '\0';
2759
2760                 list = 0;
2761                 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2762                         list = 1;
2763                 } else {
2764                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2765                         if (access(group_name, R_OK) == 0)
2766                                 list = 1;
2767                 }
2768
2769                 if (lists[list]) {
2770                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2771                 } else {
2772                         lists[list] = malloc(len);
2773                         if (lists[list] == NULL)
2774                                 goto out;
2775                         strcpy(lists[list], s);
2776                 }
2777
2778                 if (!sep)
2779                         break;
2780
2781                 *sep = ',';
2782                 s = sep + 1;
2783         }
2784
2785         if (lists[1] != NULL) {
2786                 struct strlist_config slist_config = {
2787                         .dirname = strace_groups_dir,
2788                 };
2789
2790                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2791                 if (trace->ev_qualifier == NULL) {
2792                         fputs("Not enough memory to parse event qualifier", trace->output);
2793                         goto out;
2794                 }
2795
2796                 if (trace__validate_ev_qualifier(trace))
2797                         goto out;
2798         }
2799
2800         err = 0;
2801
2802         if (lists[0]) {
2803                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2804                                                "event selector. use 'perf list' to list available events",
2805                                                parse_events_option);
2806                 err = parse_events_option(&o, lists[0], 0);
2807         }
2808 out:
2809         if (sep)
2810                 *sep = ',';
2811
2812         return err;
2813 }
2814
/*
 * Entry point for 'perf trace': sets up defaults, parses options, then
 * dispatches to record mode (trace__record()), perf.data replay
 * (trace__replay() when -i was given) or live tracing (trace__run()).
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/* UINT_MAX/ULLONG_MAX act as "not set by the user" sentinels below */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* page fault tracing needs the fault address and a timestamp */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* --min-stack/--max-stack without --call-graph implies DWARF callchains */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* bigger ring buffer for callchain payloads, when we may mlock it */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* 'perf trace record' delegates to 'perf record' with syscall events */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* no command and no target specified: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}