/*
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
8 #define _FILE_OFFSET_BITS 64
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/symbol.h"
26 #include "util/cpumap.h"
27 #include "util/thread_map.h"
38 static u64 user_interval = ULLONG_MAX;
39 static u64 default_interval = 0;
41 static unsigned int page_size;
42 static unsigned int mmap_pages = UINT_MAX;
43 static unsigned int user_freq = UINT_MAX;
44 static int freq = 1000;
46 static int pipe_output = 0;
47 static const char *output_name = NULL;
48 static bool group = false;
49 static int realtime_prio = 0;
50 static bool nodelay = false;
51 static bool raw_samples = false;
52 static bool sample_id_all_avail = true;
53 static bool system_wide = false;
54 static pid_t target_pid = -1;
55 static pid_t target_tid = -1;
56 static pid_t child_pid = -1;
57 static bool no_inherit = false;
58 static enum write_mode_t write_mode = WRITE_FORCE;
59 static bool call_graph = false;
60 static bool inherit_stat = false;
61 static bool no_samples = false;
62 static bool sample_address = false;
63 static bool sample_time = false;
64 static bool no_buildid = false;
65 static bool no_buildid_cache = false;
66 static struct perf_evlist *evsel_list;
68 static long samples = 0;
69 static u64 bytes_written = 0;
71 static int file_new = 1;
72 static off_t post_processing_offset;
74 static struct perf_session *session;
75 static const char *cpu_list;
76 static const char *progname;
78 static void advance_output(size_t size)
80 bytes_written += size;
83 static void write_output(void *buf, size_t size)
86 int ret = write(output, buf, size);
89 die("failed to write");
98 static int process_synthesized_event(union perf_event *event,
99 struct perf_sample *sample __used,
100 struct perf_session *self __used)
102 write_output(event, event->header.size);
106 static void mmap_read(struct perf_mmap *md)
108 unsigned int head = perf_mmap__read_head(md);
109 unsigned int old = md->prev;
110 unsigned char *data = md->base + page_size;
121 if ((old & md->mask) + size != (head & md->mask)) {
122 buf = &data[old & md->mask];
123 size = md->mask + 1 - (old & md->mask);
126 write_output(buf, size);
129 buf = &data[old & md->mask];
133 write_output(buf, size);
136 perf_mmap__write_tail(md, old);
/* Set asynchronously from sig_handler(); polled by the record loop. */
static volatile int done				=      0;
static volatile int signr				=     -1;
static volatile int child_finished			=      0;
143 static void sig_handler(int sig)
152 static void sig_atexit(void)
158 kill(child_pid, SIGTERM);
161 if (WIFSIGNALED(status))
162 psignal(WTERMSIG(status), progname);
165 if (signr == -1 || signr == SIGUSR1)
168 signal(signr, SIG_DFL);
169 kill(getpid(), signr);
172 static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
174 struct perf_event_attr *attr = &evsel->attr;
175 int track = !evsel->idx; /* only the first counter needs these */
178 attr->inherit = !no_inherit;
179 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
180 PERF_FORMAT_TOTAL_TIME_RUNNING |
183 attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
185 if (evlist->nr_entries > 1)
186 attr->sample_type |= PERF_SAMPLE_ID;
189 * We default some events to a 1 default interval. But keep
190 * it a weak assumption overridable by the user.
192 if (!attr->sample_period || (user_freq != UINT_MAX &&
193 user_interval != ULLONG_MAX)) {
195 attr->sample_type |= PERF_SAMPLE_PERIOD;
197 attr->sample_freq = freq;
199 attr->sample_period = default_interval;
204 attr->sample_freq = 0;
207 attr->inherit_stat = 1;
209 if (sample_address) {
210 attr->sample_type |= PERF_SAMPLE_ADDR;
211 attr->mmap_data = track;
215 attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
218 attr->sample_type |= PERF_SAMPLE_CPU;
220 if (sample_id_all_avail &&
221 (sample_time || system_wide || !no_inherit || cpu_list))
222 attr->sample_type |= PERF_SAMPLE_TIME;
225 attr->sample_type |= PERF_SAMPLE_TIME;
226 attr->sample_type |= PERF_SAMPLE_RAW;
227 attr->sample_type |= PERF_SAMPLE_CPU;
232 attr->wakeup_events = 1;
238 if (target_pid == -1 && target_tid == -1 && !system_wide) {
240 attr->enable_on_exec = 1;
244 static bool perf_evlist__equal(struct perf_evlist *evlist,
245 struct perf_evlist *other)
247 struct perf_evsel *pos, *pair;
249 if (evlist->nr_entries != other->nr_entries)
252 pair = list_entry(other->entries.next, struct perf_evsel, node);
254 list_for_each_entry(pos, &evlist->entries, node) {
255 if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
257 pair = list_entry(pair->node.next, struct perf_evsel, node);
263 static void open_counters(struct perf_evlist *evlist)
265 struct perf_evsel *pos;
267 if (evlist->cpus->map[0] < 0)
270 list_for_each_entry(pos, &evlist->entries, node) {
271 struct perf_event_attr *attr = &pos->attr;
273 * Check if parse_single_tracepoint_event has already asked for
276 * XXX this is kludgy but short term fix for problems introduced by
277 * eac23d1c that broke 'perf script' by having different sample_types
278 * when using multiple tracepoint events when we use a perf binary
279 * that tries to use sample_id_all on an older kernel.
281 * We need to move counter creation to perf_session, support
282 * different sample_types, etc.
284 bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
286 config_attr(pos, evlist);
288 attr->sample_id_all = sample_id_all_avail ? 1 : 0;
290 if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group) < 0) {
293 if (err == EPERM || err == EACCES) {
294 ui__warning_paranoid();
296 } else if (err == ENODEV && cpu_list) {
297 die("No such device - did you specify"
298 " an out-of-range profile CPU?\n");
299 } else if (err == EINVAL && sample_id_all_avail) {
301 * Old kernel, no attr->sample_id_type_all field
303 sample_id_all_avail = false;
304 if (!sample_time && !raw_samples && !time_needed)
305 attr->sample_type &= ~PERF_SAMPLE_TIME;
307 goto retry_sample_id;
311 * If it's cycles then fall back to hrtimer
312 * based cpu-clock-tick sw counter, which
313 * is always available even if no PMU support:
315 if (attr->type == PERF_TYPE_HARDWARE
316 && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
319 ui__warning("The cycles event is not supported, "
320 "trying to fall back to cpu-clock-ticks\n");
321 attr->type = PERF_TYPE_SOFTWARE;
322 attr->config = PERF_COUNT_SW_CPU_CLOCK;
327 ui__warning("The %s event is not supported.\n",
333 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
336 #if defined(__i386__) || defined(__x86_64__)
337 if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
338 die("No hardware sampling interrupt available."
339 " No APIC? If so then you can boot the kernel"
340 " with the \"lapic\" boot parameter to"
341 " force-enable it.\n");
344 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
348 if (perf_evlist__set_filters(evlist)) {
349 error("failed to set filter with %d (%s)\n", errno,
354 if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
355 die("failed to mmap with %d (%s)\n", errno, strerror(errno));
358 session->evlist = evlist;
360 if (!perf_evlist__equal(session->evlist, evlist)) {
361 fprintf(stderr, "incompatible append\n");
366 perf_session__update_sample_type(session);
369 static int process_buildids(void)
371 u64 size = lseek(output, 0, SEEK_CUR);
376 session->fd = output;
377 return __perf_session__process_events(session, post_processing_offset,
378 size - post_processing_offset,
379 size, &build_id__mark_dso_hit_ops);
382 static void atexit_header(void)
385 session->header.data_size += bytes_written;
389 perf_session__write_header(session, evsel_list, output, true);
390 perf_session__delete(session);
391 perf_evlist__delete(evsel_list);
396 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
399 struct perf_session *psession = data;
401 if (machine__is_host(machine))
405 *As for guest kernel when processing subcommand record&report,
406 *we arrange module mmap prior to guest kernel mmap and trigger
407 *a preload dso because default guest module symbols are loaded
408 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
409 *method is used to avoid symbol missing when the first addr is
410 *in module instead of in guest kernel.
412 err = perf_event__synthesize_modules(process_synthesized_event,
415 pr_err("Couldn't record guest kernel [%d]'s reference"
416 " relocation symbol.\n", machine->pid);
419 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
420 * have no _text sometimes.
422 err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
423 psession, machine, "_text");
425 err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
429 pr_err("Couldn't record guest kernel [%d]'s reference"
430 " relocation symbol.\n", machine->pid);
433 static struct perf_event_header finished_round_event = {
434 .size = sizeof(struct perf_event_header),
435 .type = PERF_RECORD_FINISHED_ROUND,
438 static void mmap_read_all(void)
442 for (i = 0; i < evsel_list->nr_mmaps; i++) {
443 if (evsel_list->mmap[i].base)
444 mmap_read(&evsel_list->mmap[i]);
447 if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
448 write_output(&finished_round_event, sizeof(finished_round_event));
451 static int __cmd_record(int argc, const char **argv)
456 unsigned long waking = 0;
457 int child_ready_pipe[2], go_pipe[2];
458 const bool forks = argc > 0;
460 struct machine *machine;
464 page_size = sysconf(_SC_PAGE_SIZE);
467 signal(SIGCHLD, sig_handler);
468 signal(SIGINT, sig_handler);
469 signal(SIGUSR1, sig_handler);
471 if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
472 perror("failed to create pipes");
477 if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
480 output_name = "perf.data";
483 if (!strcmp(output_name, "-"))
485 else if (!stat(output_name, &st) && st.st_size) {
486 if (write_mode == WRITE_FORCE) {
487 char oldname[PATH_MAX];
488 snprintf(oldname, sizeof(oldname), "%s.old",
491 rename(output_name, oldname);
493 } else if (write_mode == WRITE_APPEND) {
494 write_mode = WRITE_FORCE;
498 flags = O_CREAT|O_RDWR;
499 if (write_mode == WRITE_APPEND)
505 output = STDOUT_FILENO;
507 output = open(output_name, flags, S_IRUSR | S_IWUSR);
509 perror("failed to create output file");
513 session = perf_session__new(output_name, O_WRONLY,
514 write_mode == WRITE_FORCE, false, NULL);
515 if (session == NULL) {
516 pr_err("Not enough memory for reading perf file header\n");
521 perf_header__set_feat(&session->header, HEADER_BUILD_ID);
524 err = perf_session__read_header(session, output);
526 goto out_delete_session;
529 if (have_tracepoints(&evsel_list->entries))
530 perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
532 /* 512 kiB: default amount of unprivileged mlocked memory */
533 if (mmap_pages == UINT_MAX)
534 mmap_pages = (512 * 1024) / page_size;
539 perror("failed to fork");
546 close(child_ready_pipe[0]);
548 fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
551 * Do a dummy execvp to get the PLT entry resolved,
552 * so we avoid the resolver overhead on the real
555 execvp("", (char **)argv);
558 * Tell the parent we're ready to go
560 close(child_ready_pipe[1]);
563 * Wait until the parent tells us to go.
565 if (read(go_pipe[0], &buf, 1) == -1)
566 perror("unable to read pipe");
568 execvp(argv[0], (char **)argv);
571 kill(getppid(), SIGUSR1);
575 if (!system_wide && target_tid == -1 && target_pid == -1)
576 evsel_list->threads->map[0] = child_pid;
578 close(child_ready_pipe[1]);
581 * wait for child to settle
583 if (read(child_ready_pipe[0], &buf, 1) == -1) {
584 perror("unable to read pipe");
587 close(child_ready_pipe[0]);
590 open_counters(evsel_list);
593 * perf_session__delete(session) will be called at atexit_header()
595 atexit(atexit_header);
598 err = perf_header__write_pipe(output);
601 } else if (file_new) {
602 err = perf_session__write_header(session, evsel_list,
608 post_processing_offset = lseek(output, 0, SEEK_CUR);
611 err = perf_session__synthesize_attrs(session,
612 process_synthesized_event);
614 pr_err("Couldn't synthesize attrs.\n");
618 err = perf_event__synthesize_event_types(process_synthesized_event,
621 pr_err("Couldn't synthesize event_types.\n");
625 if (have_tracepoints(&evsel_list->entries)) {
627 * FIXME err <= 0 here actually means that
628 * there were no tracepoints so its not really
629 * an error, just that we don't need to
630 * synthesize anything. We really have to
631 * return this more properly and also
632 * propagate errors that now are calling die()
634 err = perf_event__synthesize_tracing_data(output, evsel_list,
635 process_synthesized_event,
638 pr_err("Couldn't record tracing data.\n");
645 machine = perf_session__find_host_machine(session);
647 pr_err("Couldn't find native kernel information.\n");
651 err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
652 session, machine, "_text");
654 err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
655 session, machine, "_stext");
657 pr_err("Couldn't record kernel reference relocation symbol\n"
658 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
659 "Check /proc/kallsyms permission or run as root.\n");
661 err = perf_event__synthesize_modules(process_synthesized_event,
664 pr_err("Couldn't record kernel module information.\n"
665 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
666 "Check /proc/modules permission or run as root.\n");
669 perf_session__process_machines(session,
670 perf_event__synthesize_guest_os);
673 perf_event__synthesize_thread_map(evsel_list->threads,
674 process_synthesized_event,
677 perf_event__synthesize_threads(process_synthesized_event,
681 struct sched_param param;
683 param.sched_priority = realtime_prio;
684 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
685 pr_err("Could not set realtime priority.\n");
690 perf_evlist__enable(evsel_list);
703 if (hits == samples) {
706 err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
711 perf_evlist__disable(evsel_list);
714 if (quiet || signr == SIGUSR1)
717 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
720 * Approximate RIP event size: 24 bytes.
723 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
724 (double)bytes_written / 1024.0 / 1024.0,
731 perf_session__delete(session);
735 static const char * const record_usage[] = {
736 "perf record [<options>] [<command>]",
737 "perf record [<options>] -- <command> [<options>]",
/* -f (overwrite) and -A (append) flags; mutually exclusive. */
static bool force, append_file;
743 const struct option record_options[] = {
744 OPT_CALLBACK('e', "event", &evsel_list, "event",
745 "event selector. use 'perf list' to list available events",
746 parse_events_option),
747 OPT_CALLBACK(0, "filter", &evsel_list, "filter",
748 "event filter", parse_filter),
749 OPT_INTEGER('p', "pid", &target_pid,
750 "record events on existing process id"),
751 OPT_INTEGER('t', "tid", &target_tid,
752 "record events on existing thread id"),
753 OPT_INTEGER('r', "realtime", &realtime_prio,
754 "collect data with this RT SCHED_FIFO priority"),
755 OPT_BOOLEAN('D', "no-delay", &nodelay,
756 "collect data without buffering"),
757 OPT_BOOLEAN('R', "raw-samples", &raw_samples,
758 "collect raw sample records from all opened counters"),
759 OPT_BOOLEAN('a', "all-cpus", &system_wide,
760 "system-wide collection from all CPUs"),
761 OPT_BOOLEAN('A', "append", &append_file,
762 "append to the output file to do incremental profiling"),
763 OPT_STRING('C', "cpu", &cpu_list, "cpu",
764 "list of cpus to monitor"),
765 OPT_BOOLEAN('f', "force", &force,
766 "overwrite existing data file (deprecated)"),
767 OPT_U64('c', "count", &user_interval, "event period to sample"),
768 OPT_STRING('o', "output", &output_name, "file",
770 OPT_BOOLEAN('i', "no-inherit", &no_inherit,
771 "child tasks do not inherit counters"),
772 OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"),
773 OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
774 OPT_BOOLEAN(0, "group", &group,
775 "put the counters into a counter group"),
776 OPT_BOOLEAN('g', "call-graph", &call_graph,
777 "do call-graph (stack chain/backtrace) recording"),
778 OPT_INCR('v', "verbose", &verbose,
779 "be more verbose (show counter open errors, etc)"),
780 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
781 OPT_BOOLEAN('s', "stat", &inherit_stat,
782 "per thread counts"),
783 OPT_BOOLEAN('d', "data", &sample_address,
785 OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
786 OPT_BOOLEAN('n', "no-samples", &no_samples,
788 OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
789 "do not update the buildid cache"),
790 OPT_BOOLEAN('B', "no-buildid", &no_buildid,
791 "do not collect buildids in perf.data"),
792 OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
793 "monitor event in cgroup name only",
798 int cmd_record(int argc, const char **argv, const char *prefix __used)
801 struct perf_evsel *pos;
803 evsel_list = perf_evlist__new(NULL, NULL);
804 if (evsel_list == NULL)
807 argc = parse_options(argc, argv, record_options, record_usage,
808 PARSE_OPT_STOP_AT_NON_OPTION);
809 if (!argc && target_pid == -1 && target_tid == -1 &&
810 !system_wide && !cpu_list)
811 usage_with_options(record_usage, record_options);
813 if (force && append_file) {
814 fprintf(stderr, "Can't overwrite and append at the same time."
815 " You need to choose between -f and -A");
816 usage_with_options(record_usage, record_options);
817 } else if (append_file) {
818 write_mode = WRITE_APPEND;
820 write_mode = WRITE_FORCE;
823 if (nr_cgroups && !system_wide) {
824 fprintf(stderr, "cgroup monitoring only available in"
825 " system-wide mode\n");
826 usage_with_options(record_usage, record_options);
831 if (symbol_conf.kptr_restrict)
833 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
834 "check /proc/sys/kernel/kptr_restrict.\n\n"
835 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
836 "file is not found in the buildid cache or in the vmlinux path.\n\n"
837 "Samples in kernel modules won't be resolved at all.\n\n"
838 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
839 "even with a suitable vmlinux or kallsyms file.\n\n");
841 if (no_buildid_cache || no_buildid)
842 disable_buildid_cache();
844 if (evsel_list->nr_entries == 0 &&
845 perf_evlist__add_default(evsel_list) < 0) {
846 pr_err("Not enough memory for event selector list\n");
847 goto out_symbol_exit;
850 if (target_pid != -1)
851 target_tid = target_pid;
853 if (perf_evlist__create_maps(evsel_list, target_pid,
854 target_tid, cpu_list) < 0)
855 usage_with_options(record_usage, record_options);
857 list_for_each_entry(pos, &evsel_list->entries, node) {
858 if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
859 evsel_list->threads->nr) < 0)
861 if (perf_header__push_event(pos->attr.config, event_name(pos)))
865 if (perf_evlist__alloc_pollfd(evsel_list) < 0)
868 if (user_interval != ULLONG_MAX)
869 default_interval = user_interval;
870 if (user_freq != UINT_MAX)
874 * User specified count overrides default frequency.
876 if (default_interval)
879 default_interval = freq;
881 fprintf(stderr, "frequency and count are zero, aborting\n");
886 err = __cmd_record(argc, argv);
888 perf_evlist__delete_maps(evsel_list);