4 * Builtin record command: Record the profile of a workload
5 * (or a CPU, or a PID) into the perf.data output file - for
6 * later analysis via perf report.
8 #define _FILE_OFFSET_BITS 64
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/symbol.h"
26 #include "util/cpumap.h"
27 #include "util/thread_map.h"
/*
 * Recording state and user-tunable knobs.  The option table below
 * (record_options) writes into most of these; the sentinel values
 * ULLONG_MAX / UINT_MAX mean "not specified on the command line".
 */
38 static u64 user_interval = ULLONG_MAX;
39 static u64 default_interval = 0;
/* Runtime parameters. */
41 static unsigned int page_size;
42 static unsigned int mmap_pages = UINT_MAX;
43 static unsigned int user_freq = UINT_MAX;
44 static int freq = 1000;
/* Output handling: '-o -' selects pipe mode, default file is perf.data. */
46 static int pipe_output = 0;
47 static const char *output_name = NULL;
48 static bool group = false;
49 static int realtime_prio = 0;
50 static bool nodelay = false;
51 static bool raw_samples = false;
/* Cleared at open time if the running kernel lacks attr.sample_id_all. */
52 static bool sample_id_all_avail = true;
53 static bool system_wide = false;
/* Target selection: -1 means "not specified". */
54 static pid_t target_pid = -1;
55 static pid_t target_tid = -1;
56 static pid_t child_pid = -1;
57 static bool no_inherit = false;
58 static enum write_mode_t write_mode = WRITE_FORCE;
59 static bool call_graph = false;
60 static bool inherit_stat = false;
61 static bool no_samples = false;
62 static bool sample_address = false;
63 static bool sample_time = false;
64 static bool no_buildid = false;
65 static bool no_buildid_cache = false;
66 static struct perf_evlist *evsel_list;
/* Bookkeeping for the output file / final statistics. */
68 static long samples = 0;
69 static u64 bytes_written = 0;
71 static int file_new = 1;
72 static off_t post_processing_offset;
74 static struct perf_session *session;
75 static const char *cpu_list;
76 static const char *progname;
/* Account for @size bytes already present in the output file. */
78 static void advance_output(size_t size)
80 bytes_written += size;
/*
 * Write @size bytes from @buf to the output fd, dying on failure;
 * bytes_written is bumped so the header can record the data size.
 */
83 static void write_output(void *buf, size_t size)
86 int ret = write(output, buf, size);
89 die("failed to write");
/*
 * perf_session callback: forward a synthesized event verbatim to the
 * perf.data output stream.
 */
98 static int process_synthesized_event(union perf_event *event,
99 struct perf_sample *sample __used,
100 struct perf_session *self __used)
102 write_output(event, event->header.size);
/*
 * Drain one mmap'ed ring buffer: copy everything between the previously
 * consumed position (md->prev) and the kernel's head pointer into the
 * output file, handling wrap-around, then publish the new tail so the
 * kernel can reuse the space.
 */
106 static void mmap_read(struct perf_mmap *md)
108 unsigned int head = perf_mmap__read_head(md);
109 unsigned int old = md->prev;
/* Data area starts one page past the control page. */
110 unsigned char *data = md->base + page_size;
/* Buffer wrapped: first write the chunk up to the end of the ring... */
121 if ((old & md->mask) + size != (head & md->mask)) {
122 buf = &data[old & md->mask];
123 size = md->mask + 1 - (old & md->mask);
126 write_output(buf, size);
/* ...then the remainder from the start of the ring. */
129 buf = &data[old & md->mask];
133 write_output(buf, size);
/* Tell the kernel how far we consumed. */
136 perf_mmap__write_tail(md, old);
/*
 * Async-signal communication with the main loop; volatile because they
 * are written from signal-handler context.
 */
139 static volatile int done = 0;
140 static volatile int signr = -1;
141 static volatile int child_finished = 0;
/*
 * Generic signal handler: records which signal arrived and asks the
 * main record loop to stop (body elided in this view).
 */
143 static void sig_handler(int sig)
/*
 * atexit hook: terminate the forked workload if still running, report
 * when it was killed by a signal, and re-raise any fatal signal so our
 * own exit status reflects it (SIGUSR1 is treated as a normal exit).
 */
152 static void sig_atexit(void)
158 kill(child_pid, SIGTERM);
161 if (WIFSIGNALED(status))
162 psignal(WTERMSIG(status), progname);
/* Plain exit on normal termination or SIGUSR1. */
165 if (signr == -1 || signr == SIGUSR1)
/* Restore default disposition and re-raise to die with the right status. */
168 signal(signr, SIG_DFL);
169 kill(getpid(), signr);
/*
 * Fill in one evsel's perf_event_attr from the global command-line
 * state: sampling frequency/period, the PERF_SAMPLE_* bits needed by
 * the selected options, and inheritance/enable-on-exec behavior.
 */
172 static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
174 struct perf_event_attr *attr = &evsel->attr;
175 int track = !evsel->idx; /* only the first counter needs these */
178 attr->inherit = !no_inherit;
179 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
180 PERF_FORMAT_TOTAL_TIME_RUNNING |
183 attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
/* With more than one event, samples need an ID to be demultiplexed. */
185 if (evlist->nr_entries > 1)
186 attr->sample_type |= PERF_SAMPLE_ID;
189 * We default some events to a 1 default interval. But keep
190 * it a weak assumption overridable by the user.
192 if (!attr->sample_period || (user_freq != UINT_MAX &&
193 user_interval != ULLONG_MAX)) {
195 attr->sample_type |= PERF_SAMPLE_PERIOD;
197 attr->sample_freq = freq;
199 attr->sample_period = default_interval;
/* --no-samples: disable sampling entirely. */
204 attr->sample_freq = 0;
207 attr->inherit_stat = 1;
209 if (sample_address) {
210 attr->sample_type |= PERF_SAMPLE_ADDR;
211 attr->mmap_data = track;
215 attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
218 attr->sample_type |= PERF_SAMPLE_CPU;
/*
 * Timestamps are wanted whenever samples can come from multiple
 * sources (inherited children, system-wide or cpu-list mode).
 */
220 if (sample_id_all_avail &&
221 (sample_time || system_wide || !no_inherit || cpu_list))
222 attr->sample_type |= PERF_SAMPLE_TIME;
/* Raw samples (-R) always carry time, raw data and cpu. */
225 attr->sample_type |= PERF_SAMPLE_TIME;
226 attr->sample_type |= PERF_SAMPLE_RAW;
227 attr->sample_type |= PERF_SAMPLE_CPU;
/* -D/--no-delay: wake the reader on every event. */
232 attr->wakeup_events = 1;
/* Forked workload: start counting only once the child exec()s. */
238 if (target_pid == -1 && target_tid == -1 && !system_wide) {
240 attr->enable_on_exec = 1;
244 static bool perf_evlist__equal(struct perf_evlist *evlist,
245 struct perf_evlist *other)
247 struct perf_evsel *pos, *pair;
249 if (evlist->nr_entries != other->nr_entries)
252 pair = list_entry(other->entries.next, struct perf_evsel, node);
254 list_for_each_entry(pos, &evlist->entries, node) {
255 if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
257 pair = list_entry(pair->node.next, struct perf_evsel, node);
/*
 * Configure and open every counter in @evlist on the target cpus and
 * threads, mmap the ring buffers, apply event filters, and attach the
 * evlist to the session.  Contains fallback paths for older kernels
 * (no sample_id_all) and for machines without a cycles PMU event.
 */
263 static void open_counters(struct perf_evlist *evlist)
265 struct perf_evsel *pos;
267 if (evlist->cpus->map[0] < 0)
270 list_for_each_entry(pos, &evlist->entries, node) {
271 struct perf_event_attr *attr = &pos->attr;
273 * Check if parse_single_tracepoint_event has already asked for
276 * XXX this is kludgy but short term fix for problems introduced by
277 * eac23d1c that broke 'perf script' by having different sample_types
278 * when using multiple tracepoint events when we use a perf binary
279 * that tries to use sample_id_all on an older kernel.
281 * We need to move counter creation to perf_session, support
282 * different sample_types, etc.
284 bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
286 config_attr(pos, evlist);
288 attr->sample_id_all = sample_id_all_avail ? 1 : 0;
290 if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group) < 0) {
/* Permission failure: point the user at perf_event_paranoid. */
293 if (err == EPERM || err == EACCES) {
294 ui__warning_paranoid();
296 } else if (err == ENODEV && cpu_list) {
297 die("No such device - did you specify"
298 " an out-of-range profile CPU?\n");
/* Old kernel without sample_id_all: clear the flag and retry the open. */
299 } else if (err == EINVAL && sample_id_all_avail) {
301 * Old kernel, no attr->sample_id_type_all field
303 sample_id_all_avail = false;
304 if (!sample_time && !raw_samples && !time_needed)
305 attr->sample_type &= ~PERF_SAMPLE_TIME;
307 goto retry_sample_id;
311 * If it's cycles then fall back to hrtimer
312 * based cpu-clock-tick sw counter, which
313 * is always available even if no PMU support:
315 if (attr->type == PERF_TYPE_HARDWARE
316 && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
319 ui__warning("The cycles event is not supported, "
320 "trying to fall back to cpu-clock-ticks\n");
321 attr->type = PERF_TYPE_SOFTWARE;
322 attr->config = PERF_COUNT_SW_CPU_CLOCK;
327 ui__warning("The %s event is not supported.\n",
333 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
336 #if defined(__i386__) || defined(__x86_64__)
337 if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
338 die("No hardware sampling interrupt available."
339 " No APIC? If so then you can boot the kernel"
340 " with the \"lapic\" boot parameter to"
341 " force-enable it.\n");
344 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
/* Apply any --filter expressions to the opened events. */
348 if (perf_evlist__set_filters(evlist)) {
349 error("failed to set filter with %d (%s)\n", errno,
/* Map the per-cpu/per-thread ring buffers. */
354 if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
355 die("failed to mmap with %d (%s)\n", errno, strerror(errno));
358 session->evlist = evlist;
/* Appending: the new evlist must match the one already in the header. */
360 if (!perf_evlist__equal(session->evlist, evlist)) {
361 fprintf(stderr, "incompatible append\n");
366 perf_session__update_sample_type(session);
/*
 * Re-scan the sample data just written to the output file (from
 * post_processing_offset to the current position) and mark the DSOs
 * that got hits, so only their build-ids land in the header.
 */
369 static int process_buildids(void)
371 u64 size = lseek(output, 0, SEEK_CUR);
376 session->fd = output;
377 return __perf_session__process_events(session, post_processing_offset,
378 size - post_processing_offset,
379 size, &build_id__mark_dso_hit_ops);
/*
 * atexit hook: finalize the perf.data header with the actual data size,
 * rewrite it, and release the session and evlist resources.
 */
382 static void atexit_header(void)
385 session->header.data_size += bytes_written;
389 perf_session__write_header(session, evsel_list, output, true);
390 perf_session__delete(session);
391 perf_evlist__delete(evsel_list);
/*
 * Per-machine callback (see perf_session__process_machines): synthesize
 * module and kernel mmap events for one guest machine; the host machine
 * is skipped.  @data is the perf_session.
 */
396 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
399 struct perf_session *psession = data;
401 if (machine__is_host(machine))
405 *As for guest kernel when processing subcommand record&report,
406 *we arrange module mmap prior to guest kernel mmap and trigger
407 *a preload dso because default guest module symbols are loaded
408 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
409 *method is used to avoid symbol missing when the first addr is
410 *in module instead of in guest kernel.
412 err = perf_event__synthesize_modules(process_synthesized_event,
415 pr_err("Couldn't record guest kernel [%d]'s reference"
416 " relocation symbol.\n", machine->pid);
419 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
420 * have no _text sometimes.
422 err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
423 psession, machine, "_text");
/* Fallback: retry with _stext when _text is absent from kallsyms. */
425 err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
429 pr_err("Couldn't record guest kernel [%d]'s reference"
430 " relocation symbol.\n", machine->pid);
/*
 * Header-only marker event flushed after each full pass over the mmaps
 * (see mmap_read_all); lets 'perf report' sort samples across buffers.
 */
433 static struct perf_event_header finished_round_event = {
434 .size = sizeof(struct perf_event_header),
435 .type = PERF_RECORD_FINISHED_ROUND,
/*
 * Drain every mapped ring buffer once, then emit a FINISHED_ROUND
 * marker when tracepoint data is being recorded.
 */
438 static void mmap_read_all(void)
442 for (i = 0; i < evsel_list->nr_mmaps; i++) {
443 if (evsel_list->mmap[i].base)
444 mmap_read(&evsel_list->mmap[i]);
447 if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
448 write_output(&finished_round_event, sizeof(finished_round_event));
/*
 * Main record driver: set up the output file and session, optionally
 * fork the workload (synchronized via two pipes), open the counters,
 * write/synthesize the header and bootstrap events, then loop draining
 * the mmaps until the workload exits or the user interrupts.
 */
451 static int __cmd_record(int argc, const char **argv)
456 unsigned long waking = 0;
457 int child_ready_pipe[2], go_pipe[2];
458 const bool forks = argc > 0;
460 struct machine *machine;
464 page_size = sysconf(_SC_PAGE_SIZE);
467 signal(SIGCHLD, sig_handler);
468 signal(SIGINT, sig_handler);
469 signal(SIGUSR1, sig_handler);
471 if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
472 perror("failed to create pipes");
/* No -o given: write to a pipe if stdout is one, else to perf.data. */
477 if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
480 output_name = "perf.data";
483 if (!strcmp(output_name, "-"))
/* Existing non-empty output file: rotate to .old or honor append mode. */
485 else if (!stat(output_name, &st) && st.st_size) {
486 if (write_mode == WRITE_FORCE) {
487 char oldname[PATH_MAX];
488 snprintf(oldname, sizeof(oldname), "%s.old",
491 rename(output_name, oldname);
493 } else if (write_mode == WRITE_APPEND) {
494 write_mode = WRITE_FORCE;
498 flags = O_CREAT|O_RDWR;
499 if (write_mode == WRITE_APPEND)
505 output = STDOUT_FILENO;
507 output = open(output_name, flags, S_IRUSR | S_IWUSR);
509 perror("failed to create output file");
513 session = perf_session__new(output_name, O_WRONLY,
514 write_mode == WRITE_FORCE, false, NULL);
515 if (session == NULL) {
516 pr_err("Not enough memory for reading perf file header\n");
521 perf_header__set_feat(&session->header, HEADER_BUILD_ID);
/* Appending: read the existing header first. */
524 err = perf_session__read_header(session, output);
526 goto out_delete_session;
529 if (have_tracepoints(&evsel_list->entries))
530 perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
/* Advertise the header feature sections this file will carry. */
532 perf_header__set_feat(&session->header, HEADER_HOSTNAME);
533 perf_header__set_feat(&session->header, HEADER_OSRELEASE);
534 perf_header__set_feat(&session->header, HEADER_ARCH);
535 perf_header__set_feat(&session->header, HEADER_CPUDESC);
536 perf_header__set_feat(&session->header, HEADER_NRCPUS);
537 perf_header__set_feat(&session->header, HEADER_EVENT_DESC);
538 perf_header__set_feat(&session->header, HEADER_CMDLINE);
539 perf_header__set_feat(&session->header, HEADER_VERSION);
540 perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY);
541 perf_header__set_feat(&session->header, HEADER_TOTAL_MEM);
542 perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY);
543 perf_header__set_feat(&session->header, HEADER_CPUID);
545 /* 512 kiB: default amount of unprivileged mlocked memory */
546 if (mmap_pages == UINT_MAX)
547 mmap_pages = (512 * 1024) / page_size;
/* Fork the workload; child half follows. */
552 perror("failed to fork");
559 close(child_ready_pipe[0]);
561 fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
564 * Do a dummy execvp to get the PLT entry resolved,
565 * so we avoid the resolver overhead on the real
568 execvp("", (char **)argv);
571 * Tell the parent we're ready to go
573 close(child_ready_pipe[1]);
576 * Wait until the parent tells us to go.
578 if (read(go_pipe[0], &buf, 1) == -1)
579 perror("unable to read pipe");
581 execvp(argv[0], (char **)argv);
/* exec failed: signal the parent with SIGUSR1 so it can report it. */
584 kill(getppid(), SIGUSR1);
/* Parent: without an explicit target, profile the forked child. */
588 if (!system_wide && target_tid == -1 && target_pid == -1)
589 evsel_list->threads->map[0] = child_pid;
591 close(child_ready_pipe[1]);
594 * wait for child to settle
596 if (read(child_ready_pipe[0], &buf, 1) == -1) {
597 perror("unable to read pipe");
600 close(child_ready_pipe[0]);
603 open_counters(evsel_list);
606 * perf_session__delete(session) will be called at atexit_header()
608 atexit(atexit_header);
611 err = perf_header__write_pipe(output);
614 } else if (file_new) {
615 err = perf_session__write_header(session, evsel_list,
/* Remember where the header ends: build-id post-processing starts here. */
621 post_processing_offset = lseek(output, 0, SEEK_CUR);
/* Pipe mode: synthesize attrs/event types/tracing data into the stream. */
624 err = perf_session__synthesize_attrs(session,
625 process_synthesized_event);
627 pr_err("Couldn't synthesize attrs.\n");
631 err = perf_event__synthesize_event_types(process_synthesized_event,
634 pr_err("Couldn't synthesize event_types.\n");
638 if (have_tracepoints(&evsel_list->entries)) {
640 * FIXME err <= 0 here actually means that
641 * there were no tracepoints so its not really
642 * an error, just that we don't need to
643 * synthesize anything. We really have to
644 * return this more properly and also
645 * propagate errors that now are calling die()
647 err = perf_event__synthesize_tracing_data(output, evsel_list,
648 process_synthesized_event,
651 pr_err("Couldn't record tracing data.\n");
658 machine = perf_session__find_host_machine(session);
660 pr_err("Couldn't find native kernel information.\n");
/* Kernel mmap: try _text, fall back to _stext (see guest variant above). */
664 err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
665 session, machine, "_text");
667 err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
668 session, machine, "_stext");
670 pr_err("Couldn't record kernel reference relocation symbol\n"
671 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
672 "Check /proc/kallsyms permission or run as root.\n");
674 err = perf_event__synthesize_modules(process_synthesized_event,
677 pr_err("Couldn't record kernel module information.\n"
678 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
679 "Check /proc/modules permission or run as root.\n");
682 perf_session__process_machines(session,
683 perf_event__synthesize_guest_os);
/* Seed the thread/comm events for the targeted threads (or all of them). */
686 perf_event__synthesize_thread_map(evsel_list->threads,
687 process_synthesized_event,
690 perf_event__synthesize_threads(process_synthesized_event,
694 struct sched_param param;
696 param.sched_priority = realtime_prio;
/* NOTE(review): "¶m" below is mojibake for "&param" — repair when restoring this file. */
697 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
698 pr_err("Could not set realtime priority.\n");
703 perf_evlist__enable(evsel_list);
/* Main loop: block in poll() when a pass produced no new samples. */
716 if (hits == samples) {
719 err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
724 perf_evlist__disable(evsel_list);
727 if (quiet || signr == SIGUSR1)
730 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
733 * Approximate RIP event size: 24 bytes.
736 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
737 (double)bytes_written / 1024.0 / 1024.0,
744 perf_session__delete(session);
/* Usage strings printed by usage_with_options(). */
748 static const char * const record_usage[] = {
749 "perf record [<options>] [<command>]",
750 "perf record [<options>] -- <command> [<options>]",
/* -f / -A are mutually exclusive; validated in cmd_record(). */
754 static bool force, append_file;
/*
 * Command-line option table; each entry writes into one of the
 * file-scope variables declared above.
 */
756 const struct option record_options[] = {
757 OPT_CALLBACK('e', "event", &evsel_list, "event",
758 "event selector. use 'perf list' to list available events",
759 parse_events_option),
760 OPT_CALLBACK(0, "filter", &evsel_list, "filter",
761 "event filter", parse_filter),
762 OPT_INTEGER('p', "pid", &target_pid,
763 "record events on existing process id"),
764 OPT_INTEGER('t', "tid", &target_tid,
765 "record events on existing thread id"),
766 OPT_INTEGER('r', "realtime", &realtime_prio,
767 "collect data with this RT SCHED_FIFO priority"),
768 OPT_BOOLEAN('D', "no-delay", &nodelay,
769 "collect data without buffering"),
770 OPT_BOOLEAN('R', "raw-samples", &raw_samples,
771 "collect raw sample records from all opened counters"),
772 OPT_BOOLEAN('a', "all-cpus", &system_wide,
773 "system-wide collection from all CPUs"),
774 OPT_BOOLEAN('A', "append", &append_file,
775 "append to the output file to do incremental profiling"),
776 OPT_STRING('C', "cpu", &cpu_list, "cpu",
777 "list of cpus to monitor"),
778 OPT_BOOLEAN('f', "force", &force,
779 "overwrite existing data file (deprecated)"),
780 OPT_U64('c', "count", &user_interval, "event period to sample"),
781 OPT_STRING('o', "output", &output_name, "file",
783 OPT_BOOLEAN('i', "no-inherit", &no_inherit,
784 "child tasks do not inherit counters"),
785 OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"),
786 OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
787 OPT_BOOLEAN(0, "group", &group,
788 "put the counters into a counter group"),
789 OPT_BOOLEAN('g', "call-graph", &call_graph,
790 "do call-graph (stack chain/backtrace) recording"),
791 OPT_INCR('v', "verbose", &verbose,
792 "be more verbose (show counter open errors, etc)"),
793 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
794 OPT_BOOLEAN('s', "stat", &inherit_stat,
795 "per thread counts"),
796 OPT_BOOLEAN('d', "data", &sample_address,
798 OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
799 OPT_BOOLEAN('n', "no-samples", &no_samples,
801 OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
802 "do not update the buildid cache"),
803 OPT_BOOLEAN('B', "no-buildid", &no_buildid,
804 "do not collect buildids in perf.data"),
805 OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
806 "monitor event in cgroup name only",
811 int cmd_record(int argc, const char **argv, const char *prefix __used)
814 struct perf_evsel *pos;
816 perf_header__set_cmdline(argc, argv);
818 evsel_list = perf_evlist__new(NULL, NULL);
819 if (evsel_list == NULL)
822 argc = parse_options(argc, argv, record_options, record_usage,
823 PARSE_OPT_STOP_AT_NON_OPTION);
824 if (!argc && target_pid == -1 && target_tid == -1 &&
825 !system_wide && !cpu_list)
826 usage_with_options(record_usage, record_options);
828 if (force && append_file) {
829 fprintf(stderr, "Can't overwrite and append at the same time."
830 " You need to choose between -f and -A");
831 usage_with_options(record_usage, record_options);
832 } else if (append_file) {
833 write_mode = WRITE_APPEND;
835 write_mode = WRITE_FORCE;
838 if (nr_cgroups && !system_wide) {
839 fprintf(stderr, "cgroup monitoring only available in"
840 " system-wide mode\n");
841 usage_with_options(record_usage, record_options);
846 if (symbol_conf.kptr_restrict)
848 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
849 "check /proc/sys/kernel/kptr_restrict.\n\n"
850 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
851 "file is not found in the buildid cache or in the vmlinux path.\n\n"
852 "Samples in kernel modules won't be resolved at all.\n\n"
853 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
854 "even with a suitable vmlinux or kallsyms file.\n\n");
856 if (no_buildid_cache || no_buildid)
857 disable_buildid_cache();
859 if (evsel_list->nr_entries == 0 &&
860 perf_evlist__add_default(evsel_list) < 0) {
861 pr_err("Not enough memory for event selector list\n");
862 goto out_symbol_exit;
865 if (target_pid != -1)
866 target_tid = target_pid;
868 if (perf_evlist__create_maps(evsel_list, target_pid,
869 target_tid, cpu_list) < 0)
870 usage_with_options(record_usage, record_options);
872 list_for_each_entry(pos, &evsel_list->entries, node) {
873 if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
874 evsel_list->threads->nr) < 0)
876 if (perf_header__push_event(pos->attr.config, event_name(pos)))
880 if (perf_evlist__alloc_pollfd(evsel_list) < 0)
883 if (user_interval != ULLONG_MAX)
884 default_interval = user_interval;
885 if (user_freq != UINT_MAX)
889 * User specified count overrides default frequency.
891 if (default_interval)
894 default_interval = freq;
896 fprintf(stderr, "frequency and count are zero, aborting\n");
901 err = __cmd_record(argc, argv);
903 perf_evlist__delete_maps(evsel_list);