1 /*
2  * Performance events core code:
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8  *
9  * For licensing details see kernel-base/COPYING
10  */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/file.h>
17 #include <linux/poll.h>
18 #include <linux/sysfs.h>
19 #include <linux/dcache.h>
20 #include <linux/percpu.h>
21 #include <linux/ptrace.h>
22 #include <linux/vmstat.h>
23 #include <linux/vmalloc.h>
24 #include <linux/hardirq.h>
25 #include <linux/rculist.h>
26 #include <linux/uaccess.h>
27 #include <linux/syscalls.h>
28 #include <linux/anon_inodes.h>
29 #include <linux/kernel_stat.h>
30 #include <linux/perf_event.h>
31 #include <linux/ftrace_event.h>
32 #include <linux/hw_breakpoint.h>
33
34 #include <asm/irq_regs.h>
35
36 /*
37  * Each CPU has a list of per CPU events:
38  */
39 static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
40
41 int perf_max_events __read_mostly = 1;
42 static int perf_reserved_percpu __read_mostly;
43 static int perf_overcommit __read_mostly = 1;
44
45 static atomic_t nr_events __read_mostly;
46 static atomic_t nr_mmap_events __read_mostly;
47 static atomic_t nr_comm_events __read_mostly;
48 static atomic_t nr_task_events __read_mostly;
49
50 /*
51  * perf event paranoia level:
52  *  -1 - not paranoid at all
53  *   0 - disallow raw tracepoint access for unpriv
54  *   1 - disallow cpu events for unpriv
55  *   2 - disallow kernel profiling for unpriv
56  */
57 int sysctl_perf_event_paranoid __read_mostly = 1;
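/*
 * The paranoia level is a sysctl; it can typically be adjusted at
 * runtime, e.g.:
 *
 *	# echo 2 > /proc/sys/kernel/perf_event_paranoid
 *
 * or, equivalently: sysctl kernel.perf_event_paranoid=2
 */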
58
59 static inline bool perf_paranoid_tracepoint_raw(void)
60 {
61         return sysctl_perf_event_paranoid > -1;
62 }
63
64 static inline bool perf_paranoid_cpu(void)
65 {
66         return sysctl_perf_event_paranoid > 0;
67 }
68
69 static inline bool perf_paranoid_kernel(void)
70 {
71         return sysctl_perf_event_paranoid > 1;
72 }
73
74 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75
76 /*
77  * max perf event sample rate
78  */
79 int sysctl_perf_event_sample_rate __read_mostly = 100000;
80
81 static atomic64_t perf_event_id;
82
83 /*
84  * Lock for (sysadmin-configurable) event reservations:
85  */
86 static DEFINE_SPINLOCK(perf_resource_lock);
87
88 /*
89  * Architecture provided APIs - weak aliases:
90  */
91 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
92 {
93         return NULL;
94 }
95
96 void __weak hw_perf_disable(void)               { barrier(); }
97 void __weak hw_perf_enable(void)                { barrier(); }
98
99 void __weak hw_perf_event_setup(int cpu)        { barrier(); }
100 void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101
102 int __weak
103 hw_perf_group_sched_in(struct perf_event *group_leader,
104                struct perf_cpu_context *cpuctx,
105                struct perf_event_context *ctx, int cpu)
106 {
107         return 0;
108 }
109
110 void __weak perf_event_print_debug(void)        { }
111
112 static DEFINE_PER_CPU(int, perf_disable_count);
113
114 void __perf_disable(void)
115 {
116         __get_cpu_var(perf_disable_count)++;
117 }
118
119 bool __perf_enable(void)
120 {
121         return !--__get_cpu_var(perf_disable_count);
122 }
123
124 void perf_disable(void)
125 {
126         __perf_disable();
127         hw_perf_disable();
128 }
129
130 void perf_enable(void)
131 {
132         if (__perf_enable())
133                 hw_perf_enable();
134 }
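/*
 * perf_disable()/perf_enable() nest via a per-CPU count: every
 * perf_disable() calls hw_perf_disable(), but hw_perf_enable() only
 * runs once the outermost perf_enable() drops the count back to zero.
 * Illustrative sequence:
 *
 *	perf_disable();		// count 0 -> 1, hw_perf_disable()
 *	perf_disable();		// count 1 -> 2, hw_perf_disable() again
 *	perf_enable();		// count 2 -> 1, hardware stays disabled
 *	perf_enable();		// count 1 -> 0, hw_perf_enable()
 */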
135
136 static void get_ctx(struct perf_event_context *ctx)
137 {
138         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
139 }
140
141 static void free_ctx(struct rcu_head *head)
142 {
143         struct perf_event_context *ctx;
144
145         ctx = container_of(head, struct perf_event_context, rcu_head);
146         kfree(ctx);
147 }
148
149 static void put_ctx(struct perf_event_context *ctx)
150 {
151         if (atomic_dec_and_test(&ctx->refcount)) {
152                 if (ctx->parent_ctx)
153                         put_ctx(ctx->parent_ctx);
154                 if (ctx->task)
155                         put_task_struct(ctx->task);
156                 call_rcu(&ctx->rcu_head, free_ctx);
157         }
158 }
159
160 static void unclone_ctx(struct perf_event_context *ctx)
161 {
162         if (ctx->parent_ctx) {
163                 put_ctx(ctx->parent_ctx);
164                 ctx->parent_ctx = NULL;
165         }
166 }
167
168 /*
169  * If we inherit events we want to return the parent event id
170  * to userspace.
171  */
172 static u64 primary_event_id(struct perf_event *event)
173 {
174         u64 id = event->id;
175
176         if (event->parent)
177                 id = event->parent->id;
178
179         return id;
180 }
181
182 /*
183  * Get the perf_event_context for a task and lock it.
184  * This has to cope with the fact that until it is locked,
185  * the context could get moved to another task.
186  */
187 static struct perf_event_context *
188 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 {
190         struct perf_event_context *ctx;
191
192         rcu_read_lock();
193  retry:
194         ctx = rcu_dereference(task->perf_event_ctxp);
195         if (ctx) {
196                 /*
197                  * If this context is a clone of another, it might
198                  * get swapped for another underneath us by
199                  * perf_event_task_sched_out, though the
200                  * rcu_read_lock() protects us from any context
201                  * getting freed.  Lock the context and check if it
202                  * got swapped before we could get the lock, and retry
203                  * if so.  If we locked the right context, then it
204                  * can't get swapped on us any more.
205                  */
206                 raw_spin_lock_irqsave(&ctx->lock, *flags);
207                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
208                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
209                         goto retry;
210                 }
211
212                 if (!atomic_inc_not_zero(&ctx->refcount)) {
213                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
214                         ctx = NULL;
215                 }
216         }
217         rcu_read_unlock();
218         return ctx;
219 }
220
221 /*
222  * Get the context for a task and increment its pin_count so it
223  * can't get swapped to another task.  This also increments its
224  * reference count so that the context can't get freed.
225  */
226 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
227 {
228         struct perf_event_context *ctx;
229         unsigned long flags;
230
231         ctx = perf_lock_task_context(task, &flags);
232         if (ctx) {
233                 ++ctx->pin_count;
234                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
235         }
236         return ctx;
237 }
238
239 static void perf_unpin_context(struct perf_event_context *ctx)
240 {
241         unsigned long flags;
242
243         raw_spin_lock_irqsave(&ctx->lock, flags);
244         --ctx->pin_count;
245         raw_spin_unlock_irqrestore(&ctx->lock, flags);
246         put_ctx(ctx);
247 }
248
249 static inline u64 perf_clock(void)
250 {
251         return cpu_clock(smp_processor_id());
252 }
253
254 /*
255  * Update the record of the current time in a context.
256  */
257 static void update_context_time(struct perf_event_context *ctx)
258 {
259         u64 now = perf_clock();
260
261         ctx->time += now - ctx->timestamp;
262         ctx->timestamp = now;
263 }
264
265 /*
266  * Update the total_time_enabled and total_time_running fields for an event.
267  */
268 static void update_event_times(struct perf_event *event)
269 {
270         struct perf_event_context *ctx = event->ctx;
271         u64 run_end;
272
273         if (event->state < PERF_EVENT_STATE_INACTIVE ||
274             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275                 return;
276
277         if (ctx->is_active)
278                 run_end = ctx->time;
279         else
280                 run_end = event->tstamp_stopped;
281
282         event->total_time_enabled = run_end - event->tstamp_enabled;
283
284         if (event->state == PERF_EVENT_STATE_INACTIVE)
285                 run_end = event->tstamp_stopped;
286         else
287                 run_end = ctx->time;
288
289         event->total_time_running = run_end - event->tstamp_running;
290 }
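/*
 * Worked example (illustrative values): for an active context with
 * ctx->time == 300 and an INACTIVE event that has tstamp_enabled == 100,
 * tstamp_running == 120 and tstamp_stopped == 180, update_event_times()
 * yields total_time_enabled == 200 and total_time_running == 60.
 */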
291
292 static struct list_head *
293 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
294 {
295         if (event->attr.pinned)
296                 return &ctx->pinned_groups;
297         else
298                 return &ctx->flexible_groups;
299 }
300
301 /*
302  * Add an event to the lists for its context.
303  * Must be called with ctx->mutex and ctx->lock held.
304  */
305 static void
306 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
307 {
308         struct perf_event *group_leader = event->group_leader;
309
310         /*
311          * Depending on whether it is a standalone or sibling event,
312          * add it straight to the context's event list, or to the group
313          * leader's sibling list:
314          */
315         if (group_leader == event) {
316                 struct list_head *list;
317
318                 if (is_software_event(event))
319                         event->group_flags |= PERF_GROUP_SOFTWARE;
320
321                 list = ctx_group_list(event, ctx);
322                 list_add_tail(&event->group_entry, list);
323         } else {
324                 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
325                     !is_software_event(event))
326                         group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
327
328                 list_add_tail(&event->group_entry, &group_leader->sibling_list);
329                 group_leader->nr_siblings++;
330         }
331
332         list_add_rcu(&event->event_entry, &ctx->event_list);
333         ctx->nr_events++;
334         if (event->attr.inherit_stat)
335                 ctx->nr_stat++;
336 }
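/*
 * The resulting structure, e.g. for a group leader L with siblings S1
 * and S2: L sits on ctx->pinned_groups or ctx->flexible_groups via its
 * group_entry, S1 and S2 sit on L->sibling_list, and all three are on
 * ctx->event_list via their event_entry.
 */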
337
338 /*
339  * Remove an event from the lists for its context.
340  * Must be called with ctx->mutex and ctx->lock held.
341  */
342 static void
343 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
344 {
345         struct perf_event *sibling, *tmp;
346
347         if (list_empty(&event->group_entry))
348                 return;
349         ctx->nr_events--;
350         if (event->attr.inherit_stat)
351                 ctx->nr_stat--;
352
353         list_del_init(&event->group_entry);
354         list_del_rcu(&event->event_entry);
355
356         if (event->group_leader != event)
357                 event->group_leader->nr_siblings--;
358
359         update_event_times(event);
360
361         /*
362          * If the event was in error state, keep it
363          * that way; otherwise bogus counts will be
364          * returned on read(). The only way to get out
365          * of error state is by explicitly re-enabling
366          * the event.
367          */
368         if (event->state > PERF_EVENT_STATE_OFF)
369                 event->state = PERF_EVENT_STATE_OFF;
370
371         /*
372          * If this was a group event with sibling events then
373          * upgrade the siblings to singleton events by adding them
374          * to the context list directly:
375          */
376         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
377                 struct list_head *list;
378
379                 list = ctx_group_list(event, ctx);
380                 list_move_tail(&sibling->group_entry, list);
381                 sibling->group_leader = sibling;
382
383                 /* Inherit group flags from the previous leader */
384                 sibling->group_flags = event->group_flags;
385         }
386 }
387
388 static void
389 event_sched_out(struct perf_event *event,
390                   struct perf_cpu_context *cpuctx,
391                   struct perf_event_context *ctx)
392 {
393         if (event->state != PERF_EVENT_STATE_ACTIVE)
394                 return;
395
396         event->state = PERF_EVENT_STATE_INACTIVE;
397         if (event->pending_disable) {
398                 event->pending_disable = 0;
399                 event->state = PERF_EVENT_STATE_OFF;
400         }
401         event->tstamp_stopped = ctx->time;
402         event->pmu->disable(event);
403         event->oncpu = -1;
404
405         if (!is_software_event(event))
406                 cpuctx->active_oncpu--;
407         ctx->nr_active--;
408         if (event->attr.exclusive || !cpuctx->active_oncpu)
409                 cpuctx->exclusive = 0;
410 }
411
412 static void
413 group_sched_out(struct perf_event *group_event,
414                 struct perf_cpu_context *cpuctx,
415                 struct perf_event_context *ctx)
416 {
417         struct perf_event *event;
418
419         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
420                 return;
421
422         event_sched_out(group_event, cpuctx, ctx);
423
424         /*
425          * Schedule out siblings (if any):
426          */
427         list_for_each_entry(event, &group_event->sibling_list, group_entry)
428                 event_sched_out(event, cpuctx, ctx);
429
430         if (group_event->attr.exclusive)
431                 cpuctx->exclusive = 0;
432 }
433
434 /*
435  * Cross CPU call to remove a performance event
436  *
437  * We disable the event on the hardware level first. After that we
438  * remove it from the context list.
439  */
440 static void __perf_event_remove_from_context(void *info)
441 {
442         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
443         struct perf_event *event = info;
444         struct perf_event_context *ctx = event->ctx;
445
446         /*
447          * If this is a task context, we need to check whether it is
448          * the current task context of this cpu. If not it has been
449          * scheduled out before the smp call arrived.
450          */
451         if (ctx->task && cpuctx->task_ctx != ctx)
452                 return;
453
454         raw_spin_lock(&ctx->lock);
455         /*
456          * Protect the list operation against NMI by disabling the
457          * events on a global level.
458          */
459         perf_disable();
460
461         event_sched_out(event, cpuctx, ctx);
462
463         list_del_event(event, ctx);
464
465         if (!ctx->task) {
466                 /*
467                  * Allow more per task events with respect to the
468                  * reservation:
469                  */
470                 cpuctx->max_pertask =
471                         min(perf_max_events - ctx->nr_events,
472                             perf_max_events - perf_reserved_percpu);
473         }
474
475         perf_enable();
476         raw_spin_unlock(&ctx->lock);
477 }
478
479
480 /*
481  * Remove the event from a task's (or a CPU's) list of events.
482  *
483  * Must be called with ctx->mutex held.
484  *
485  * CPU events are removed with a smp call. For task events we only
486  * call when the task is on a CPU.
487  *
488  * If event->ctx is a cloned context, callers must make sure that
489  * every task struct that event->ctx->task could possibly point to
490  * remains valid.  This is OK when called from perf_release since
491  * that only calls us on the top-level context, which can't be a clone.
492  * When called from perf_event_exit_task, it's OK because the
493  * context has been detached from its task.
494  */
495 static void perf_event_remove_from_context(struct perf_event *event)
496 {
497         struct perf_event_context *ctx = event->ctx;
498         struct task_struct *task = ctx->task;
499
500         if (!task) {
501                 /*
502                  * Per cpu events are removed via an smp call and
503                  * the removal is always successful.
504                  */
505                 smp_call_function_single(event->cpu,
506                                          __perf_event_remove_from_context,
507                                          event, 1);
508                 return;
509         }
510
511 retry:
512         task_oncpu_function_call(task, __perf_event_remove_from_context,
513                                  event);
514
515         raw_spin_lock_irq(&ctx->lock);
516         /*
517          * If the context is active we need to retry the smp call.
518          */
519         if (ctx->nr_active && !list_empty(&event->group_entry)) {
520                 raw_spin_unlock_irq(&ctx->lock);
521                 goto retry;
522         }
523
524         /*
525          * The lock prevents this context from being scheduled in, so we
526          * can remove the event safely if the call above did not
527          * succeed.
528          */
529         if (!list_empty(&event->group_entry))
530                 list_del_event(event, ctx);
531         raw_spin_unlock_irq(&ctx->lock);
532 }
533
534 /*
535  * Update total_time_enabled and total_time_running for all events in a group.
536  */
537 static void update_group_times(struct perf_event *leader)
538 {
539         struct perf_event *event;
540
541         update_event_times(leader);
542         list_for_each_entry(event, &leader->sibling_list, group_entry)
543                 update_event_times(event);
544 }
545
546 /*
547  * Cross CPU call to disable a performance event
548  */
549 static void __perf_event_disable(void *info)
550 {
551         struct perf_event *event = info;
552         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
553         struct perf_event_context *ctx = event->ctx;
554
555         /*
556          * If this is a per-task event, we need to check whether this
557          * event's task is the current task on this cpu.
558          */
559         if (ctx->task && cpuctx->task_ctx != ctx)
560                 return;
561
562         raw_spin_lock(&ctx->lock);
563
564         /*
565          * If the event is on, turn it off.
566          * If it is in error state, leave it in error state.
567          */
568         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
569                 update_context_time(ctx);
570                 update_group_times(event);
571                 if (event == event->group_leader)
572                         group_sched_out(event, cpuctx, ctx);
573                 else
574                         event_sched_out(event, cpuctx, ctx);
575                 event->state = PERF_EVENT_STATE_OFF;
576         }
577
578         raw_spin_unlock(&ctx->lock);
579 }
580
581 /*
582  * Disable an event.
583  *
584  * If event->ctx is a cloned context, callers must make sure that
585  * every task struct that event->ctx->task could possibly point to
586  * remains valid.  This condition is satisfied when called through
587  * perf_event_for_each_child or perf_event_for_each because they
588  * hold the top-level event's child_mutex, so any descendant that
589  * goes to exit will block in sync_child_event.
590  * When called from perf_pending_event it's OK because event->ctx
591  * is the current context on this CPU and preemption is disabled,
592  * hence we can't get into perf_event_task_sched_out for this context.
593  */
594 void perf_event_disable(struct perf_event *event)
595 {
596         struct perf_event_context *ctx = event->ctx;
597         struct task_struct *task = ctx->task;
598
599         if (!task) {
600                 /*
601                  * Disable the event on the cpu that it's on
602                  */
603                 smp_call_function_single(event->cpu, __perf_event_disable,
604                                          event, 1);
605                 return;
606         }
607
608  retry:
609         task_oncpu_function_call(task, __perf_event_disable, event);
610
611         raw_spin_lock_irq(&ctx->lock);
612         /*
613          * If the event is still active, we need to retry the cross-call.
614          */
615         if (event->state == PERF_EVENT_STATE_ACTIVE) {
616                 raw_spin_unlock_irq(&ctx->lock);
617                 goto retry;
618         }
619
620         /*
621          * Since we have the lock this context can't be scheduled
622          * in, so we can change the state safely.
623          */
624         if (event->state == PERF_EVENT_STATE_INACTIVE) {
625                 update_group_times(event);
626                 event->state = PERF_EVENT_STATE_OFF;
627         }
628
629         raw_spin_unlock_irq(&ctx->lock);
630 }
631
632 static int
633 event_sched_in(struct perf_event *event,
634                  struct perf_cpu_context *cpuctx,
635                  struct perf_event_context *ctx,
636                  int cpu)
637 {
638         if (event->state <= PERF_EVENT_STATE_OFF)
639                 return 0;
640
641         event->state = PERF_EVENT_STATE_ACTIVE;
642         event->oncpu = cpu;     /* TODO: put 'cpu' into cpuctx->cpu */
643         /*
644          * The new state must be visible before we turn it on in the hardware:
645          */
646         smp_wmb();
647
648         if (event->pmu->enable(event)) {
649                 event->state = PERF_EVENT_STATE_INACTIVE;
650                 event->oncpu = -1;
651                 return -EAGAIN;
652         }
653
654         event->tstamp_running += ctx->time - event->tstamp_stopped;
655
656         if (!is_software_event(event))
657                 cpuctx->active_oncpu++;
658         ctx->nr_active++;
659
660         if (event->attr.exclusive)
661                 cpuctx->exclusive = 1;
662
663         return 0;
664 }
665
666 static int
667 group_sched_in(struct perf_event *group_event,
668                struct perf_cpu_context *cpuctx,
669                struct perf_event_context *ctx,
670                int cpu)
671 {
672         struct perf_event *event, *partial_group;
673         int ret;
674
675         if (group_event->state == PERF_EVENT_STATE_OFF)
676                 return 0;
677
678         ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
679         if (ret)
680                 return ret < 0 ? ret : 0;
681
682         if (event_sched_in(group_event, cpuctx, ctx, cpu))
683                 return -EAGAIN;
684
685         /*
686          * Schedule in siblings as one group (if any):
687          */
688         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
689                 if (event_sched_in(event, cpuctx, ctx, cpu)) {
690                         partial_group = event;
691                         goto group_error;
692                 }
693         }
694
695         return 0;
696
697 group_error:
698         /*
699          * Groups can be scheduled in as one unit only, so undo any
700          * partial group before returning:
701          */
702         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
703                 if (event == partial_group)
704                         break;
705                 event_sched_out(event, cpuctx, ctx);
706         }
707         event_sched_out(group_event, cpuctx, ctx);
708
709         return -EAGAIN;
710 }
711
712 /*
713  * Work out whether we can put this event group on the CPU now.
714  */
715 static int group_can_go_on(struct perf_event *event,
716                            struct perf_cpu_context *cpuctx,
717                            int can_add_hw)
718 {
719         /*
720          * Groups consisting entirely of software events can always go on.
721          */
722         if (event->group_flags & PERF_GROUP_SOFTWARE)
723                 return 1;
724         /*
725          * If an exclusive group is already on, no other hardware
726          * events can go on.
727          */
728         if (cpuctx->exclusive)
729                 return 0;
730         /*
731          * If this group is exclusive and there are already
732          * events on the CPU, it can't go on.
733          */
734         if (event->attr.exclusive && cpuctx->active_oncpu)
735                 return 0;
736         /*
737          * Otherwise, try to add it if all previous groups were able
738          * to go on.
739          */
740         return can_add_hw;
741 }
742
743 static void add_event_to_ctx(struct perf_event *event,
744                                struct perf_event_context *ctx)
745 {
746         list_add_event(event, ctx);
747         event->tstamp_enabled = ctx->time;
748         event->tstamp_running = ctx->time;
749         event->tstamp_stopped = ctx->time;
750 }
751
752 /*
753  * Cross CPU call to install and enable a performance event
754  *
755  * Must be called with ctx->mutex held
756  */
757 static void __perf_install_in_context(void *info)
758 {
759         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
760         struct perf_event *event = info;
761         struct perf_event_context *ctx = event->ctx;
762         struct perf_event *leader = event->group_leader;
763         int cpu = smp_processor_id();
764         int err;
765
766         /*
767          * If this is a task context, we need to check whether it is
768          * the current task context of this cpu. If not it has been
769          * scheduled out before the smp call arrived.
770          * Or possibly this is the right context but it isn't
771          * on this cpu because it had no events.
772          */
773         if (ctx->task && cpuctx->task_ctx != ctx) {
774                 if (cpuctx->task_ctx || ctx->task != current)
775                         return;
776                 cpuctx->task_ctx = ctx;
777         }
778
779         raw_spin_lock(&ctx->lock);
780         ctx->is_active = 1;
781         update_context_time(ctx);
782
783         /*
784          * Protect the list operation against NMI by disabling the
785          * events on a global level. NOP for non NMI based events.
786          */
787         perf_disable();
788
789         add_event_to_ctx(event, ctx);
790
791         if (event->cpu != -1 && event->cpu != smp_processor_id())
792                 goto unlock;
793
794         /*
795          * Don't put the event on if it is disabled or if
796          * it is in a group and the group isn't on.
797          */
798         if (event->state != PERF_EVENT_STATE_INACTIVE ||
799             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
800                 goto unlock;
801
802         /*
803          * An exclusive event can't go on if there are already active
804          * hardware events, and no hardware event can go on if there
805          * is already an exclusive event on.
806          */
807         if (!group_can_go_on(event, cpuctx, 1))
808                 err = -EEXIST;
809         else
810                 err = event_sched_in(event, cpuctx, ctx, cpu);
811
812         if (err) {
813                 /*
814                  * This event couldn't go on.  If it is in a group
815                  * then we have to pull the whole group off.
816                  * If the event group is pinned then put it in error state.
817                  */
818                 if (leader != event)
819                         group_sched_out(leader, cpuctx, ctx);
820                 if (leader->attr.pinned) {
821                         update_group_times(leader);
822                         leader->state = PERF_EVENT_STATE_ERROR;
823                 }
824         }
825
826         if (!err && !ctx->task && cpuctx->max_pertask)
827                 cpuctx->max_pertask--;
828
829  unlock:
830         perf_enable();
831
832         raw_spin_unlock(&ctx->lock);
833 }
834
835 /*
836  * Attach a performance event to a context
837  *
838  * First we add the event to the list with the hardware enable bit
839  * in event->hw_config cleared.
840  *
841  * If the event is attached to a task which is on a CPU we use a smp
842  * call to enable it in the task context. The task might have been
843  * scheduled away, but we check this in the smp call again.
844  *
845  * Must be called with ctx->mutex held.
846  */
847 static void
848 perf_install_in_context(struct perf_event_context *ctx,
849                         struct perf_event *event,
850                         int cpu)
851 {
852         struct task_struct *task = ctx->task;
853
854         if (!task) {
855                 /*
856                  * Per cpu events are installed via an smp call and
857                  * the install is always successful.
858                  */
859                 smp_call_function_single(cpu, __perf_install_in_context,
860                                          event, 1);
861                 return;
862         }
863
864 retry:
865         task_oncpu_function_call(task, __perf_install_in_context,
866                                  event);
867
868         raw_spin_lock_irq(&ctx->lock);
869         /*
870          * If the context is active we need to retry the smp call.
871          */
872         if (ctx->is_active && list_empty(&event->group_entry)) {
873                 raw_spin_unlock_irq(&ctx->lock);
874                 goto retry;
875         }
876
877         /*
878          * The lock prevents this context from being scheduled in, so we
879          * can add the event safely if the call above did not
880          * succeed.
881          */
882         if (list_empty(&event->group_entry))
883                 add_event_to_ctx(event, ctx);
884         raw_spin_unlock_irq(&ctx->lock);
885 }
886
887 /*
888  * Put an event into inactive state and update time fields.
889  * Enabling the leader of a group effectively enables all
890  * the group members that aren't explicitly disabled, so we
891  * have to update their ->tstamp_enabled also.
892  * Note: this works for group members as well as group leaders
893  * since the non-leader members' sibling_lists will be empty.
894  */
895 static void __perf_event_mark_enabled(struct perf_event *event,
896                                         struct perf_event_context *ctx)
897 {
898         struct perf_event *sub;
899
900         event->state = PERF_EVENT_STATE_INACTIVE;
901         event->tstamp_enabled = ctx->time - event->total_time_enabled;
902         list_for_each_entry(sub, &event->sibling_list, group_entry)
903                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
904                         sub->tstamp_enabled =
905                                 ctx->time - sub->total_time_enabled;
906 }
907
908 /*
909  * Cross CPU call to enable a performance event
910  */
911 static void __perf_event_enable(void *info)
912 {
913         struct perf_event *event = info;
914         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
915         struct perf_event_context *ctx = event->ctx;
916         struct perf_event *leader = event->group_leader;
917         int err;
918
919         /*
920          * If this is a per-task event, we need to check whether this
921          * event's task is the current task on this cpu.
922          */
923         if (ctx->task && cpuctx->task_ctx != ctx) {
924                 if (cpuctx->task_ctx || ctx->task != current)
925                         return;
926                 cpuctx->task_ctx = ctx;
927         }
928
929         raw_spin_lock(&ctx->lock);
930         ctx->is_active = 1;
931         update_context_time(ctx);
932
933         if (event->state >= PERF_EVENT_STATE_INACTIVE)
934                 goto unlock;
935         __perf_event_mark_enabled(event, ctx);
936
937         if (event->cpu != -1 && event->cpu != smp_processor_id())
938                 goto unlock;
939
940         /*
941          * If the event is in a group and isn't the group leader,
942          * then don't put it on unless the group is on.
943          */
944         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
945                 goto unlock;
946
947         if (!group_can_go_on(event, cpuctx, 1)) {
948                 err = -EEXIST;
949         } else {
950                 perf_disable();
951                 if (event == leader)
952                         err = group_sched_in(event, cpuctx, ctx,
953                                              smp_processor_id());
954                 else
955                         err = event_sched_in(event, cpuctx, ctx,
956                                                smp_processor_id());
957                 perf_enable();
958         }
959
960         if (err) {
961                 /*
962                  * If this event can't go on and it's part of a
963                  * group, then the whole group has to come off.
964                  */
965                 if (leader != event)
966                         group_sched_out(leader, cpuctx, ctx);
967                 if (leader->attr.pinned) {
968                         update_group_times(leader);
969                         leader->state = PERF_EVENT_STATE_ERROR;
970                 }
971         }
972
973  unlock:
974         raw_spin_unlock(&ctx->lock);
975 }
976
977 /*
978  * Enable an event.
979  *
980  * If event->ctx is a cloned context, callers must make sure that
981  * every task struct that event->ctx->task could possibly point to
982  * remains valid.  This condition is satisfied when called through
983  * perf_event_for_each_child or perf_event_for_each as described
984  * for perf_event_disable.
985  */
986 void perf_event_enable(struct perf_event *event)
987 {
988         struct perf_event_context *ctx = event->ctx;
989         struct task_struct *task = ctx->task;
990
991         if (!task) {
992                 /*
993                  * Enable the event on the cpu that it's on
994                  */
995                 smp_call_function_single(event->cpu, __perf_event_enable,
996                                          event, 1);
997                 return;
998         }
999
1000         raw_spin_lock_irq(&ctx->lock);
1001         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1002                 goto out;
1003
1004         /*
1005          * If the event is in error state, clear that first.
1006          * That way, if we see the event in error state below, we
1007          * know that it has gone back into error state, as distinct
1008          * from the task having been scheduled away before the
1009          * cross-call arrived.
1010          */
1011         if (event->state == PERF_EVENT_STATE_ERROR)
1012                 event->state = PERF_EVENT_STATE_OFF;
1013
1014  retry:
1015         raw_spin_unlock_irq(&ctx->lock);
1016         task_oncpu_function_call(task, __perf_event_enable, event);
1017
1018         raw_spin_lock_irq(&ctx->lock);
1019
1020         /*
1021          * If the context is active and the event is still off,
1022          * we need to retry the cross-call.
1023          */
1024         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
1025                 goto retry;
1026
1027         /*
1028          * Since we have the lock this context can't be scheduled
1029          * in, so we can change the state safely.
1030          */
1031         if (event->state == PERF_EVENT_STATE_OFF)
1032                 __perf_event_mark_enabled(event, ctx);
1033
1034  out:
1035         raw_spin_unlock_irq(&ctx->lock);
1036 }
1037
1038 static int perf_event_refresh(struct perf_event *event, int refresh)
1039 {
1040         /*
1041          * not supported on inherited events
1042          */
1043         if (event->attr.inherit)
1044                 return -EINVAL;
1045
1046         atomic_add(refresh, &event->event_limit);
1047         perf_event_enable(event);
1048
1049         return 0;
1050 }
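/*
 * perf_event_refresh() is normally reached from userspace through the
 * PERF_EVENT_IOC_REFRESH ioctl, e.g.:
 *
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);
 *
 * which allows one more overflow before the event disables itself again.
 */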
1051
1052 enum event_type_t {
1053         EVENT_FLEXIBLE = 0x1,
1054         EVENT_PINNED = 0x2,
1055         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1056 };
1057
1058 static void ctx_sched_out(struct perf_event_context *ctx,
1059                           struct perf_cpu_context *cpuctx,
1060                           enum event_type_t event_type)
1061 {
1062         struct perf_event *event;
1063
1064         raw_spin_lock(&ctx->lock);
1065         ctx->is_active = 0;
1066         if (likely(!ctx->nr_events))
1067                 goto out;
1068         update_context_time(ctx);
1069
1070         perf_disable();
1071         if (!ctx->nr_active)
1072                 goto out_enable;
1073
1074         if (event_type & EVENT_PINNED)
1075                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1076                         group_sched_out(event, cpuctx, ctx);
1077
1078         if (event_type & EVENT_FLEXIBLE)
1079                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1080                         group_sched_out(event, cpuctx, ctx);
1081
1082  out_enable:
1083         perf_enable();
1084  out:
1085         raw_spin_unlock(&ctx->lock);
1086 }
1087
1088 /*
1089  * Test whether two contexts are equivalent, i.e. whether they
1090  * have both been cloned from the same version of the same context
1091  * and they both have the same number of enabled events.
1092  * If the number of enabled events is the same, then the set
1093  * of enabled events should be the same, because these are both
1094  * inherited contexts, therefore we can't access individual events
1095  * in them directly with an fd; we can only enable/disable all
1096  * events via prctl, or enable/disable all events in a family
1097  * via ioctl, which will have the same effect on both contexts.
1098  */
1099 static int context_equiv(struct perf_event_context *ctx1,
1100                          struct perf_event_context *ctx2)
1101 {
1102         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1103                 && ctx1->parent_gen == ctx2->parent_gen
1104                 && !ctx1->pin_count && !ctx2->pin_count;
1105 }
1106
1107 static void __perf_event_sync_stat(struct perf_event *event,
1108                                      struct perf_event *next_event)
1109 {
1110         u64 value;
1111
1112         if (!event->attr.inherit_stat)
1113                 return;
1114
1115         /*
1116          * Update the event value; we cannot use perf_event_read()
1117          * because we're in the middle of a context switch and have IRQs
1118          * disabled, which upsets smp_call_function_single(), however
1119          * we know the event must be on the current CPU, therefore we
1120          * don't need to use it.
1121          */
1122         switch (event->state) {
1123         case PERF_EVENT_STATE_ACTIVE:
1124                 event->pmu->read(event);
1125                 /* fall-through */
1126
1127         case PERF_EVENT_STATE_INACTIVE:
1128                 update_event_times(event);
1129                 break;
1130
1131         default:
1132                 break;
1133         }
1134
1135         /*
1136          * In order to keep per-task stats reliable we need to flip the event
1137          * values when we flip the contexts.
1138          */
1139         value = atomic64_read(&next_event->count);
1140         value = atomic64_xchg(&event->count, value);
1141         atomic64_set(&next_event->count, value);
1142
1143         swap(event->total_time_enabled, next_event->total_time_enabled);
1144         swap(event->total_time_running, next_event->total_time_running);
1145
1146         /*
1147          * Since we swizzled the values, update the user visible data too.
1148          */
1149         perf_event_update_userpage(event);
1150         perf_event_update_userpage(next_event);
1151 }
1152
1153 #define list_next_entry(pos, member) \
1154         list_entry(pos->member.next, typeof(*pos), member)
1155
1156 static void perf_event_sync_stat(struct perf_event_context *ctx,
1157                                    struct perf_event_context *next_ctx)
1158 {
1159         struct perf_event *event, *next_event;
1160
1161         if (!ctx->nr_stat)
1162                 return;
1163
1164         update_context_time(ctx);
1165
1166         event = list_first_entry(&ctx->event_list,
1167                                    struct perf_event, event_entry);
1168
1169         next_event = list_first_entry(&next_ctx->event_list,
1170                                         struct perf_event, event_entry);
1171
1172         while (&event->event_entry != &ctx->event_list &&
1173                &next_event->event_entry != &next_ctx->event_list) {
1174
1175                 __perf_event_sync_stat(event, next_event);
1176
1177                 event = list_next_entry(event, event_entry);
1178                 next_event = list_next_entry(next_event, event_entry);
1179         }
1180 }
1181
1182 /*
1183  * Called from scheduler to remove the events of the current task,
1184  * with interrupts disabled.
1185  *
1186  * We stop each event and update the event value in event->count.
1187  *
1188  * This does not protect us against NMI, but disable()
1189  * sets the disabled bit in the control field of event _before_
1190  * accessing the event control register. If an NMI hits, then it will
1191  * not restart the event.
1192  */
1193 void perf_event_task_sched_out(struct task_struct *task,
1194                                  struct task_struct *next)
1195 {
1196         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1197         struct perf_event_context *ctx = task->perf_event_ctxp;
1198         struct perf_event_context *next_ctx;
1199         struct perf_event_context *parent;
1200         struct pt_regs *regs;
1201         int do_switch = 1;
1202
1203         regs = task_pt_regs(task);
1204         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1205
1206         if (likely(!ctx || !cpuctx->task_ctx))
1207                 return;
1208
1209         rcu_read_lock();
1210         parent = rcu_dereference(ctx->parent_ctx);
1211         next_ctx = next->perf_event_ctxp;
1212         if (parent && next_ctx &&
1213             rcu_dereference(next_ctx->parent_ctx) == parent) {
1214                 /*
1215                  * Looks like the two contexts are clones, so we might be
1216                  * able to optimize the context switch.  We lock both
1217                  * contexts and check that they are clones under the
1218                  * lock (including re-checking that neither has been
1219                  * uncloned in the meantime).  It doesn't matter which
1220                  * order we take the locks because no other cpu could
1221                  * be trying to lock both of these tasks.
1222                  */
1223                 raw_spin_lock(&ctx->lock);
1224                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1225                 if (context_equiv(ctx, next_ctx)) {
1226                         /*
1227                          * XXX do we need a memory barrier of sorts
1228          * w.r.t. the rcu_dereference() of perf_event_ctxp
1229                          */
1230                         task->perf_event_ctxp = next_ctx;
1231                         next->perf_event_ctxp = ctx;
1232                         ctx->task = next;
1233                         next_ctx->task = task;
1234                         do_switch = 0;
1235
1236                         perf_event_sync_stat(ctx, next_ctx);
1237                 }
1238                 raw_spin_unlock(&next_ctx->lock);
1239                 raw_spin_unlock(&ctx->lock);
1240         }
1241         rcu_read_unlock();
1242
1243         if (do_switch) {
1244                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1245                 cpuctx->task_ctx = NULL;
1246         }
1247 }
1248
1249 static void task_ctx_sched_out(struct perf_event_context *ctx,
1250                                enum event_type_t event_type)
1251 {
1252         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1253
1254         if (!cpuctx->task_ctx)
1255                 return;
1256
1257         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1258                 return;
1259
1260         ctx_sched_out(ctx, cpuctx, event_type);
1261         cpuctx->task_ctx = NULL;
1262 }
1263
1264 /*
1265  * Called with IRQs disabled
1266  */
1267 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1268 {
1269         task_ctx_sched_out(ctx, EVENT_ALL);
1270 }
1271
1272 /*
1273  * Called with IRQs disabled
1274  */
1275 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1276                               enum event_type_t event_type)
1277 {
1278         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1279 }
1280
1281 static void
1282 ctx_pinned_sched_in(struct perf_event_context *ctx,
1283                     struct perf_cpu_context *cpuctx,
1284                     int cpu)
1285 {
1286         struct perf_event *event;
1287
1288         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1289                 if (event->state <= PERF_EVENT_STATE_OFF)
1290                         continue;
1291                 if (event->cpu != -1 && event->cpu != cpu)
1292                         continue;
1293
1294                 if (group_can_go_on(event, cpuctx, 1))
1295                         group_sched_in(event, cpuctx, ctx, cpu);
1296
1297                 /*
1298                  * If this pinned group hasn't been scheduled,
1299                  * put it in error state.
1300                  */
1301                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1302                         update_group_times(event);
1303                         event->state = PERF_EVENT_STATE_ERROR;
1304                 }
1305         }
1306 }
1307
1308 static void
1309 ctx_flexible_sched_in(struct perf_event_context *ctx,
1310                       struct perf_cpu_context *cpuctx,
1311                       int cpu)
1312 {
1313         struct perf_event *event;
1314         int can_add_hw = 1;
1315
1316         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1317                 /* Ignore events in OFF or ERROR state */
1318                 if (event->state <= PERF_EVENT_STATE_OFF)
1319                         continue;
1320                 /*
1321                  * Listen to the 'cpu' scheduling filter constraint
1322                  * of events:
1323                  */
1324                 if (event->cpu != -1 && event->cpu != cpu)
1325                         continue;
1326
1327                 if (group_can_go_on(event, cpuctx, can_add_hw))
1328                         if (group_sched_in(event, cpuctx, ctx, cpu))
1329                                 can_add_hw = 0;
1330         }
1331 }
1332
1333 static void
1334 ctx_sched_in(struct perf_event_context *ctx,
1335              struct perf_cpu_context *cpuctx,
1336              enum event_type_t event_type)
1337 {
1338         int cpu = smp_processor_id();
1339
1340         raw_spin_lock(&ctx->lock);
1341         ctx->is_active = 1;
1342         if (likely(!ctx->nr_events))
1343                 goto out;
1344
1345         ctx->timestamp = perf_clock();
1346
1347         perf_disable();
1348
1349         /*
1350          * First go through the list and put on any pinned groups
1351          * in order to give them the best chance of going on.
1352          */
1353         if (event_type & EVENT_PINNED)
1354                 ctx_pinned_sched_in(ctx, cpuctx, cpu);
1355
1356         /* Then walk through the lower prio flexible groups */
1357         if (event_type & EVENT_FLEXIBLE)
1358                 ctx_flexible_sched_in(ctx, cpuctx, cpu);
1359
1360         perf_enable();
1361  out:
1362         raw_spin_unlock(&ctx->lock);
1363 }
1364
1365 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1366                              enum event_type_t event_type)
1367 {
1368         struct perf_event_context *ctx = &cpuctx->ctx;
1369
1370         ctx_sched_in(ctx, cpuctx, event_type);
1371 }
1372
1373 static void task_ctx_sched_in(struct task_struct *task,
1374                               enum event_type_t event_type)
1375 {
1376         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1377         struct perf_event_context *ctx = task->perf_event_ctxp;
1378
1379         if (likely(!ctx))
1380                 return;
1381         if (cpuctx->task_ctx == ctx)
1382                 return;
1383         ctx_sched_in(ctx, cpuctx, event_type);
1384         cpuctx->task_ctx = ctx;
1385 }
1386 /*
1387  * Called from scheduler to add the events of the current task
1388  * with interrupts disabled.
1389  *
1390  * We restore the event value and then enable it.
1391  *
1392  * This does not protect us against NMI, but enable()
1393  * sets the enabled bit in the control field of event _before_
1394  * accessing the event control register. If an NMI hits, then it will
1395  * keep the event running.
1396  */
1397 void perf_event_task_sched_in(struct task_struct *task)
1398 {
1399         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1400         struct perf_event_context *ctx = task->perf_event_ctxp;
1401
1402         if (likely(!ctx))
1403                 return;
1404
1405         if (cpuctx->task_ctx == ctx)
1406                 return;
1407
1408         /*
1409          * We want to keep the following priority order:
1410          * cpu pinned (that don't need to move), task pinned,
1411          * cpu flexible, task flexible.
1412          */
1413         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1414
1415         ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1416         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1417         ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1418
1419         cpuctx->task_ctx = ctx;
1420 }
1421
1422 #define MAX_INTERRUPTS (~0ULL)
1423
1424 static void perf_log_throttle(struct perf_event *event, int enable);
1425
1426 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1427 {
1428         u64 frequency = event->attr.sample_freq;
1429         u64 sec = NSEC_PER_SEC;
1430         u64 divisor, dividend;
1431
1432         int count_fls, nsec_fls, frequency_fls, sec_fls;
1433
1434         count_fls = fls64(count);
1435         nsec_fls = fls64(nsec);
1436         frequency_fls = fls64(frequency);
1437         sec_fls = 30;
1438
1439         /*
1440          * We got @count in @nsec, with a target of sample_freq HZ
1441          * the target period becomes:
1442          *
1443          *             @count * 10^9
1444          * period = -------------------
1445          *          @nsec * sample_freq
1446          *
1447          */
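        /*
         * Worked example (illustrative numbers): count == 2,000,000
         * events seen over nsec == 10,000,000 ns (10 ms) with
         * sample_freq == 1000 Hz gives:
         *
         *	period = 2e6 * 1e9 / (1e7 * 1000) = 200,000 events/sample
         */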
1448
1449         /*
1450          * Reduce accuracy by one bit such that @a and @b converge
1451          * to a similar magnitude.
1452          */
1453 #define REDUCE_FLS(a, b)                \
1454 do {                                    \
1455         if (a##_fls > b##_fls) {        \
1456                 a >>= 1;                \
1457                 a##_fls--;              \
1458         } else {                        \
1459                 b >>= 1;                \
1460                 b##_fls--;              \
1461         }                               \
1462 } while (0)
1463
1464         /*
1465          * Reduce accuracy until either term fits in a u64, then proceed with
1466          * the other, so that finally we can do a u64/u64 division.
1467          */
1468         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1469                 REDUCE_FLS(nsec, frequency);
1470                 REDUCE_FLS(sec, count);
1471         }
1472
1473         if (count_fls + sec_fls > 64) {
1474                 divisor = nsec * frequency;
1475
1476                 while (count_fls + sec_fls > 64) {
1477                         REDUCE_FLS(count, sec);
1478                         divisor >>= 1;
1479                 }
1480
1481                 dividend = count * sec;
1482         } else {
1483                 dividend = count * sec;
1484
1485                 while (nsec_fls + frequency_fls > 64) {
1486                         REDUCE_FLS(nsec, frequency);
1487                         dividend >>= 1;
1488                 }
1489
1490                 divisor = nsec * frequency;
1491         }
1492
1493         return div64_u64(dividend, divisor);
1494 }
1495
1496 static void perf_event_stop(struct perf_event *event)
1497 {
1498         if (!event->pmu->stop)
1499                 return event->pmu->disable(event);
1500
1501         return event->pmu->stop(event);
1502 }
1503
1504 static int perf_event_start(struct perf_event *event)
1505 {
1506         if (!event->pmu->start)
1507                 return event->pmu->enable(event);
1508
1509         return event->pmu->start(event);
1510 }
1511
1512 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1513 {
1514         struct hw_perf_event *hwc = &event->hw;
1515         u64 period, sample_period;
1516         s64 delta;
1517
1518         period = perf_calculate_period(event, nsec, count);
1519
1520         delta = (s64)(period - hwc->sample_period);
1521         delta = (delta + 7) / 8; /* low pass filter */
1522
1523         sample_period = hwc->sample_period + delta;
1524
1525         if (!sample_period)
1526                 sample_period = 1;
1527
1528         hwc->sample_period = sample_period;
1529
1530         if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1531                 perf_disable();
1532                 perf_event_stop(event);
1533                 atomic64_set(&hwc->period_left, 0);
1534                 perf_event_start(event);
1535                 perf_enable();
1536         }
1537 }
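/*
 * The (delta + 7) / 8 term in perf_adjust_period() acts as a low pass
 * filter. Illustrative numbers: with hwc->sample_period == 10000 and a
 * newly computed period of 18000, delta == 8000 and the sample period
 * only moves to 11000 on this tick, converging over several ticks
 * instead of jumping straight to the new estimate.
 */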
1538
1539 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1540 {
1541         struct perf_event *event;
1542         struct hw_perf_event *hwc;
1543         u64 interrupts, now;
1544         s64 delta;
1545
1546         raw_spin_lock(&ctx->lock);
1547         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1548                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1549                         continue;
1550
1551                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1552                         continue;
1553
1554                 hwc = &event->hw;
1555
1556                 interrupts = hwc->interrupts;
1557                 hwc->interrupts = 0;
1558
1559                 /*
1560                  * unthrottle events on the tick
1561                  */
1562                 if (interrupts == MAX_INTERRUPTS) {
1563                         perf_log_throttle(event, 1);
1564                         event->pmu->unthrottle(event);
1565                 }
1566
1567                 if (!event->attr.freq || !event->attr.sample_freq)
1568                         continue;
1569
1570                 event->pmu->read(event);
1571                 now = atomic64_read(&event->count);
1572                 delta = now - hwc->freq_count_stamp;
1573                 hwc->freq_count_stamp = now;
1574
1575                 if (delta > 0)
1576                         perf_adjust_period(event, TICK_NSEC, delta);
1577         }
1578         raw_spin_unlock(&ctx->lock);
1579 }
1580
1581 /*
1582  * Round-robin a context's events:
1583  */
1584 static void rotate_ctx(struct perf_event_context *ctx)
1585 {
1586         if (!ctx->nr_events)
1587                 return;
1588
1589         raw_spin_lock(&ctx->lock);
1590
1591         /* Rotate the first non-pinned group entry to the end of the list */
1592         list_rotate_left(&ctx->flexible_groups);
1593
1594         raw_spin_unlock(&ctx->lock);
1595 }
1596
1597 void perf_event_task_tick(struct task_struct *curr)
1598 {
1599         struct perf_cpu_context *cpuctx;
1600         struct perf_event_context *ctx;
1601
1602         if (!atomic_read(&nr_events))
1603                 return;
1604
1605         cpuctx = &__get_cpu_var(perf_cpu_context);
1606         ctx = curr->perf_event_ctxp;
1607
1608         perf_disable();
1609
1610         perf_ctx_adjust_freq(&cpuctx->ctx);
1611         if (ctx)
1612                 perf_ctx_adjust_freq(ctx);
1613
1614         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1615         if (ctx)
1616                 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1617
1618         rotate_ctx(&cpuctx->ctx);
1619         if (ctx)
1620                 rotate_ctx(ctx);
1621
1622         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1623         if (ctx)
1624                 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1625
1626         perf_enable();
1627 }
1628
1629 static int event_enable_on_exec(struct perf_event *event,
1630                                 struct perf_event_context *ctx)
1631 {
1632         if (!event->attr.enable_on_exec)
1633                 return 0;
1634
1635         event->attr.enable_on_exec = 0;
1636         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1637                 return 0;
1638
1639         __perf_event_mark_enabled(event, ctx);
1640
1641         return 1;
1642 }
1643
1644 /*
1645  * Enable all of a task's events that have been marked enable-on-exec.
1646  * This expects task == current.
1647  */
1648 static void perf_event_enable_on_exec(struct task_struct *task)
1649 {
1650         struct perf_event_context *ctx;
1651         struct perf_event *event;
1652         unsigned long flags;
1653         int enabled = 0;
1654         int ret;
1655
1656         local_irq_save(flags);
1657         ctx = task->perf_event_ctxp;
1658         if (!ctx || !ctx->nr_events)
1659                 goto out;
1660
1661         __perf_event_task_sched_out(ctx);
1662
1663         raw_spin_lock(&ctx->lock);
1664
1665         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1666                 ret = event_enable_on_exec(event, ctx);
1667                 if (ret)
1668                         enabled = 1;
1669         }
1670
1671         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1672                 ret = event_enable_on_exec(event, ctx);
1673                 if (ret)
1674                         enabled = 1;
1675         }
1676
1677         /*
1678          * Unclone this context if we enabled any event.
1679          */
1680         if (enabled)
1681                 unclone_ctx(ctx);
1682
1683         raw_spin_unlock(&ctx->lock);
1684
1685         perf_event_task_sched_in(task);
1686  out:
1687         local_irq_restore(flags);
1688 }
1689
1690 /*
1691  * Cross CPU call to read the hardware event
1692  */
1693 static void __perf_event_read(void *info)
1694 {
1695         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1696         struct perf_event *event = info;
1697         struct perf_event_context *ctx = event->ctx;
1698
1699         /*
1700          * If this is a task context, we need to check whether it is
1701          * the current task context of this cpu.  If not, it has been
1702          * scheduled out before the smp call arrived.  In that case
1703          * event->count would have been updated to a recent sample
1704          * when the event was scheduled out.
1705          */
1706         if (ctx->task && cpuctx->task_ctx != ctx)
1707                 return;
1708
1709         raw_spin_lock(&ctx->lock);
1710         update_context_time(ctx);
1711         update_event_times(event);
1712         raw_spin_unlock(&ctx->lock);
1713
1714         event->pmu->read(event);
1715 }
1716
1717 static u64 perf_event_read(struct perf_event *event)
1718 {
1719         /*
1720          * If event is enabled and currently active on a CPU, update the
1721          * value in the event structure:
1722          */
1723         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1724                 smp_call_function_single(event->oncpu,
1725                                          __perf_event_read, event, 1);
1726         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1727                 struct perf_event_context *ctx = event->ctx;
1728                 unsigned long flags;
1729
1730                 raw_spin_lock_irqsave(&ctx->lock, flags);
1731                 update_context_time(ctx);
1732                 update_event_times(event);
1733                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1734         }
1735
1736         return atomic64_read(&event->count);
1737 }
1738
1739 /*
1740  * Initialize the perf_event context in a task_struct:
1741  */
1742 static void
1743 __perf_event_init_context(struct perf_event_context *ctx,
1744                             struct task_struct *task)
1745 {
1746         raw_spin_lock_init(&ctx->lock);
1747         mutex_init(&ctx->mutex);
1748         INIT_LIST_HEAD(&ctx->pinned_groups);
1749         INIT_LIST_HEAD(&ctx->flexible_groups);
1750         INIT_LIST_HEAD(&ctx->event_list);
1751         atomic_set(&ctx->refcount, 1);
1752         ctx->task = task;
1753 }
1754
1755 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1756 {
1757         struct perf_event_context *ctx;
1758         struct perf_cpu_context *cpuctx;
1759         struct task_struct *task;
1760         unsigned long flags;
1761         int err;
1762
1763         if (pid == -1 && cpu != -1) {
1764                 /* Must be root to operate on a CPU event: */
1765                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1766                         return ERR_PTR(-EACCES);
1767
1768                 if (cpu < 0 || cpu >= nr_cpumask_bits)
1769                         return ERR_PTR(-EINVAL);
1770
1771                 /*
1772                  * We could be clever and allow attaching an event to an
1773                  * offline CPU and activate it when the CPU comes up, but
1774                  * that's for later.
1775                  */
1776                 if (!cpu_online(cpu))
1777                         return ERR_PTR(-ENODEV);
1778
1779                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1780                 ctx = &cpuctx->ctx;
1781                 get_ctx(ctx);
1782
1783                 return ctx;
1784         }
1785
1786         rcu_read_lock();
1787         if (!pid)
1788                 task = current;
1789         else
1790                 task = find_task_by_vpid(pid);
1791         if (task)
1792                 get_task_struct(task);
1793         rcu_read_unlock();
1794
1795         if (!task)
1796                 return ERR_PTR(-ESRCH);
1797
1798         /*
1799          * Can't attach events to a dying task.
1800          */
1801         err = -ESRCH;
1802         if (task->flags & PF_EXITING)
1803                 goto errout;
1804
1805         /* Reuse ptrace permission checks for now. */
1806         err = -EACCES;
1807         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1808                 goto errout;
1809
1810  retry:
1811         ctx = perf_lock_task_context(task, &flags);
1812         if (ctx) {
1813                 unclone_ctx(ctx);
1814                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1815         }
1816
1817         if (!ctx) {
1818                 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1819                 err = -ENOMEM;
1820                 if (!ctx)
1821                         goto errout;
1822                 __perf_event_init_context(ctx, task);
1823                 get_ctx(ctx);
1824                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1825                         /*
1826                          * We raced with some other task; use
1827                          * the context they set.
1828                          */
1829                         kfree(ctx);
1830                         goto retry;
1831                 }
1832                 get_task_struct(task);
1833         }
1834
1835         put_task_struct(task);
1836         return ctx;
1837
1838  errout:
1839         put_task_struct(task);
1840         return ERR_PTR(err);
1841 }
1842
1843 static void perf_event_free_filter(struct perf_event *event);
1844
1845 static void free_event_rcu(struct rcu_head *head)
1846 {
1847         struct perf_event *event;
1848
1849         event = container_of(head, struct perf_event, rcu_head);
1850         if (event->ns)
1851                 put_pid_ns(event->ns);
1852         perf_event_free_filter(event);
1853         kfree(event);
1854 }
1855
1856 static void perf_pending_sync(struct perf_event *event);
1857
1858 static void free_event(struct perf_event *event)
1859 {
1860         perf_pending_sync(event);
1861
1862         if (!event->parent) {
1863                 atomic_dec(&nr_events);
1864                 if (event->attr.mmap)
1865                         atomic_dec(&nr_mmap_events);
1866                 if (event->attr.comm)
1867                         atomic_dec(&nr_comm_events);
1868                 if (event->attr.task)
1869                         atomic_dec(&nr_task_events);
1870         }
1871
1872         if (event->output) {
1873                 fput(event->output->filp);
1874                 event->output = NULL;
1875         }
1876
1877         if (event->destroy)
1878                 event->destroy(event);
1879
1880         put_ctx(event->ctx);
1881         call_rcu(&event->rcu_head, free_event_rcu);
1882 }
1883
1884 int perf_event_release_kernel(struct perf_event *event)
1885 {
1886         struct perf_event_context *ctx = event->ctx;
1887
1888         WARN_ON_ONCE(ctx->parent_ctx);
1889         mutex_lock(&ctx->mutex);
1890         perf_event_remove_from_context(event);
1891         mutex_unlock(&ctx->mutex);
1892
1893         mutex_lock(&event->owner->perf_event_mutex);
1894         list_del_init(&event->owner_entry);
1895         mutex_unlock(&event->owner->perf_event_mutex);
1896         put_task_struct(event->owner);
1897
1898         free_event(event);
1899
1900         return 0;
1901 }
1902 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1903
1904 /*
1905  * Called when the last reference to the file is gone.
1906  */
1907 static int perf_release(struct inode *inode, struct file *file)
1908 {
1909         struct perf_event *event = file->private_data;
1910
1911         file->private_data = NULL;
1912
1913         return perf_event_release_kernel(event);
1914 }
1915
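/*
 * Number of bytes a read() of this event returns, as determined by the
 * bits set in attr.read_format.
 */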
1916 static int perf_event_read_size(struct perf_event *event)
1917 {
1918         int entry = sizeof(u64); /* value */
1919         int size = 0;
1920         int nr = 1;
1921
1922         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1923                 size += sizeof(u64);
1924
1925         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1926                 size += sizeof(u64);
1927
1928         if (event->attr.read_format & PERF_FORMAT_ID)
1929                 entry += sizeof(u64);
1930
1931         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1932                 nr += event->group_leader->nr_siblings;
1933                 size += sizeof(u64);
1934         }
1935
1936         size += entry * nr;
1937
1938         return size;
1939 }
1940
1941 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1942 {
1943         struct perf_event *child;
1944         u64 total = 0;
1945
1946         *enabled = 0;
1947         *running = 0;
1948
1949         mutex_lock(&event->child_mutex);
1950         total += perf_event_read(event);
1951         *enabled += event->total_time_enabled +
1952                         atomic64_read(&event->child_total_time_enabled);
1953         *running += event->total_time_running +
1954                         atomic64_read(&event->child_total_time_running);
1955
1956         list_for_each_entry(child, &event->child_list, child_list) {
1957                 total += perf_event_read(child);
1958                 *enabled += child->total_time_enabled;
1959                 *running += child->total_time_running;
1960         }
1961         mutex_unlock(&event->child_mutex);
1962
1963         return total;
1964 }
1965 EXPORT_SYMBOL_GPL(perf_event_read_value);
1966
1967 static int perf_event_read_group(struct perf_event *event,
1968                                    u64 read_format, char __user *buf)
1969 {
1970         struct perf_event *leader = event->group_leader, *sub;
1971         int n = 0, size = 0, ret = -EFAULT;
1972         struct perf_event_context *ctx = leader->ctx;
1973         u64 values[5];
1974         u64 count, enabled, running;
1975
1976         mutex_lock(&ctx->mutex);
1977         count = perf_event_read_value(leader, &enabled, &running);
1978
1979         values[n++] = 1 + leader->nr_siblings;
1980         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1981                 values[n++] = enabled;
1982         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1983                 values[n++] = running;
1984         values[n++] = count;
1985         if (read_format & PERF_FORMAT_ID)
1986                 values[n++] = primary_event_id(leader);
1987
1988         size = n * sizeof(u64);
1989
1990         if (copy_to_user(buf, values, size))
1991                 goto unlock;
1992
1993         ret = size;
1994
1995         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1996                 n = 0;
1997
1998                 values[n++] = perf_event_read_value(sub, &enabled, &running);
1999                 if (read_format & PERF_FORMAT_ID)
2000                         values[n++] = primary_event_id(sub);
2001
2002                 size = n * sizeof(u64);
2003
2004                 if (copy_to_user(buf + ret, values, size)) {
2005                         ret = -EFAULT;
2006                         goto unlock;
2007                 }
2008
2009                 ret += size;
2010         }
2011 unlock:
2012         mutex_unlock(&ctx->mutex);
2013
2014         return ret;
2015 }
2016
2017 static int perf_event_read_one(struct perf_event *event,
2018                                  u64 read_format, char __user *buf)
2019 {
2020         u64 enabled, running;
2021         u64 values[4];
2022         int n = 0;
2023
2024         values[n++] = perf_event_read_value(event, &enabled, &running);
2025         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2026                 values[n++] = enabled;
2027         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2028                 values[n++] = running;
2029         if (read_format & PERF_FORMAT_ID)
2030                 values[n++] = primary_event_id(event);
2031
2032         if (copy_to_user(buf, values, n * sizeof(u64)))
2033                 return -EFAULT;
2034
2035         return n * sizeof(u64);
2036 }
2037
2038 /*
2039  * Read the performance event - simple non-blocking version for now
2040  */
2041 static ssize_t
2042 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2043 {
2044         u64 read_format = event->attr.read_format;
2045         int ret;
2046
2047         /*
2048          * Return end-of-file for a read on an event that is in
2049          * error state (i.e. because it was pinned but it couldn't be
2050          * scheduled on to the CPU at some point).
2051          */
2052         if (event->state == PERF_EVENT_STATE_ERROR)
2053                 return 0;
2054
2055         if (count < perf_event_read_size(event))
2056                 return -ENOSPC;
2057
2058         WARN_ON_ONCE(event->ctx->parent_ctx);
2059         if (read_format & PERF_FORMAT_GROUP)
2060                 ret = perf_event_read_group(event, read_format, buf);
2061         else
2062                 ret = perf_event_read_one(event, read_format, buf);
2063
2064         return ret;
2065 }
2066
2067 static ssize_t
2068 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2069 {
2070         struct perf_event *event = file->private_data;
2071
2072         return perf_read_hw(event, buf, count);
2073 }
2074
2075 static unsigned int perf_poll(struct file *file, poll_table *wait)
2076 {
2077         struct perf_event *event = file->private_data;
2078         struct perf_mmap_data *data;
2079         unsigned int events = POLLHUP;
2080
2081         rcu_read_lock();
2082         data = rcu_dereference(event->data);
2083         if (data)
2084                 events = atomic_xchg(&data->poll, 0);
2085         rcu_read_unlock();
2086
2087         poll_wait(file, &event->waitq, wait);
2088
2089         return events;
2090 }
2091
2092 static void perf_event_reset(struct perf_event *event)
2093 {
2094         (void)perf_event_read(event);
2095         atomic64_set(&event->count, 0);
2096         perf_event_update_userpage(event);
2097 }
2098
2099 /*
2100  * Holding the top-level event's child_mutex means that any
2101  * descendant process that has inherited this event will block
2102  * in sync_child_event if it goes to exit, thus satisfying the
2103  * task existence requirements of perf_event_enable/disable.
2104  */
2105 static void perf_event_for_each_child(struct perf_event *event,
2106                                         void (*func)(struct perf_event *))
2107 {
2108         struct perf_event *child;
2109
2110         WARN_ON_ONCE(event->ctx->parent_ctx);
2111         mutex_lock(&event->child_mutex);
2112         func(event);
2113         list_for_each_entry(child, &event->child_list, child_list)
2114                 func(child);
2115         mutex_unlock(&event->child_mutex);
2116 }
2117
2118 static void perf_event_for_each(struct perf_event *event,
2119                                   void (*func)(struct perf_event *))
2120 {
2121         struct perf_event_context *ctx = event->ctx;
2122         struct perf_event *sibling;
2123
2124         WARN_ON_ONCE(ctx->parent_ctx);
2125         mutex_lock(&ctx->mutex);
2126         event = event->group_leader;
2127
2128         perf_event_for_each_child(event, func);
2129         func(event);
2130         list_for_each_entry(sibling, &event->sibling_list, group_entry)
2131                 perf_event_for_each_child(sibling, func);
2132         mutex_unlock(&ctx->mutex);
2133 }
2134
2135 static int perf_event_period(struct perf_event *event, u64 __user *arg)
2136 {
2137         struct perf_event_context *ctx = event->ctx;
2138         unsigned long size;
2139         int ret = 0;
2140         u64 value;
2141
2142         if (!event->attr.sample_period)
2143                 return -EINVAL;
2144
2145         size = copy_from_user(&value, arg, sizeof(value));
2146         if (size != sizeof(value))
2147                 return -EFAULT;
2148
2149         if (!value)
2150                 return -EINVAL;
2151
2152         raw_spin_lock_irq(&ctx->lock);
2153         if (event->attr.freq) {
2154                 if (value > sysctl_perf_event_sample_rate) {
2155                         ret = -EINVAL;
2156                         goto unlock;
2157                 }
2158
2159                 event->attr.sample_freq = value;
2160         } else {
2161                 event->attr.sample_period = value;
2162                 event->hw.sample_period = value;
2163         }
2164 unlock:
2165         raw_spin_unlock_irq(&ctx->lock);
2166
2167         return ret;
2168 }
2169
2170 static int perf_event_set_output(struct perf_event *event, int output_fd);
2171 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2172
2173 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2174 {
2175         struct perf_event *event = file->private_data;
2176         void (*func)(struct perf_event *);
2177         u32 flags = arg;
2178
2179         switch (cmd) {
2180         case PERF_EVENT_IOC_ENABLE:
2181                 func = perf_event_enable;
2182                 break;
2183         case PERF_EVENT_IOC_DISABLE:
2184                 func = perf_event_disable;
2185                 break;
2186         case PERF_EVENT_IOC_RESET:
2187                 func = perf_event_reset;
2188                 break;
2189
2190         case PERF_EVENT_IOC_REFRESH:
2191                 return perf_event_refresh(event, arg);
2192
2193         case PERF_EVENT_IOC_PERIOD:
2194                 return perf_event_period(event, (u64 __user *)arg);
2195
2196         case PERF_EVENT_IOC_SET_OUTPUT:
2197                 return perf_event_set_output(event, arg);
2198
2199         case PERF_EVENT_IOC_SET_FILTER:
2200                 return perf_event_set_filter(event, (void __user *)arg);
2201
2202         default:
2203                 return -ENOTTY;
2204         }
2205
2206         if (flags & PERF_IOC_FLAG_GROUP)
2207                 perf_event_for_each(event, func);
2208         else
2209                 perf_event_for_each_child(event, func);
2210
2211         return 0;
2212 }
2213
2214 int perf_event_task_enable(void)
2215 {
2216         struct perf_event *event;
2217
2218         mutex_lock(&current->perf_event_mutex);
2219         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2220                 perf_event_for_each_child(event, perf_event_enable);
2221         mutex_unlock(&current->perf_event_mutex);
2222
2223         return 0;
2224 }
2225
2226 int perf_event_task_disable(void)
2227 {
2228         struct perf_event *event;
2229
2230         mutex_lock(&current->perf_event_mutex);
2231         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2232                 perf_event_for_each_child(event, perf_event_disable);
2233         mutex_unlock(&current->perf_event_mutex);
2234
2235         return 0;
2236 }
2237
2238 #ifndef PERF_EVENT_INDEX_OFFSET
2239 # define PERF_EVENT_INDEX_OFFSET 0
2240 #endif
2241
2242 static int perf_event_index(struct perf_event *event)
2243 {
2244         if (event->state != PERF_EVENT_STATE_ACTIVE)
2245                 return 0;
2246
2247         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2248 }
2249
2250 /*
2251  * Callers need to ensure there can be no nesting of this function, otherwise
2252  * the seqlock logic goes bad. We cannot serialize this because the arch
2253  * code calls this from NMI context.
2254  */
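/*
 * A rough sketch of the read loop user space is expected to pair with
 * this, with "pc" pointing at the mmap()ed perf_event_mmap_page and
 * pmc_read() standing in for an arch-specific counter read (e.g. RDPMC):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx = pc->index;
 *		count = pc->offset;
 *		if (idx)
 *			count += pmc_read(idx - 1);
 *		barrier();
 *	} while (pc->lock != seq);
 */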
2255 void perf_event_update_userpage(struct perf_event *event)
2256 {
2257         struct perf_event_mmap_page *userpg;
2258         struct perf_mmap_data *data;
2259
2260         rcu_read_lock();
2261         data = rcu_dereference(event->data);
2262         if (!data)
2263                 goto unlock;
2264
2265         userpg = data->user_page;
2266
2267         /*
2268          * Disable preemption so as not to let the corresponding user-space
2269          * spin too long if we get preempted.
2270          */
2271         preempt_disable();
2272         ++userpg->lock;
2273         barrier();
2274         userpg->index = perf_event_index(event);
2275         userpg->offset = atomic64_read(&event->count);
2276         if (event->state == PERF_EVENT_STATE_ACTIVE)
2277                 userpg->offset -= atomic64_read(&event->hw.prev_count);
2278
2279         userpg->time_enabled = event->total_time_enabled +
2280                         atomic64_read(&event->child_total_time_enabled);
2281
2282         userpg->time_running = event->total_time_running +
2283                         atomic64_read(&event->child_total_time_running);
2284
2285         barrier();
2286         ++userpg->lock;
2287         preempt_enable();
2288 unlock:
2289         rcu_read_unlock();
2290 }
2291
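/*
 * Total size of the mmap()ed data area: nr_pages pages, each of order
 * data_order.
 */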
2292 static unsigned long perf_data_size(struct perf_mmap_data *data)
2293 {
2294         return data->nr_pages << (PAGE_SHIFT + data->data_order);
2295 }
2296
2297 #ifndef CONFIG_PERF_USE_VMALLOC
2298
2299 /*
2300  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2301  * Back perf_mmap() with regular order-0 GFP_KERNEL pages.
2302
2303 static struct page *
2304 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2305 {
2306         if (pgoff > data->nr_pages)
2307                 return NULL;
2308
2309         if (pgoff == 0)
2310                 return virt_to_page(data->user_page);
2311
2312         return virt_to_page(data->data_pages[pgoff - 1]);
2313 }
2314
2315 static struct perf_mmap_data *
2316 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2317 {
2318         struct perf_mmap_data *data;
2319         unsigned long size;
2320         int i;
2321
2322         WARN_ON(atomic_read(&event->mmap_count));
2323
2324         size = sizeof(struct perf_mmap_data);
2325         size += nr_pages * sizeof(void *);
2326
2327         data = kzalloc(size, GFP_KERNEL);
2328         if (!data)
2329                 goto fail;
2330
2331         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2332         if (!data->user_page)
2333                 goto fail_user_page;
2334
2335         for (i = 0; i < nr_pages; i++) {
2336                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2337                 if (!data->data_pages[i])
2338                         goto fail_data_pages;
2339         }
2340
2341         data->data_order = 0;
2342         data->nr_pages = nr_pages;
2343
2344         return data;
2345
2346 fail_data_pages:
2347         for (i--; i >= 0; i--)
2348                 free_page((unsigned long)data->data_pages[i]);
2349
2350         free_page((unsigned long)data->user_page);
2351
2352 fail_user_page:
2353         kfree(data);
2354
2355 fail:
2356         return NULL;
2357 }
2358
2359 static void perf_mmap_free_page(unsigned long addr)
2360 {
2361         struct page *page = virt_to_page((void *)addr);
2362
2363         page->mapping = NULL;
2364         __free_page(page);
2365 }
2366
2367 static void perf_mmap_data_free(struct perf_mmap_data *data)
2368 {
2369         int i;
2370
2371         perf_mmap_free_page((unsigned long)data->user_page);
2372         for (i = 0; i < data->nr_pages; i++)
2373                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2374         kfree(data);
2375 }
2376
2377 #else
2378
2379 /*
2380  * Back perf_mmap() with vmalloc memory.
2381  *
2382  * Required for architectures that have d-cache aliasing issues.
2383  */
2384
2385 static struct page *
2386 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2387 {
2388         if (pgoff > (1UL << data->data_order))
2389                 return NULL;
2390
2391         return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2392 }
2393
2394 static void perf_mmap_unmark_page(void *addr)
2395 {
2396         struct page *page = vmalloc_to_page(addr);
2397
2398         page->mapping = NULL;
2399 }
2400
2401 static void perf_mmap_data_free_work(struct work_struct *work)
2402 {
2403         struct perf_mmap_data *data;
2404         void *base;
2405         int i, nr;
2406
2407         data = container_of(work, struct perf_mmap_data, work);
2408         nr = 1 << data->data_order;
2409
2410         base = data->user_page;
2411         for (i = 0; i < nr + 1; i++)
2412                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2413
2414         vfree(base);
2415         kfree(data);
2416 }
2417
2418 static void perf_mmap_data_free(struct perf_mmap_data *data)
2419 {
2420         schedule_work(&data->work);
2421 }
2422
2423 static struct perf_mmap_data *
2424 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2425 {
2426         struct perf_mmap_data *data;
2427         unsigned long size;
2428         void *all_buf;
2429
2430         WARN_ON(atomic_read(&event->mmap_count));
2431
2432         size = sizeof(struct perf_mmap_data);
2433         size += sizeof(void *);
2434
2435         data = kzalloc(size, GFP_KERNEL);
2436         if (!data)
2437                 goto fail;
2438
2439         INIT_WORK(&data->work, perf_mmap_data_free_work);
2440
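        /*
         * Allocate the user control page and all data pages as one
         * contiguous vmalloc area and expose it as a single data "page"
         * of order ilog2(nr_pages).
         */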
2441         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2442         if (!all_buf)
2443                 goto fail_all_buf;
2444
2445         data->user_page = all_buf;
2446         data->data_pages[0] = all_buf + PAGE_SIZE;
2447         data->data_order = ilog2(nr_pages);
2448         data->nr_pages = 1;
2449
2450         return data;
2451
2452 fail_all_buf:
2453         kfree(data);
2454
2455 fail:
2456         return NULL;
2457 }
2458
2459 #endif
2460
2461 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2462 {
2463         struct perf_event *event = vma->vm_file->private_data;
2464         struct perf_mmap_data *data;
2465         int ret = VM_FAULT_SIGBUS;
2466
2467         if (vmf->flags & FAULT_FLAG_MKWRITE) {
2468                 if (vmf->pgoff == 0)
2469                         ret = 0;
2470                 return ret;
2471         }
2472
2473         rcu_read_lock();
2474         data = rcu_dereference(event->data);
2475         if (!data)
2476                 goto unlock;
2477
2478         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2479                 goto unlock;
2480
2481         vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2482         if (!vmf->page)
2483                 goto unlock;
2484
2485         get_page(vmf->page);
2486         vmf->page->mapping = vma->vm_file->f_mapping;
2487         vmf->page->index   = vmf->pgoff;
2488
2489         ret = 0;
2490 unlock:
2491         rcu_read_unlock();
2492
2493         return ret;
2494 }
2495
2496 static void
2497 perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2498 {
2499         long max_size = perf_data_size(data);
2500
2501         atomic_set(&data->lock, -1);
2502
2503         if (event->attr.watermark) {
2504                 data->watermark = min_t(long, max_size,
2505                                         event->attr.wakeup_watermark);
2506         }
2507
2508         if (!data->watermark)
2509                 data->watermark = max_size / 2;
2510
2511
2512         rcu_assign_pointer(event->data, data);
2513 }
2514
2515 static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2516 {
2517         struct perf_mmap_data *data;
2518
2519         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2520         perf_mmap_data_free(data);
2521 }
2522
2523 static void perf_mmap_data_release(struct perf_event *event)
2524 {
2525         struct perf_mmap_data *data = event->data;
2526
2527         WARN_ON(atomic_read(&event->mmap_count));
2528
2529         rcu_assign_pointer(event->data, NULL);
2530         call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2531 }
2532
2533 static void perf_mmap_open(struct vm_area_struct *vma)
2534 {
2535         struct perf_event *event = vma->vm_file->private_data;
2536
2537         atomic_inc(&event->mmap_count);
2538 }
2539
2540 static void perf_mmap_close(struct vm_area_struct *vma)
2541 {
2542         struct perf_event *event = vma->vm_file->private_data;
2543
2544         WARN_ON_ONCE(event->ctx->parent_ctx);
2545         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2546                 unsigned long size = perf_data_size(event->data);
2547                 struct user_struct *user = current_user();
2548
2549                 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2550                 vma->vm_mm->locked_vm -= event->data->nr_locked;
2551                 perf_mmap_data_release(event);
2552                 mutex_unlock(&event->mmap_mutex);
2553         }
2554 }
2555
2556 static const struct vm_operations_struct perf_mmap_vmops = {
2557         .open           = perf_mmap_open,
2558         .close          = perf_mmap_close,
2559         .fault          = perf_mmap_fault,
2560         .page_mkwrite   = perf_mmap_fault,
2561 };
2562
2563 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2564 {
2565         struct perf_event *event = file->private_data;
2566         unsigned long user_locked, user_lock_limit;
2567         struct user_struct *user = current_user();
2568         unsigned long locked, lock_limit;
2569         struct perf_mmap_data *data;
2570         unsigned long vma_size;
2571         unsigned long nr_pages;
2572         long user_extra, extra;
2573         int ret = 0;
2574
2575         if (!(vma->vm_flags & VM_SHARED))
2576                 return -EINVAL;
2577
2578         vma_size = vma->vm_end - vma->vm_start;
2579         nr_pages = (vma_size / PAGE_SIZE) - 1;
2580
2581         /*
2582          * If we have data pages, ensure they're a power-of-two number, so we
2583          * can do bitmasks instead of modulo.
2584          */
2585         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2586                 return -EINVAL;
2587
2588         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2589                 return -EINVAL;
2590
2591         if (vma->vm_pgoff != 0)
2592                 return -EINVAL;
2593
2594         WARN_ON_ONCE(event->ctx->parent_ctx);
2595         mutex_lock(&event->mmap_mutex);
2596         if (event->output) {
2597                 ret = -EINVAL;
2598                 goto unlock;
2599         }
2600
2601         if (atomic_inc_not_zero(&event->mmap_count)) {
2602                 if (nr_pages != event->data->nr_pages)
2603                         ret = -EINVAL;
2604                 goto unlock;
2605         }
2606
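        /*
         * The control page plus the data pages are charged against the
         * per-user mlock allowance; whatever exceeds it is accounted as
         * "extra" against the task's RLIMIT_MEMLOCK below.
         */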
2607         user_extra = nr_pages + 1;
2608         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2609
2610         /*
2611          * Increase the limit linearly with more CPUs:
2612          */
2613         user_lock_limit *= num_online_cpus();
2614
2615         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2616
2617         extra = 0;
2618         if (user_locked > user_lock_limit)
2619                 extra = user_locked - user_lock_limit;
2620
2621         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2622         lock_limit >>= PAGE_SHIFT;
2623         locked = vma->vm_mm->locked_vm + extra;
2624
2625         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2626                 !capable(CAP_IPC_LOCK)) {
2627                 ret = -EPERM;
2628                 goto unlock;
2629         }
2630
2631         WARN_ON(event->data);
2632
2633         data = perf_mmap_data_alloc(event, nr_pages);
2634         ret = -ENOMEM;
2635         if (!data)
2636                 goto unlock;
2637
2638         ret = 0;
2639         perf_mmap_data_init(event, data);
2640
2641         atomic_set(&event->mmap_count, 1);
2642         atomic_long_add(user_extra, &user->locked_vm);
2643         vma->vm_mm->locked_vm += extra;
2644         event->data->nr_locked = extra;
2645         if (vma->vm_flags & VM_WRITE)
2646                 event->data->writable = 1;
2647
2648 unlock:
2649         mutex_unlock(&event->mmap_mutex);
2650
2651         vma->vm_flags |= VM_RESERVED;
2652         vma->vm_ops = &perf_mmap_vmops;
2653
2654         return ret;
2655 }
2656
2657 static int perf_fasync(int fd, struct file *filp, int on)
2658 {
2659         struct inode *inode = filp->f_path.dentry->d_inode;
2660         struct perf_event *event = filp->private_data;
2661         int retval;
2662
2663         mutex_lock(&inode->i_mutex);
2664         retval = fasync_helper(fd, filp, on, &event->fasync);
2665         mutex_unlock(&inode->i_mutex);
2666
2667         if (retval < 0)
2668                 return retval;
2669
2670         return 0;
2671 }
2672
2673 static const struct file_operations perf_fops = {
2674         .release                = perf_release,
2675         .read                   = perf_read,
2676         .poll                   = perf_poll,
2677         .unlocked_ioctl         = perf_ioctl,
2678         .compat_ioctl           = perf_ioctl,
2679         .mmap                   = perf_mmap,
2680         .fasync                 = perf_fasync,
2681 };
2682
2683 /*
2684  * Perf event wakeup
2685  *
2686  * If there's data, ensure we set the poll() state and publish everything
2687  * to user-space before waking everybody up.
2688  */
2689
2690 void perf_event_wakeup(struct perf_event *event)
2691 {
2692         wake_up_all(&event->waitq);
2693
2694         if (event->pending_kill) {
2695                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2696                 event->pending_kill = 0;
2697         }
2698 }
2699
2700 /*
2701  * Pending wakeups
2702  *
2703  * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2704  * Handle the case where we need to wake up from NMI (or rq->lock) context.
2705  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2706  * single linked list and use cmpxchg() to add entries lockless.
2707  */
2708
2709 static void perf_pending_event(struct perf_pending_entry *entry)
2710 {
2711         struct perf_event *event = container_of(entry,
2712                         struct perf_event, pending);
2713
2714         if (event->pending_disable) {
2715                 event->pending_disable = 0;
2716                 __perf_event_disable(event);
2717         }
2718
2719         if (event->pending_wakeup) {
2720                 event->pending_wakeup = 0;
2721                 perf_event_wakeup(event);
2722         }
2723 }
2724
2725 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2726
2727 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2728         PENDING_TAIL,
2729 };
2730
2731 static void perf_pending_queue(struct perf_pending_entry *entry,
2732                                void (*func)(struct perf_pending_entry *))
2733 {
2734         struct perf_pending_entry **head;
2735
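        /*
         * Claim the entry first (a non-NULL ->next means it is already
         * queued), then push it onto the per-CPU list with a lock-free
         * cmpxchg() loop; PENDING_TAIL terminates the list.
         */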
2736         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2737                 return;
2738
2739         entry->func = func;
2740
2741         head = &get_cpu_var(perf_pending_head);
2742
2743         do {
2744                 entry->next = *head;
2745         } while (cmpxchg(head, entry->next, entry) != entry->next);
2746
2747         set_perf_event_pending();
2748
2749         put_cpu_var(perf_pending_head);
2750 }
2751
2752 static int __perf_pending_run(void)
2753 {
2754         struct perf_pending_entry *list;
2755         int nr = 0;
2756
2757         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2758         while (list != PENDING_TAIL) {
2759                 void (*func)(struct perf_pending_entry *);
2760                 struct perf_pending_entry *entry = list;
2761
2762                 list = list->next;
2763
2764                 func = entry->func;
2765                 entry->next = NULL;
2766                 /*
2767                  * Ensure we observe the unqueue before we issue the wakeup,
2768                  * so that we won't be waiting forever.
2769                  * -- see perf_not_pending().
2770                  */
2771                 smp_wmb();
2772
2773                 func(entry);
2774                 nr++;
2775         }
2776
2777         return nr;
2778 }
2779
2780 static inline int perf_not_pending(struct perf_event *event)
2781 {
2782         /*
2783          * If we flush on whichever CPU we are running on, there is a
2784          * chance we don't need to wait.
2785          */
2786         get_cpu();
2787         __perf_pending_run();
2788         put_cpu();
2789
2790         /*
2791          * Ensure we see the proper queue state before going to sleep
2792          * so that we do not miss the wakeup. -- see __perf_pending_run()
2793          */
2794         smp_rmb();
2795         return event->pending.next == NULL;
2796 }
2797
2798 static void perf_pending_sync(struct perf_event *event)
2799 {
2800         wait_event(event->waitq, perf_not_pending(event));
2801 }
2802
2803 void perf_event_do_pending(void)
2804 {
2805         __perf_pending_run();
2806 }
2807
2808 /*
2809  * Callchain support -- arch specific
2810  */
2811
2812 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2813 {
2814         return NULL;
2815 }
2816
2817 /*
2818  * Output
2819  */
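/*
 * For a writable mapping, check that advancing the write position from
 * @offset to @head does not overrun data which user space (at @tail)
 * has not consumed yet.
 */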
2820 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2821                               unsigned long offset, unsigned long head)
2822 {
2823         unsigned long mask;
2824
2825         if (!data->writable)
2826                 return true;
2827
2828         mask = perf_data_size(data) - 1;
2829
2830         offset = (offset - tail) & mask;
2831         head   = (head   - tail) & mask;
2832
2833         if ((int)(head - offset) < 0)
2834                 return false;
2835
2836         return true;
2837 }
2838
2839 static void perf_output_wakeup(struct perf_output_handle *handle)
2840 {
2841         atomic_set(&handle->data->poll, POLLIN);
2842
2843         if (handle->nmi) {
2844                 handle->event->pending_wakeup = 1;
2845                 perf_pending_queue(&handle->event->pending,
2846                                    perf_pending_event);
2847         } else
2848                 perf_event_wakeup(handle->event);
2849 }
2850
2851 /*
2852  * Curious locking construct.
2853  *
2854  * We need to ensure a later event doesn't publish a head while a former
2855  * event isn't done writing. However, since we need to deal with NMIs we
2856  * cannot fully serialize things.
2857  *
2858  * What we do is serialize between CPUs so we only have to deal with NMI
2859  * nesting on a single CPU.
2860  *
2861  * We only publish the head (and generate a wakeup) when the outer-most
2862  * event completes.
2863  */
2864 static void perf_output_lock(struct perf_output_handle *handle)
2865 {
2866         struct perf_mmap_data *data = handle->data;
2867         int cur, cpu = get_cpu();
2868
2869         handle->locked = 0;
2870
2871         for (;;) {
2872                 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2873                 if (cur == -1) {
2874                         handle->locked = 1;
2875                         break;
2876                 }
2877                 if (cur == cpu)
2878                         break;
2879
2880                 cpu_relax();
2881         }
2882 }
2883
2884 static void perf_output_unlock(struct perf_output_handle *handle)
2885 {
2886         struct perf_mmap_data *data = handle->data;
2887         unsigned long head;
2888         int cpu;
2889
2890         data->done_head = data->head;
2891
2892         if (!handle->locked)
2893                 goto out;
2894
2895 again:
2896         /*
2897          * The xchg implies a full barrier that ensures all writes are done
2898          * before we publish the new head, matched by a rmb() in userspace when
2899          * reading this position.
2900          */
2901         while ((head = atomic_long_xchg(&data->done_head, 0)))
2902                 data->user_page->data_head = head;
2903
2904         /*
2905          * NMI can happen here, which means we can miss a done_head update.
2906          */
2907
2908         cpu = atomic_xchg(&data->lock, -1);
2909         WARN_ON_ONCE(cpu != smp_processor_id());
2910
2911         /*
2912          * Therefore we have to validate we did not indeed do so.
2913          */
2914         if (unlikely(atomic_long_read(&data->done_head))) {
2915                 /*
2916                  * Since we had it locked, we can lock it again.
2917                  */
2918                 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2919                         cpu_relax();
2920
2921                 goto again;
2922         }
2923
2924         if (atomic_xchg(&data->wakeup, 0))
2925                 perf_output_wakeup(handle);
2926 out:
2927         put_cpu();
2928 }
2929
2930 void perf_output_copy(struct perf_output_handle *handle,
2931                       const void *buf, unsigned int len)
2932 {
2933         unsigned int pages_mask;
2934         unsigned long offset;
2935         unsigned int size;
2936         void **pages;
2937
2938         offset          = handle->offset;
2939         pages_mask      = handle->data->nr_pages - 1;
2940         pages           = handle->data->data_pages;
2941
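        /*
         * Copy the payload a chunk at a time, wrapping around the ring of
         * data pages as the offset crosses page boundaries.
         */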
2942         do {
2943                 unsigned long page_offset;
2944                 unsigned long page_size;
2945                 int nr;
2946
2947                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
2948                 page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
2949                 page_offset = offset & (page_size - 1);
2950                 size        = min_t(unsigned int, page_size - page_offset, len);
2951
2952                 memcpy(pages[nr] + page_offset, buf, size);
2953
2954                 len         -= size;
2955                 buf         += size;
2956                 offset      += size;
2957         } while (len);
2958
2959         handle->offset = offset;
2960
2961         /*
2962          * Check we didn't copy past our reservation window, taking the
2963          * possible unsigned int wrap into account.
2964          */
2965         WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2966 }
2967
2968 int perf_output_begin(struct perf_output_handle *handle,
2969                       struct perf_event *event, unsigned int size,
2970                       int nmi, int sample)
2971 {
2972         struct perf_event *output_event;
2973         struct perf_mmap_data *data;
2974         unsigned long tail, offset, head;
2975         int have_lost;
2976         struct {
2977                 struct perf_event_header header;
2978                 u64                      id;
2979                 u64                      lost;
2980         } lost_event;
2981
2982         rcu_read_lock();
2983         /*
2984          * For inherited events we send all the output towards the parent.
2985          */
2986         if (event->parent)
2987                 event = event->parent;
2988
2989         output_event = rcu_dereference(event->output);
2990         if (output_event)
2991                 event = output_event;
2992
2993         data = rcu_dereference(event->data);
2994         if (!data)
2995                 goto out;
2996
2997         handle->data    = data;
2998         handle->event   = event;
2999         handle->nmi     = nmi;
3000         handle->sample  = sample;
3001
3002         if (!data->nr_pages)
3003                 goto fail;
3004
3005         have_lost = atomic_read(&data->lost);
3006         if (have_lost)
3007                 size += sizeof(lost_event);
3008
3009         perf_output_lock(handle);
3010
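        /*
         * Reserve @size bytes by advancing data->head with a cmpxchg()
         * loop; bail out if a writable buffer does not have enough room
         * left before the consumer's tail.
         */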
3011         do {
3012                 /*
3013                  * Userspace could choose to issue an mb() before updating the
3014                  * tail pointer, so that all reads are completed before the
3015                  * write is issued.
3016                  */
3017                 tail = ACCESS_ONCE(data->user_page->data_tail);
3018                 smp_rmb();
3019                 offset = head = atomic_long_read(&data->head);
3020                 head += size;
3021                 if (unlikely(!perf_output_space(data, tail, offset, head)))
3022                         goto fail;
3023         } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
3024
3025         handle->offset  = offset;
3026         handle->head    = head;
3027
3028         if (head - tail > data->watermark)
3029                 atomic_set(&data->wakeup, 1);
3030
3031         if (have_lost) {
3032                 lost_event.header.type = PERF_RECORD_LOST;
3033                 lost_event.header.misc = 0;
3034                 lost_event.header.size = sizeof(lost_event);
3035                 lost_event.id          = event->id;
3036                 lost_event.lost        = atomic_xchg(&data->lost, 0);
3037
3038                 perf_output_put(handle, lost_event);
3039         }
3040
3041         return 0;
3042
3043 fail:
3044         atomic_inc(&data->lost);
3045         perf_output_unlock(handle);
3046 out:
3047         rcu_read_unlock();
3048
3049         return -ENOSPC;
3050 }
3051
3052 void perf_output_end(struct perf_output_handle *handle)
3053 {
3054         struct perf_event *event = handle->event;
3055         struct perf_mmap_data *data = handle->data;
3056
3057         int wakeup_events = event->attr.wakeup_events;
3058
3059         if (handle->sample && wakeup_events) {
3060                 int events = atomic_inc_return(&data->events);
3061                 if (events >= wakeup_events) {
3062                         atomic_sub(wakeup_events, &data->events);
3063                         atomic_set(&data->wakeup, 1);
3064                 }
3065         }
3066
3067         perf_output_unlock(handle);
3068         rcu_read_unlock();
3069 }
3070
3071 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3072 {
3073         /*
3074          * only top level events have the pid namespace they were created in
3075          */
3076         if (event->parent)
3077                 event = event->parent;
3078
3079         return task_tgid_nr_ns(p, event->ns);
3080 }
3081
3082 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3083 {
3084         /*
3085          * only top level events have the pid namespace they were created in
3086          */
3087         if (event->parent)
3088                 event = event->parent;
3089
3090         return task_pid_nr_ns(p, event->ns);
3091 }
3092
3093 static void perf_output_read_one(struct perf_output_handle *handle,
3094                                  struct perf_event *event)
3095 {
3096         u64 read_format = event->attr.read_format;
3097         u64 values[4];
3098         int n = 0;
3099
3100         values[n++] = atomic64_read(&event->count);
3101         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3102                 values[n++] = event->total_time_enabled +
3103                         atomic64_read(&event->child_total_time_enabled);
3104         }
3105         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3106                 values[n++] = event->total_time_running +
3107                         atomic64_read(&event->child_total_time_running);
3108         }
3109         if (read_format & PERF_FORMAT_ID)
3110                 values[n++] = primary_event_id(event);
3111
3112         perf_output_copy(handle, values, n * sizeof(u64));
3113 }
3114
3115 /*
3116  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3117  */
3118 static void perf_output_read_group(struct perf_output_handle *handle,
3119                             struct perf_event *event)
3120 {
3121         struct perf_event *leader = event->group_leader, *sub;
3122         u64 read_format = event->attr.read_format;
3123         u64 values[5];
3124         int n = 0;
3125
3126         values[n++] = 1 + leader->nr_siblings;
3127
3128         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3129                 values[n++] = leader->total_time_enabled;
3130
3131         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3132                 values[n++] = leader->total_time_running;
3133
3134         if (leader != event)
3135                 leader->pmu->read(leader);
3136
3137         values[n++] = atomic64_read(&leader->count);
3138         if (read_format & PERF_FORMAT_ID)
3139                 values[n++] = primary_event_id(leader);
3140
3141         perf_output_copy(handle, values, n * sizeof(u64));
3142
3143         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3144                 n = 0;
3145
3146                 if (sub != event)
3147                         sub->pmu->read(sub);
3148
3149                 values[n++] = atomic64_read(&sub->count);
3150                 if (read_format & PERF_FORMAT_ID)
3151                         values[n++] = primary_event_id(sub);
3152
3153                 perf_output_copy(handle, values, n * sizeof(u64));
3154         }
3155 }
3156
3157 static void perf_output_read(struct perf_output_handle *handle,
3158                              struct perf_event *event)
3159 {
3160         if (event->attr.read_format & PERF_FORMAT_GROUP)
3161                 perf_output_read_group(handle, event);
3162         else
3163                 perf_output_read_one(handle, event);
3164 }
3165
3166 void perf_output_sample(struct perf_output_handle *handle,
3167                         struct perf_event_header *header,
3168                         struct perf_sample_data *data,
3169                         struct perf_event *event)
3170 {
3171         u64 sample_type = data->type;
3172
3173         perf_output_put(handle, *header);
3174
3175         if (sample_type & PERF_SAMPLE_IP)
3176                 perf_output_put(handle, data->ip);
3177
3178         if (sample_type & PERF_SAMPLE_TID)
3179                 perf_output_put(handle, data->tid_entry);
3180
3181         if (sample_type & PERF_SAMPLE_TIME)
3182                 perf_output_put(handle, data->time);
3183
3184         if (sample_type & PERF_SAMPLE_ADDR)
3185                 perf_output_put(handle, data->addr);
3186
3187         if (sample_type & PERF_SAMPLE_ID)
3188                 perf_output_put(handle, data->id);
3189
3190         if (sample_type & PERF_SAMPLE_STREAM_ID)
3191                 perf_output_put(handle, data->stream_id);
3192
3193         if (sample_type & PERF_SAMPLE_CPU)
3194                 perf_output_put(handle, data->cpu_entry);
3195
3196         if (sample_type & PERF_SAMPLE_PERIOD)
3197                 perf_output_put(handle, data->period);
3198
3199         if (sample_type & PERF_SAMPLE_READ)
3200                 perf_output_read(handle, event);
3201
3202         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3203                 if (data->callchain) {
3204                         int size = 1;
3205
3206                         if (data->callchain)
3207                                 size += data->callchain->nr;
3208
3209                         size *= sizeof(u64);
3210
3211                         perf_output_copy(handle, data->callchain, size);
3212                 } else {
3213                         u64 nr = 0;
3214                         perf_output_put(handle, nr);
3215                 }
3216         }
3217
3218         if (sample_type & PERF_SAMPLE_RAW) {
3219                 if (data->raw) {
3220                         perf_output_put(handle, data->raw->size);
3221                         perf_output_copy(handle, data->raw->data,
3222                                          data->raw->size);
3223                 } else {
3224                         struct {
3225                                 u32     size;
3226                                 u32     data;
3227                         } raw = {
3228                                 .size = sizeof(u32),
3229                                 .data = 0,
3230                         };
3231                         perf_output_put(handle, raw);
3232                 }
3233         }
3234 }
3235
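/*
 * Pre-compute header->size and the fixed sample fields; the layout here
 * must match what perf_output_sample() writes out.
 */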
3236 void perf_prepare_sample(struct perf_event_header *header,
3237                          struct perf_sample_data *data,
3238                          struct perf_event *event,
3239                          struct pt_regs *regs)
3240 {
3241         u64 sample_type = event->attr.sample_type;
3242
3243         data->type = sample_type;
3244
3245         header->type = PERF_RECORD_SAMPLE;
3246         header->size = sizeof(*header);
3247
3248         header->misc = 0;
3249         header->misc |= perf_misc_flags(regs);
3250
3251         if (sample_type & PERF_SAMPLE_IP) {
3252                 data->ip = perf_instruction_pointer(regs);
3253
3254                 header->size += sizeof(data->ip);
3255         }
3256
3257         if (sample_type & PERF_SAMPLE_TID) {
3258                 /* namespace issues */
3259                 data->tid_entry.pid = perf_event_pid(event, current);
3260                 data->tid_entry.tid = perf_event_tid(event, current);
3261
3262                 header->size += sizeof(data->tid_entry);
3263         }
3264
3265         if (sample_type & PERF_SAMPLE_TIME) {
3266                 data->time = perf_clock();
3267
3268                 header->size += sizeof(data->time);
3269         }
3270
3271         if (sample_type & PERF_SAMPLE_ADDR)
3272                 header->size += sizeof(data->addr);
3273
3274         if (sample_type & PERF_SAMPLE_ID) {
3275                 data->id = primary_event_id(event);
3276
3277                 header->size += sizeof(data->id);
3278         }
3279
3280         if (sample_type & PERF_SAMPLE_STREAM_ID) {
3281                 data->stream_id = event->id;
3282
3283                 header->size += sizeof(data->stream_id);
3284         }
3285
3286         if (sample_type & PERF_SAMPLE_CPU) {
3287                 data->cpu_entry.cpu             = raw_smp_processor_id();
3288                 data->cpu_entry.reserved        = 0;
3289
3290                 header->size += sizeof(data->cpu_entry);
3291         }
3292
3293         if (sample_type & PERF_SAMPLE_PERIOD)
3294                 header->size += sizeof(data->period);
3295
3296         if (sample_type & PERF_SAMPLE_READ)
3297                 header->size += perf_event_read_size(event);
3298
3299         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3300                 int size = 1;
3301
3302                 data->callchain = perf_callchain(regs);
3303
3304                 if (data->callchain)
3305                         size += data->callchain->nr;
3306
3307                 header->size += size * sizeof(u64);
3308         }
3309
3310         if (sample_type & PERF_SAMPLE_RAW) {
3311                 int size = sizeof(u32);
3312
3313                 if (data->raw)
3314                         size += data->raw->size;
3315                 else
3316                         size += sizeof(u32);
3317
3318                 WARN_ON_ONCE(size & (sizeof(u64)-1));
3319                 header->size += size;
3320         }
3321 }
3322
3323 static void perf_event_output(struct perf_event *event, int nmi,
3324                                 struct perf_sample_data *data,
3325                                 struct pt_regs *regs)
3326 {
3327         struct perf_output_handle handle;
3328         struct perf_event_header header;
3329
3330         perf_prepare_sample(&header, data, event, regs);
3331
3332         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3333                 return;
3334
3335         perf_output_sample(&handle, &header, data, event);
3336
3337         perf_output_end(&handle);
3338 }
3339
3340 /*
3341  * read event_id
3342  */
3343
3344 struct perf_read_event {
3345         struct perf_event_header        header;
3346
3347         u32                             pid;
3348         u32                             tid;
3349 };
3350
3351 static void
3352 perf_event_read_event(struct perf_event *event,
3353                         struct task_struct *task)
3354 {
3355         struct perf_output_handle handle;
3356         struct perf_read_event read_event = {
3357                 .header = {
3358                         .type = PERF_RECORD_READ,
3359                         .misc = 0,
3360                         .size = sizeof(read_event) + perf_event_read_size(event),
3361                 },
3362                 .pid = perf_event_pid(event, task),
3363                 .tid = perf_event_tid(event, task),
3364         };
3365         int ret;
3366
3367         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3368         if (ret)
3369                 return;
3370
3371         perf_output_put(&handle, read_event);
3372         perf_output_read(&handle, event);
3373
3374         perf_output_end(&handle);
3375 }
3376
3377 /*
3378  * task tracking -- fork/exit
3379  *
3380  * enabled by: attr.comm | attr.mmap | attr.task
3381  */
3382
3383 struct perf_task_event {
3384         struct task_struct              *task;
3385         struct perf_event_context       *task_ctx;
3386
3387         struct {
3388                 struct perf_event_header        header;
3389
3390                 u32                             pid;
3391                 u32                             ppid;
3392                 u32                             tid;
3393                 u32                             ptid;
3394                 u64                             time;
3395         } event_id;
3396 };
3397
3398 static void perf_event_task_output(struct perf_event *event,
3399                                      struct perf_task_event *task_event)
3400 {
3401         struct perf_output_handle handle;
3402         int size;
3403         struct task_struct *task = task_event->task;
3404         int ret;
3405
3406         size  = task_event->event_id.header.size;
3407         ret = perf_output_begin(&handle, event, size, 0, 0);
3408
3409         if (ret)
3410                 return;
3411
3412         task_event->event_id.pid = perf_event_pid(event, task);
3413         task_event->event_id.ppid = perf_event_pid(event, current);
3414
3415         task_event->event_id.tid = perf_event_tid(event, task);
3416         task_event->event_id.ptid = perf_event_tid(event, current);
3417
3418         task_event->event_id.time = perf_clock();
3419
3420         perf_output_put(&handle, task_event->event_id);
3421
3422         perf_output_end(&handle);
3423 }
3424
3425 static int perf_event_task_match(struct perf_event *event)
3426 {
3427         if (event->state != PERF_EVENT_STATE_ACTIVE)
3428                 return 0;
3429
3430         if (event->cpu != -1 && event->cpu != smp_processor_id())
3431                 return 0;
3432
3433         if (event->attr.comm || event->attr.mmap || event->attr.task)
3434                 return 1;
3435
3436         return 0;
3437 }
3438
3439 static void perf_event_task_ctx(struct perf_event_context *ctx,
3440                                   struct perf_task_event *task_event)
3441 {
3442         struct perf_event *event;
3443
3444         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3445                 if (perf_event_task_match(event))
3446                         perf_event_task_output(event, task_event);
3447         }
3448 }
3449
3450 static void perf_event_task_event(struct perf_task_event *task_event)
3451 {
3452         struct perf_cpu_context *cpuctx;
3453         struct perf_event_context *ctx = task_event->task_ctx;
3454
3455         rcu_read_lock();
3456         cpuctx = &get_cpu_var(perf_cpu_context);
3457         perf_event_task_ctx(&cpuctx->ctx, task_event);
3458         if (!ctx)
3459                 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3460         if (ctx)
3461                 perf_event_task_ctx(ctx, task_event);
3462         put_cpu_var(perf_cpu_context);
3463         rcu_read_unlock();
3464 }
3465
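/*
 * Build a PERF_RECORD_FORK or PERF_RECORD_EXIT for @task and deliver it
 * to every matching event in the CPU context and in @task_ctx (or, when
 * that is NULL, in the task's own context).  Nothing is emitted unless
 * someone is counting comm, mmap or task events.
 */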
3466 static void perf_event_task(struct task_struct *task,
3467                               struct perf_event_context *task_ctx,
3468                               int new)
3469 {
3470         struct perf_task_event task_event;
3471
3472         if (!atomic_read(&nr_comm_events) &&
3473             !atomic_read(&nr_mmap_events) &&
3474             !atomic_read(&nr_task_events))
3475                 return;
3476
3477         task_event = (struct perf_task_event){
3478                 .task     = task,
3479                 .task_ctx = task_ctx,
3480                 .event_id    = {
3481                         .header = {
3482                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3483                                 .misc = 0,
3484                                 .size = sizeof(task_event.event_id),
3485                         },
3486                         /* .pid  */
3487                         /* .ppid */
3488                         /* .tid  */
3489                         /* .ptid */
3490                 },
3491         };
3492
3493         perf_event_task_event(&task_event);
3494 }
3495
3496 void perf_event_fork(struct task_struct *task)
3497 {
3498         perf_event_task(task, NULL, 1);
3499 }
3500
3501 /*
3502  * comm tracking
3503  */
3504
3505 struct perf_comm_event {
3506         struct task_struct      *task;
3507         char                    *comm;
3508         int                     comm_size;
3509
3510         struct {
3511                 struct perf_event_header        header;
3512
3513                 u32                             pid;
3514                 u32                             tid;
3515         } event_id;
3516 };
3517
3518 static void perf_event_comm_output(struct perf_event *event,
3519                                      struct perf_comm_event *comm_event)
3520 {
3521         struct perf_output_handle handle;
3522         int size = comm_event->event_id.header.size;
3523         int ret = perf_output_begin(&handle, event, size, 0, 0);
3524
3525         if (ret)
3526                 return;
3527
3528         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3529         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3530
3531         perf_output_put(&handle, comm_event->event_id);
3532         perf_output_copy(&handle, comm_event->comm,
3533                                    comm_event->comm_size);
3534         perf_output_end(&handle);
3535 }
3536
3537 static int perf_event_comm_match(struct perf_event *event)
3538 {
3539         if (event->state != PERF_EVENT_STATE_ACTIVE)
3540                 return 0;
3541
3542         if (event->cpu != -1 && event->cpu != smp_processor_id())
3543                 return 0;
3544
3545         if (event->attr.comm)
3546                 return 1;
3547
3548         return 0;
3549 }
3550
3551 static void perf_event_comm_ctx(struct perf_event_context *ctx,
3552                                   struct perf_comm_event *comm_event)
3553 {
3554         struct perf_event *event;
3555
3556         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3557                 if (perf_event_comm_match(event))
3558                         perf_event_comm_output(event, comm_event);
3559         }
3560 }
3561
3562 static void perf_event_comm_event(struct perf_comm_event *comm_event)
3563 {
3564         struct perf_cpu_context *cpuctx;
3565         struct perf_event_context *ctx;
3566         unsigned int size;
3567         char comm[TASK_COMM_LEN];
3568
3569         memset(comm, 0, sizeof(comm));
3570         strlcpy(comm, comm_event->task->comm, sizeof(comm));
3571         size = ALIGN(strlen(comm)+1, sizeof(u64));
3572
3573         comm_event->comm = comm;
3574         comm_event->comm_size = size;
3575
3576         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3577
3578         rcu_read_lock();
3579         cpuctx = &get_cpu_var(perf_cpu_context);
3580         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3581         ctx = rcu_dereference(current->perf_event_ctxp);
3582         if (ctx)
3583                 perf_event_comm_ctx(ctx, comm_event);
3584         put_cpu_var(perf_cpu_context);
3585         rcu_read_unlock();
3586 }
3587
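/*
 * Record a change of @task's comm: first give enable-on-exec events a
 * chance to be re-enabled, then emit a PERF_RECORD_COMM if anybody is
 * counting comm events.
 */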
3588 void perf_event_comm(struct task_struct *task)
3589 {
3590         struct perf_comm_event comm_event;
3591
3592         if (task->perf_event_ctxp)
3593                 perf_event_enable_on_exec(task);
3594
3595         if (!atomic_read(&nr_comm_events))
3596                 return;
3597
3598         comm_event = (struct perf_comm_event){
3599                 .task   = task,
3600                 /* .comm      */
3601                 /* .comm_size */
3602                 .event_id  = {
3603                         .header = {
3604                                 .type = PERF_RECORD_COMM,
3605                                 .misc = 0,
3606                                 /* .size */
3607                         },
3608                         /* .pid */
3609                         /* .tid */
3610                 },
3611         };
3612
3613         perf_event_comm_event(&comm_event);
3614 }
3615
3616 /*
3617  * mmap tracking
3618  */
3619
3620 struct perf_mmap_event {
3621         struct vm_area_struct   *vma;
3622
3623         const char              *file_name;
3624         int                     file_size;
3625
3626         struct {
3627                 struct perf_event_header        header;
3628
3629                 u32                             pid;
3630                 u32                             tid;
3631                 u64                             start;
3632                 u64                             len;
3633                 u64                             pgoff;
3634         } event_id;
3635 };
3636
3637 static void perf_event_mmap_output(struct perf_event *event,
3638                                      struct perf_mmap_event *mmap_event)
3639 {
3640         struct perf_output_handle handle;
3641         int size = mmap_event->event_id.header.size;
3642         int ret = perf_output_begin(&handle, event, size, 0, 0);
3643
3644         if (ret)
3645                 return;
3646
3647         mmap_event->event_id.pid = perf_event_pid(event, current);
3648         mmap_event->event_id.tid = perf_event_tid(event, current);
3649
3650         perf_output_put(&handle, mmap_event->event_id);
3651         perf_output_copy(&handle, mmap_event->file_name,
3652                                    mmap_event->file_size);
3653         perf_output_end(&handle);
3654 }
3655
3656 static int perf_event_mmap_match(struct perf_event *event,
3657                                    struct perf_mmap_event *mmap_event)
3658 {
3659         if (event->state != PERF_EVENT_STATE_ACTIVE)
3660                 return 0;
3661
3662         if (event->cpu != -1 && event->cpu != smp_processor_id())
3663                 return 0;
3664
3665         if (event->attr.mmap)
3666                 return 1;
3667
3668         return 0;
3669 }
3670
3671 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3672                                   struct perf_mmap_event *mmap_event)
3673 {
3674         struct perf_event *event;
3675
3676         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3677                 if (perf_event_mmap_match(event, mmap_event))
3678                         perf_event_mmap_output(event, mmap_event);
3679         }
3680 }
3681
3682 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3683 {
3684         struct perf_cpu_context *cpuctx;
3685         struct perf_event_context *ctx;
3686         struct vm_area_struct *vma = mmap_event->vma;
3687         struct file *file = vma->vm_file;
3688         unsigned int size;
3689         char tmp[16];
3690         char *buf = NULL;
3691         const char *name;
3692
3693         memset(tmp, 0, sizeof(tmp));
3694
3695         if (file) {
3696                 /*
3697                  * d_path works from the end of the buffer backwards, so we
3698                  * need to add enough zero bytes after the string to handle
3699                  * the 64bit alignment we do later.
3700                  */
3701                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3702                 if (!buf) {
3703                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3704                         goto got_name;
3705                 }
3706                 name = d_path(&file->f_path, buf, PATH_MAX);
3707                 if (IS_ERR(name)) {
3708                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3709                         goto got_name;
3710                 }
3711         } else {
3712                 if (arch_vma_name(mmap_event->vma)) {
3713                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3714                                        sizeof(tmp));
3715                         goto got_name;
3716                 }
3717
3718                 if (!vma->vm_mm) {
3719                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3720                         goto got_name;
3721                 }
3722
3723                 name = strncpy(tmp, "//anon", sizeof(tmp));
3724                 goto got_name;
3725         }
3726
3727 got_name:
3728         size = ALIGN(strlen(name)+1, sizeof(u64));
3729
3730         mmap_event->file_name = name;
3731         mmap_event->file_size = size;
3732
3733         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3734
3735         rcu_read_lock();
3736         cpuctx = &get_cpu_var(perf_cpu_context);
3737         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3738         ctx = rcu_dereference(current->perf_event_ctxp);
3739         if (ctx)
3740                 perf_event_mmap_ctx(ctx, mmap_event);
3741         put_cpu_var(perf_cpu_context);
3742         rcu_read_unlock();
3743
3744         kfree(buf);
3745 }
3746
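/*
 * Emit a PERF_RECORD_MMAP for @vma so that profilers can map sampled
 * IPs back to the file (or "[vdso]" / "//anon") backing the mapping.
 */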
3747 void __perf_event_mmap(struct vm_area_struct *vma)
3748 {
3749         struct perf_mmap_event mmap_event;
3750
3751         if (!atomic_read(&nr_mmap_events))
3752                 return;
3753
3754         mmap_event = (struct perf_mmap_event){
3755                 .vma    = vma,
3756                 /* .file_name */
3757                 /* .file_size */
3758                 .event_id  = {
3759                         .header = {
3760                                 .type = PERF_RECORD_MMAP,
3761                                 .misc = 0,
3762                                 /* .size */
3763                         },
3764                         /* .pid */
3765                         /* .tid */
3766                         .start  = vma->vm_start,
3767                         .len    = vma->vm_end - vma->vm_start,
3768                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
3769                 },
3770         };
3771
3772         perf_event_mmap_event(&mmap_event);
3773 }
3774
3775 /*
3776  * IRQ throttle logging
3777  */
3778
3779 static void perf_log_throttle(struct perf_event *event, int enable)
3780 {
3781         struct perf_output_handle handle;
3782         int ret;
3783
3784         struct {
3785                 struct perf_event_header        header;
3786                 u64                             time;
3787                 u64                             id;
3788                 u64                             stream_id;
3789         } throttle_event = {
3790                 .header = {
3791                         .type = PERF_RECORD_THROTTLE,
3792                         .misc = 0,
3793                         .size = sizeof(throttle_event),
3794                 },
3795                 .time           = perf_clock(),
3796                 .id             = primary_event_id(event),
3797                 .stream_id      = event->id,
3798         };
3799
3800         if (enable)
3801                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3802
3803         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3804         if (ret)
3805                 return;
3806
3807         perf_output_put(&handle, throttle_event);
3808         perf_output_end(&handle);
3809 }
3810
3811 /*
3812  * Generic event overflow handling, sampling.
3813  */
3814
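/*
 * Handle one overflow of @event: throttle it once its interrupt rate
 * exceeds sysctl_perf_event_sample_rate, re-adjust the period for
 * freq-based events, honour event_limit, and finally emit the sample
 * (or call the installed overflow_handler).  Returns non-zero when the
 * event should be stopped (throttled or its limit reached).
 */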
3815 static int __perf_event_overflow(struct perf_event *event, int nmi,
3816                                    int throttle, struct perf_sample_data *data,
3817                                    struct pt_regs *regs)
3818 {
3819         int events = atomic_read(&event->event_limit);
3820         struct hw_perf_event *hwc = &event->hw;
3821         int ret = 0;
3822
3823         throttle = (throttle && event->pmu->unthrottle != NULL);
3824
3825         if (!throttle) {
3826                 hwc->interrupts++;
3827         } else {
3828                 if (hwc->interrupts != MAX_INTERRUPTS) {
3829                         hwc->interrupts++;
3830                         if (HZ * hwc->interrupts >
3831                                         (u64)sysctl_perf_event_sample_rate) {
3832                                 hwc->interrupts = MAX_INTERRUPTS;
3833                                 perf_log_throttle(event, 0);
3834                                 ret = 1;
3835                         }
3836                 } else {
3837                         /*
3838                          * Keep re-disabling the event even though we already
3839                          * disabled it on the previous pass - just in case we raced with a
3840                          * sched-in and the event got enabled again:
3841                          */
3842                         ret = 1;
3843                 }
3844         }
3845
3846         if (event->attr.freq) {
3847                 u64 now = perf_clock();
3848                 s64 delta = now - hwc->freq_time_stamp;
3849
3850                 hwc->freq_time_stamp = now;
3851
3852                 if (delta > 0 && delta < 2*TICK_NSEC)
3853                         perf_adjust_period(event, delta, hwc->last_period);
3854         }
3855
3856         /*
3857          * XXX event_limit might not quite work as expected on inherited
3858          * events
3859          */
3860
3861         event->pending_kill = POLL_IN;
3862         if (events && atomic_dec_and_test(&event->event_limit)) {
3863                 ret = 1;
3864                 event->pending_kill = POLL_HUP;
3865                 if (nmi) {
3866                         event->pending_disable = 1;
3867                         perf_pending_queue(&event->pending,
3868                                            perf_pending_event);
3869                 } else
3870                         perf_event_disable(event);
3871         }
3872
3873         if (event->overflow_handler)
3874                 event->overflow_handler(event, nmi, data, regs);
3875         else
3876                 perf_event_output(event, nmi, data, regs);
3877
3878         return ret;
3879 }
3880
3881 int perf_event_overflow(struct perf_event *event, int nmi,
3882                           struct perf_sample_data *data,
3883                           struct pt_regs *regs)
3884 {
3885         return __perf_event_overflow(event, nmi, 1, data, regs);
3886 }
3887
3888 /*
3889  * Generic software event infrastructure
3890  */
3891
3892 /*
3893  * We directly increment event->count and keep a second value in
3894  * event->hw.period_left to count intervals. This period value
3895  * is kept in the range [-sample_period, 0] so that we can use the
3896  * sign as the overflow trigger.
3897  */
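/*
 * Example: with sample_period == 100 and period_left == -30, adding 50
 * events makes period_left reach 20; perf_swevent_set_period() then
 * reports one overflow (nr == 1) and rewinds period_left to -80, so the
 * 20 surplus events carry over into the next period.
 */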
3898
3899 static u64 perf_swevent_set_period(struct perf_event *event)
3900 {
3901         struct hw_perf_event *hwc = &event->hw;
3902         u64 period = hwc->last_period;
3903         u64 nr, offset;
3904         s64 old, val;
3905
3906         hwc->last_period = hwc->sample_period;
3907
3908 again:
3909         old = val = atomic64_read(&hwc->period_left);
3910         if (val < 0)
3911                 return 0;
3912
3913         nr = div64_u64(period + val, period);
3914         offset = nr * period;
3915         val -= offset;
3916         if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3917                 goto again;
3918
3919         return nr;
3920 }
3921
3922 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3923                                     int nmi, struct perf_sample_data *data,
3924                                     struct pt_regs *regs)
3925 {
3926         struct hw_perf_event *hwc = &event->hw;
3927         int throttle = 0;
3928
3929         data->period = event->hw.last_period;
3930         if (!overflow)
3931                 overflow = perf_swevent_set_period(event);
3932
3933         if (hwc->interrupts == MAX_INTERRUPTS)
3934                 return;
3935
3936         for (; overflow; overflow--) {
3937                 if (__perf_event_overflow(event, nmi, throttle,
3938                                             data, regs)) {
3939                         /*
3940                          * We inhibit the overflow from happening when
3941                          * hwc->interrupts == MAX_INTERRUPTS.
3942                          */
3943                         break;
3944                 }
3945                 throttle = 1;
3946         }
3947 }
3948
3949 static void perf_swevent_unthrottle(struct perf_event *event)
3950 {
3951         /*
3952          * Nothing to do; hwc->interrupts has already been reset.
3953          */
3954 }
3955
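/*
 * Account @nr occurrences to @event.  For sampling events, advance
 * period_left by @nr and report an overflow for every period boundary
 * that has been crossed.
 */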
3956 static void perf_swevent_add(struct perf_event *event, u64 nr,
3957                                int nmi, struct perf_sample_data *data,
3958                                struct pt_regs *regs)
3959 {
3960         struct hw_perf_event *hwc = &event->hw;
3961
3962         atomic64_add(nr, &event->count);
3963
3964         if (!regs)
3965                 return;
3966
3967         if (!hwc->sample_period)
3968                 return;
3969
3970         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3971                 return perf_swevent_overflow(event, 1, nmi, data, regs);
3972
3973         if (atomic64_add_negative(nr, &hwc->period_left))
3974                 return;
3975
3976         perf_swevent_overflow(event, 0, nmi, data, regs);
3977 }
3978
3979 static int perf_swevent_is_counting(struct perf_event *event)
3980 {
3981         /*
3982          * The event is active, we're good!
3983          */
3984         if (event->state == PERF_EVENT_STATE_ACTIVE)
3985                 return 1;
3986
3987         /*
3988          * The event is off/error, not counting.
3989          */
3990         if (event->state != PERF_EVENT_STATE_INACTIVE)
3991                 return 0;
3992
3993         /*
3994          * The event is inactive; if the context is active we're
3995          * part of a group that didn't make it onto the 'pmu',
3996          * so we're not counting.
3997          */
3998         if (event->ctx->is_active)
3999                 return 0;
4000
4001         /*
4002          * We're inactive and the context is too: the task is
4003          * scheduled out, so we count events that happen to us,
4004          * like migration events.
4005          */
4006         return 1;
4007 }
4008
4009 static int perf_tp_event_match(struct perf_event *event,
4010                                 struct perf_sample_data *data);
4011
4012 static int perf_exclude_event(struct perf_event *event,
4013                               struct pt_regs *regs)
4014 {
4015         if (regs) {
4016                 if (event->attr.exclude_user && user_mode(regs))
4017                         return 1;
4018
4019                 if (event->attr.exclude_kernel && !user_mode(regs))
4020                         return 1;
4021         }
4022
4023         return 0;
4024 }
4025
4026 static int perf_swevent_match(struct perf_event *event,
4027                                 enum perf_type_id type,
4028                                 u32 event_id,
4029                                 struct perf_sample_data *data,
4030                                 struct pt_regs *regs)
4031 {
4032         if (event->cpu != -1 && event->cpu != smp_processor_id())
4033                 return 0;
4034
4035         if (!perf_swevent_is_counting(event))
4036                 return 0;
4037
4038         if (event->attr.type != type)
4039                 return 0;
4040
4041         if (event->attr.config != event_id)
4042                 return 0;
4043
4044         if (perf_exclude_event(event, regs))
4045                 return 0;
4046
4047         if (event->attr.type == PERF_TYPE_TRACEPOINT &&
4048             !perf_tp_event_match(event, data))
4049                 return 0;
4050
4051         return 1;
4052 }
4053
4054 static void perf_swevent_ctx_event(struct perf_event_context *ctx,
4055                                      enum perf_type_id type,
4056                                      u32 event_id, u64 nr, int nmi,
4057                                      struct perf_sample_data *data,
4058                                      struct pt_regs *regs)
4059 {
4060         struct perf_event *event;
4061
4062         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4063                 if (perf_swevent_match(event, type, event_id, data, regs))
4064                         perf_swevent_add(event, nr, nmi, data, regs);
4065         }
4066 }
4067
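/*
 * Per-cpu, per-context recursion guard for software events: one slot
 * each for task, softirq, hardirq and NMI context.  A rough usage
 * sketch, mirroring __perf_sw_event() below:
 *
 *      rctx = perf_swevent_get_recursion_context();
 *      if (rctx < 0)
 *              return;         (already counting in this context)
 *      ...                     (generate the event)
 *      perf_swevent_put_recursion_context(rctx);
 */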
4068 int perf_swevent_get_recursion_context(void)
4069 {
4070         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
4071         int rctx;
4072
4073         if (in_nmi())
4074                 rctx = 3;
4075         else if (in_irq())
4076                 rctx = 2;
4077         else if (in_softirq())
4078                 rctx = 1;
4079         else
4080                 rctx = 0;
4081
4082         if (cpuctx->recursion[rctx]) {
4083                 put_cpu_var(perf_cpu_context);
4084                 return -1;
4085         }
4086
4087         cpuctx->recursion[rctx]++;
4088         barrier();
4089
4090         return rctx;
4091 }
4092 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4093
4094 void perf_swevent_put_recursion_context(int rctx)
4095 {
4096         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4097         barrier();
4098         cpuctx->recursion[rctx]--;
4099         put_cpu_var(perf_cpu_context);
4100 }
4101 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4102
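/*
 * Deliver a software event: walk the per-cpu context and the current
 * task's context (under RCU) and add the event to every counter that
 * matches @type/@event_id and its exclusion filters.
 */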
4103 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4104                                     u64 nr, int nmi,
4105                                     struct perf_sample_data *data,
4106                                     struct pt_regs *regs)
4107 {
4108         struct perf_cpu_context *cpuctx;
4109         struct perf_event_context *ctx;
4110
4111         cpuctx = &__get_cpu_var(perf_cpu_context);
4112         rcu_read_lock();
4113         perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4114                                  nr, nmi, data, regs);
4115         /*
4116          * It doesn't really matter which of the child contexts the
4117          * event ends up in.
4118          */
4119         ctx = rcu_dereference(current->perf_event_ctxp);
4120         if (ctx)
4121                 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4122         rcu_read_unlock();
4123 }
4124
4125 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4126                             struct pt_regs *regs, u64 addr)
4127 {
4128         struct perf_sample_data data;
4129         int rctx;
4130
4131         rctx = perf_swevent_get_recursion_context();
4132         if (rctx < 0)
4133                 return;
4134
4135         data.addr = addr;
4136         data.raw  = NULL;
4137
4138         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4139
4140         perf_swevent_put_recursion_context(rctx);
4141 }
4142
4143 static void perf_swevent_read(struct perf_event *event)
4144 {
4145 }
4146
4147 static int perf_swevent_enable(struct perf_event *event)
4148 {
4149         struct hw_perf_event *hwc = &event->hw;
4150
4151         if (hwc->sample_period) {
4152                 hwc->last_period = hwc->sample_period;
4153                 perf_swevent_set_period(event);
4154         }
4155         return 0;
4156 }
4157
4158 static void perf_swevent_disable(struct perf_event *event)
4159 {
4160 }
4161
4162 static const struct pmu perf_ops_generic = {
4163         .enable         = perf_swevent_enable,
4164         .disable        = perf_swevent_disable,
4165         .read           = perf_swevent_read,
4166         .unthrottle     = perf_swevent_unthrottle,
4167 };
4168
4169 /*
4170  * hrtimer based swevent callback
4171  */
4172
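/*
 * The timer fires once per sample period (clamped to at least 10us),
 * reads the event and reports an overflow sample against the
 * interrupted registers, falling back to the user regs when allowed.
 */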
4173 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4174 {
4175         enum hrtimer_restart ret = HRTIMER_RESTART;
4176         struct perf_sample_data data;
4177         struct pt_regs *regs;
4178         struct perf_event *event;
4179         u64 period;
4180
4181         event   = container_of(hrtimer, struct perf_event, hw.hrtimer);
4182         event->pmu->read(event);
4183
4184         data.addr = 0;
4185         data.raw = NULL;
4186         data.period = event->hw.last_period;
4187         regs = get_irq_regs();
4188         /*
4189          * In case we exclude kernel IPs or are somehow not in interrupt
4190          * context, provide the next best thing, the user IP.
4191          */
4192         if ((event->attr.exclude_kernel || !regs) &&
4193                         !event->attr.exclude_user)
4194                 regs = task_pt_regs(current);
4195
4196         if (regs) {
4197                 if (!(event->attr.exclude_idle && current->pid == 0))
4198                         if (perf_event_overflow(event, 0, &data, regs))
4199                                 ret = HRTIMER_NORESTART;
4200         }
4201
4202         period = max_t(u64, 10000, event->hw.sample_period);
4203         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4204
4205         return ret;
4206 }
4207
4208 static void perf_swevent_start_hrtimer(struct perf_event *event)
4209 {
4210         struct hw_perf_event *hwc = &event->hw;
4211
4212         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4213         hwc->hrtimer.function = perf_swevent_hrtimer;
4214         if (hwc->sample_period) {
4215                 u64 period;
4216
4217                 if (hwc->remaining) {
4218                         if (hwc->remaining < 0)
4219                                 period = 10000;
4220                         else
4221                                 period = hwc->remaining;
4222                         hwc->remaining = 0;
4223                 } else {
4224                         period = max_t(u64, 10000, hwc->sample_period);
4225                 }
4226                 __hrtimer_start_range_ns(&hwc->hrtimer,
4227                                 ns_to_ktime(period), 0,
4228                                 HRTIMER_MODE_REL, 0);
4229         }
4230 }
4231
4232 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4233 {
4234         struct hw_perf_event *hwc = &event->hw;
4235
4236         if (hwc->sample_period) {
4237                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4238                 hwc->remaining = ktime_to_ns(remaining);
4239
4240                 hrtimer_cancel(&hwc->hrtimer);
4241         }
4242 }
4243
4244 /*
4245  * Software event: cpu wall time clock
4246  */
4247
4248 static void cpu_clock_perf_event_update(struct perf_event *event)
4249 {
4250         int cpu = raw_smp_processor_id();
4251         s64 prev;
4252         u64 now;
4253
4254         now = cpu_clock(cpu);
4255         prev = atomic64_xchg(&event->hw.prev_count, now);
4256         atomic64_add(now - prev, &event->count);
4257 }
4258
4259 static int cpu_clock_perf_event_enable(struct perf_event *event)
4260 {
4261         struct hw_perf_event *hwc = &event->hw;
4262         int cpu = raw_smp_processor_id();
4263
4264         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4265         perf_swevent_start_hrtimer(event);
4266
4267         return 0;
4268 }
4269
4270 static void cpu_clock_perf_event_disable(struct perf_event *event)
4271 {
4272         perf_swevent_cancel_hrtimer(event);
4273         cpu_clock_perf_event_update(event);
4274 }
4275
4276 static void cpu_clock_perf_event_read(struct perf_event *event)
4277 {
4278         cpu_clock_perf_event_update(event);
4279 }
4280
4281 static const struct pmu perf_ops_cpu_clock = {
4282         .enable         = cpu_clock_perf_event_enable,
4283         .disable        = cpu_clock_perf_event_disable,
4284         .read           = cpu_clock_perf_event_read,
4285 };
4286
4287 /*
4288  * Software event: task time clock
4289  */
4290
4291 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4292 {
4293         u64 prev;
4294         s64 delta;
4295
4296         prev = atomic64_xchg(&event->hw.prev_count, now);
4297         delta = now - prev;
4298         atomic64_add(delta, &event->count);
4299 }
4300
4301 static int task_clock_perf_event_enable(struct perf_event *event)
4302 {
4303         struct hw_perf_event *hwc = &event->hw;
4304         u64 now;
4305
4306         now = event->ctx->time;
4307
4308         atomic64_set(&hwc->prev_count, now);
4309
4310         perf_swevent_start_hrtimer(event);
4311
4312         return 0;
4313 }
4314
4315 static void task_clock_perf_event_disable(struct perf_event *event)
4316 {
4317         perf_swevent_cancel_hrtimer(event);
4318         task_clock_perf_event_update(event, event->ctx->time);
4319
4320 }
4321
4322 static void task_clock_perf_event_read(struct perf_event *event)
4323 {
4324         u64 time;
4325
4326         if (!in_nmi()) {
4327                 update_context_time(event->ctx);
4328                 time = event->ctx->time;
4329         } else {
4330                 u64 now = perf_clock();
4331                 u64 delta = now - event->ctx->timestamp;
4332                 time = event->ctx->time + delta;
4333         }
4334
4335         task_clock_perf_event_update(event, time);
4336 }
4337
4338 static const struct pmu perf_ops_task_clock = {
4339         .enable         = task_clock_perf_event_enable,
4340         .disable        = task_clock_perf_event_disable,
4341         .read           = task_clock_perf_event_read,
4342 };
4343
4344 #ifdef CONFIG_EVENT_TRACING
4345
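/*
 * Entry point used by the event-tracing code: wrap the raw trace record
 * in a perf_raw_record and feed it to the software event machinery as a
 * PERF_TYPE_TRACEPOINT event.
 */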
4346 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4347                           int entry_size)
4348 {
4349         struct perf_raw_record raw = {
4350                 .size = entry_size,
4351                 .data = record,
4352         };
4353
4354         struct perf_sample_data data = {
4355                 .addr = addr,
4356                 .raw = &raw,
4357         };
4358
4359         struct pt_regs *regs = get_irq_regs();
4360
4361         if (!regs)
4362                 regs = task_pt_regs(current);
4363
4364         /* Trace events already protected against recursion */
4365         do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4366                                 &data, regs);
4367 }
4368 EXPORT_SYMBOL_GPL(perf_tp_event);
4369
4370 static int perf_tp_event_match(struct perf_event *event,
4371                                 struct perf_sample_data *data)
4372 {
4373         void *record = data->raw->data;
4374
4375         if (likely(!event->filter) || filter_match_preds(event->filter, record))
4376                 return 1;
4377         return 0;
4378 }
4379
4380 static void tp_perf_event_destroy(struct perf_event *event)
4381 {
4382         ftrace_profile_disable(event->attr.config);
4383 }
4384
4385 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4386 {
4387         /*
4388          * Raw tracepoint data is a severe data leak; only allow root to
4389          * have these.
4390          */
4391         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4392                         perf_paranoid_tracepoint_raw() &&
4393                         !capable(CAP_SYS_ADMIN))
4394                 return ERR_PTR(-EPERM);
4395
4396         if (ftrace_profile_enable(event->attr.config))
4397                 return NULL;
4398
4399         event->destroy = tp_perf_event_destroy;
4400
4401         return &perf_ops_generic;
4402 }
4403
4404 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4405 {
4406         char *filter_str;
4407         int ret;
4408
4409         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4410                 return -EINVAL;
4411
4412         filter_str = strndup_user(arg, PAGE_SIZE);
4413         if (IS_ERR(filter_str))
4414                 return PTR_ERR(filter_str);
4415
4416         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4417
4418         kfree(filter_str);
4419         return ret;
4420 }
4421
4422 static void perf_event_free_filter(struct perf_event *event)
4423 {
4424         ftrace_profile_free_filter(event);
4425 }
4426
4427 #else
4428
4429 static int perf_tp_event_match(struct perf_event *event,
4430                                 struct perf_sample_data *data)
4431 {
4432         return 1;
4433 }
4434
4435 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4436 {
4437         return NULL;
4438 }
4439
4440 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4441 {
4442         return -ENOENT;
4443 }
4444
4445 static void perf_event_free_filter(struct perf_event *event)
4446 {
4447 }
4448
4449 #endif /* CONFIG_EVENT_TRACING */
4450
4451 #ifdef CONFIG_HAVE_HW_BREAKPOINT
4452 static void bp_perf_event_destroy(struct perf_event *event)
4453 {
4454         release_bp_slot(event);
4455 }
4456
4457 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4458 {
4459         int err;
4460
4461         err = register_perf_hw_breakpoint(bp);
4462         if (err)
4463                 return ERR_PTR(err);
4464
4465         bp->destroy = bp_perf_event_destroy;
4466
4467         return &perf_ops_bp;
4468 }
4469
4470 void perf_bp_event(struct perf_event *bp, void *data)
4471 {
4472         struct perf_sample_data sample;
4473         struct pt_regs *regs = data;
4474
4475         sample.raw = NULL;
4476         sample.addr = bp->attr.bp_addr;
4477
4478         if (!perf_exclude_event(bp, regs))
4479                 perf_swevent_add(bp, 1, 1, &sample, regs);
4480 }
4481 #else
4482 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4483 {
4484         return NULL;
4485 }
4486
4487 void perf_bp_event(struct perf_event *bp, void *regs)
4488 {
4489 }
4490 #endif
4491
4492 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4493
4494 static void sw_perf_event_destroy(struct perf_event *event)
4495 {
4496         u64 event_id = event->attr.config;
4497
4498         WARN_ON(event->parent);
4499
4500         atomic_dec(&perf_swevent_enabled[event_id]);
4501 }
4502
4503 static const struct pmu *sw_perf_event_init(struct perf_event *event)
4504 {
4505         const struct pmu *pmu = NULL;
4506         u64 event_id = event->attr.config;
4507
4508         /*
4509          * Software events (currently) can't in general distinguish
4510          * between user, kernel and hypervisor events.
4511          * However, context switches and cpu migrations are considered
4512          * to be kernel events, and page faults are never hypervisor
4513          * events.
4514          */
4515         switch (event_id) {
4516         case PERF_COUNT_SW_CPU_CLOCK:
4517                 pmu = &perf_ops_cpu_clock;
4518
4519                 break;
4520         case PERF_COUNT_SW_TASK_CLOCK:
4521                 /*
4522                  * If the user instantiates this as a per-cpu event,
4523                  * use the cpu_clock event instead.
4524                  */
4525                 if (event->ctx->task)
4526                         pmu = &perf_ops_task_clock;
4527                 else
4528                         pmu = &perf_ops_cpu_clock;
4529
4530                 break;
4531         case PERF_COUNT_SW_PAGE_FAULTS:
4532         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4533         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4534         case PERF_COUNT_SW_CONTEXT_SWITCHES:
4535         case PERF_COUNT_SW_CPU_MIGRATIONS:
4536         case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4537         case PERF_COUNT_SW_EMULATION_FAULTS:
4538                 if (!event->parent) {
4539                         atomic_inc(&perf_swevent_enabled[event_id]);
4540                         event->destroy = sw_perf_event_destroy;
4541                 }
4542                 pmu = &perf_ops_generic;
4543                 break;
4544         }
4545
4546         return pmu;
4547 }
4548
4549 /*
4550  * Allocate and initialize an event structure
4551  */
4552 static struct perf_event *
4553 perf_event_alloc(struct perf_event_attr *attr,
4554                    int cpu,
4555                    struct perf_event_context *ctx,
4556                    struct perf_event *group_leader,
4557                    struct perf_event *parent_event,
4558                    perf_overflow_handler_t overflow_handler,
4559                    gfp_t gfpflags)
4560 {
4561         const struct pmu *pmu;
4562         struct perf_event *event;
4563         struct hw_perf_event *hwc;
4564         long err;
4565
4566         event = kzalloc(sizeof(*event), gfpflags);
4567         if (!event)
4568                 return ERR_PTR(-ENOMEM);
4569
4570         /*
4571          * Single events are their own group leaders, with an
4572          * empty sibling list:
4573          */
4574         if (!group_leader)
4575                 group_leader = event;
4576
4577         mutex_init(&event->child_mutex);
4578         INIT_LIST_HEAD(&event->child_list);
4579
4580         INIT_LIST_HEAD(&event->group_entry);
4581         INIT_LIST_HEAD(&event->event_entry);
4582         INIT_LIST_HEAD(&event->sibling_list);
4583         init_waitqueue_head(&event->waitq);
4584
4585         mutex_init(&event->mmap_mutex);
4586
4587         event->cpu              = cpu;
4588         event->attr             = *attr;
4589         event->group_leader     = group_leader;
4590         event->pmu              = NULL;
4591         event->ctx              = ctx;
4592         event->oncpu            = -1;
4593
4594         event->parent           = parent_event;
4595
4596         event->ns               = get_pid_ns(current->nsproxy->pid_ns);
4597         event->id               = atomic64_inc_return(&perf_event_id);
4598
4599         event->state            = PERF_EVENT_STATE_INACTIVE;
4600
4601         if (!overflow_handler && parent_event)
4602                 overflow_handler = parent_event->overflow_handler;
4603
4604         event->overflow_handler = overflow_handler;
4605
4606         if (attr->disabled)
4607                 event->state = PERF_EVENT_STATE_OFF;
4608
4609         pmu = NULL;
4610
4611         hwc = &event->hw;
4612         hwc->sample_period = attr->sample_period;
4613         if (attr->freq && attr->sample_freq)
4614                 hwc->sample_period = 1;
4615         hwc->last_period = hwc->sample_period;
4616
4617         atomic64_set(&hwc->period_left, hwc->sample_period);
4618
4619         /*
4620          * we currently do not support PERF_FORMAT_GROUP on inherited events
4621          */
4622         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4623                 goto done;
4624
4625         switch (attr->type) {
4626         case PERF_TYPE_RAW:
4627         case PERF_TYPE_HARDWARE:
4628         case PERF_TYPE_HW_CACHE:
4629                 pmu = hw_perf_event_init(event);
4630                 break;
4631
4632         case PERF_TYPE_SOFTWARE:
4633                 pmu = sw_perf_event_init(event);
4634                 break;
4635
4636         case PERF_TYPE_TRACEPOINT:
4637                 pmu = tp_perf_event_init(event);
4638                 break;
4639
4640         case PERF_TYPE_BREAKPOINT:
4641                 pmu = bp_perf_event_init(event);
4642                 break;
4643
4644
4645         default:
4646                 break;
4647         }
4648 done:
4649         err = 0;
4650         if (!pmu)
4651                 err = -EINVAL;
4652         else if (IS_ERR(pmu))
4653                 err = PTR_ERR(pmu);
4654
4655         if (err) {
4656                 if (event->ns)
4657                         put_pid_ns(event->ns);
4658                 kfree(event);
4659                 return ERR_PTR(err);
4660         }
4661
4662         event->pmu = pmu;
4663
4664         if (!event->parent) {
4665                 atomic_inc(&nr_events);
4666                 if (event->attr.mmap)
4667                         atomic_inc(&nr_mmap_events);
4668                 if (event->attr.comm)
4669                         atomic_inc(&nr_comm_events);
4670                 if (event->attr.task)
4671                         atomic_inc(&nr_task_events);
4672         }
4673
4674         return event;
4675 }
4676
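/*
 * Copy a versioned perf_event_attr from user space: read uattr->size,
 * accept both older (smaller) and newer (larger) layouts, require every
 * byte beyond the part we understand to be zero, and report -E2BIG
 * (writing back our own size) when that is not the case.
 */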
4677 static int perf_copy_attr(struct perf_event_attr __user *uattr,
4678                           struct perf_event_attr *attr)
4679 {
4680         u32 size;
4681         int ret;
4682
4683         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4684                 return -EFAULT;
4685
4686         /*
4687          * zero the full structure, so that a short copy leaves the remaining fields zeroed.
4688          */
4689         memset(attr, 0, sizeof(*attr));
4690
4691         ret = get_user(size, &uattr->size);
4692         if (ret)
4693                 return ret;
4694
4695         if (size > PAGE_SIZE)   /* silly large */
4696                 goto err_size;
4697
4698         if (!size)              /* abi compat */
4699                 size = PERF_ATTR_SIZE_VER0;
4700
4701         if (size < PERF_ATTR_SIZE_VER0)
4702                 goto err_size;
4703
4704         /*
4705          * If we're handed a bigger struct than we know of,
4706          * ensure all the unknown bits are 0 - i.e. new
4707          * user-space does not rely on any kernel feature
4708          * extensions we don't know about yet.
4709          */
4710         if (size > sizeof(*attr)) {
4711                 unsigned char __user *addr;
4712                 unsigned char __user *end;
4713                 unsigned char val;
4714
4715                 addr = (void __user *)uattr + sizeof(*attr);
4716                 end  = (void __user *)uattr + size;
4717
4718                 for (; addr < end; addr++) {
4719                         ret = get_user(val, addr);
4720                         if (ret)
4721                                 return ret;
4722                         if (val)
4723                                 goto err_size;
4724                 }
4725                 size = sizeof(*attr);
4726         }
4727
4728         ret = copy_from_user(attr, uattr, size);
4729         if (ret)
4730                 return -EFAULT;
4731
4732         /*
4733          * If the type exists, the corresponding creation will verify
4734          * the attr->config.
4735          */
4736         if (attr->type >= PERF_TYPE_MAX)
4737                 return -EINVAL;
4738
4739         if (attr->__reserved_1 || attr->__reserved_2)
4740                 return -EINVAL;
4741
4742         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4743                 return -EINVAL;
4744
4745         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4746                 return -EINVAL;
4747
4748 out:
4749         return ret;
4750
4751 err_size:
4752         put_user(sizeof(*attr), &uattr->size);
4753         ret = -E2BIG;
4754         goto out;
4755 }
4756
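/*
 * Redirect this event's output into another perf fd's buffer (used for
 * PERF_FLAG_FD_OUTPUT and PERF_EVENT_IOC_SET_OUTPUT): refuse chained
 * outputs and events that already have their own mmap()ed buffer, and
 * drop the reference on any previously set output.
 */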
4757 static int perf_event_set_output(struct perf_event *event, int output_fd)
4758 {
4759         struct perf_event *output_event = NULL;
4760         struct file *output_file = NULL;
4761         struct perf_event *old_output;
4762         int fput_needed = 0;
4763         int ret = -EINVAL;
4764
4765         if (!output_fd)
4766                 goto set;
4767
4768         output_file = fget_light(output_fd, &fput_needed);
4769         if (!output_file)
4770                 return -EBADF;
4771
4772         if (output_file->f_op != &perf_fops)
4773                 goto out;
4774
4775         output_event = output_file->private_data;
4776
4777         /* Don't chain output fds */
4778         if (output_event->output)
4779                 goto out;
4780
4781         /* Don't set an output fd when we already have an output channel */
4782         if (event->data)
4783                 goto out;
4784
4785         atomic_long_inc(&output_file->f_count);
4786
4787 set:
4788         mutex_lock(&event->mmap_mutex);
4789         old_output = event->output;
4790         rcu_assign_pointer(event->output, output_event);
4791         mutex_unlock(&event->mmap_mutex);
4792
4793         if (old_output) {
4794                 /*
4795                  * we need to make sure no existing perf_output_*()
4796                  * is still referencing this event.
4797                  */
4798                 synchronize_rcu();
4799                 fput(old_output->filp);
4800         }
4801
4802         ret = 0;
4803 out:
4804         fput_light(output_file, fput_needed);
4805         return ret;
4806 }
4807
4808 /**
4809  * sys_perf_event_open - open a performance event, associate it to a task/cpu
4810  *
4811  * @attr_uptr:          event_id type attributes for monitoring/sampling
4812  * @pid:                target pid
4813  * @cpu:                target cpu
4814  * @group_fd:           group leader event fd
 * @flags:              PERF_FLAG_FD_NO_GROUP / PERF_FLAG_FD_OUTPUT behaviour flags
4815  */
4816 SYSCALL_DEFINE5(perf_event_open,
4817                 struct perf_event_attr __user *, attr_uptr,
4818                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4819 {
4820         struct perf_event *event, *group_leader;
4821         struct perf_event_attr attr;
4822         struct perf_event_context *ctx;
4823         struct file *event_file = NULL;
4824         struct file *group_file = NULL;
4825         int fput_needed = 0;
4826         int fput_needed2 = 0;
4827         int err;
4828
4829         /* for future expandability... */
4830         if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4831                 return -EINVAL;
4832
4833         err = perf_copy_attr(attr_uptr, &attr);
4834         if (err)
4835                 return err;
4836
4837         if (!attr.exclude_kernel) {
4838                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4839                         return -EACCES;
4840         }
4841
4842         if (attr.freq) {
4843                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4844                         return -EINVAL;
4845         }
4846
4847         /*
4848          * Get the target context (task or percpu):
4849          */
4850         ctx = find_get_context(pid, cpu);
4851         if (IS_ERR(ctx))
4852                 return PTR_ERR(ctx);
4853
4854         /*
4855          * Look up the group leader (we will attach this event to it):
4856          */
4857         group_leader = NULL;
4858         if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4859                 err = -EINVAL;
4860                 group_file = fget_light(group_fd, &fput_needed);
4861                 if (!group_file)
4862                         goto err_put_context;
4863                 if (group_file->f_op != &perf_fops)
4864                         goto err_put_context;
4865
4866                 group_leader = group_file->private_data;
4867                 /*
4868                  * Do not allow a recursive hierarchy (the given group
4869                  * leader must not itself be a sibling of another group):
4870                  */
4871                 if (group_leader->group_leader != group_leader)
4872                         goto err_put_context;
4873                 /*
4874                  * Do not allow attaching to a group in a different
4875                  * task or CPU context:
4876                  */
4877                 if (group_leader->ctx != ctx)
4878                         goto err_put_context;
4879                 /*
4880                  * Only a group leader can be exclusive or pinned
4881                  */
4882                 if (attr.exclusive || attr.pinned)
4883                         goto err_put_context;
4884         }
4885
4886         event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4887                                      NULL, NULL, GFP_KERNEL);
4888         err = PTR_ERR(event);
4889         if (IS_ERR(event))
4890                 goto err_put_context;
4891
4892         err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4893         if (err < 0)
4894                 goto err_free_put_context;
4895
4896         event_file = fget_light(err, &fput_needed2);
4897         if (!event_file)
4898                 goto err_free_put_context;
4899
4900         if (flags & PERF_FLAG_FD_OUTPUT) {
4901                 err = perf_event_set_output(event, group_fd);
4902                 if (err)
4903                         goto err_fput_free_put_context;
4904         }
4905
4906         event->filp = event_file;
4907         WARN_ON_ONCE(ctx->parent_ctx);
4908         mutex_lock(&ctx->mutex);
4909         perf_install_in_context(ctx, event, cpu);
4910         ++ctx->generation;
4911         mutex_unlock(&ctx->mutex);
4912
4913         event->owner = current;
4914         get_task_struct(current);
4915         mutex_lock(&current->perf_event_mutex);
4916         list_add_tail(&event->owner_entry, &current->perf_event_list);
4917         mutex_unlock(&current->perf_event_mutex);
4918
4919 err_fput_free_put_context:
4920         fput_light(event_file, fput_needed2);
4921
4922 err_free_put_context:
4923         if (err < 0)
4924                 kfree(event);
4925
4926 err_put_context:
4927         if (err < 0)
4928                 put_ctx(ctx);
4929
4930         fput_light(group_file, fput_needed);
4931
4932         return err;
4933 }
4934
4935 /**
4936  * perf_event_create_kernel_counter - create and install a counter from kernel space
4937  *
4938  * @attr: attributes of the counter to create
4939  * @cpu: cpu to which the counter is bound
4940  * @pid: task to profile
 * @overflow_handler: callback invoked when the counter overflows
4941  */
4942 struct perf_event *
4943 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4944                                  pid_t pid,
4945                                  perf_overflow_handler_t overflow_handler)
4946 {
4947         struct perf_event *event;
4948         struct perf_event_context *ctx;
4949         int err;
4950
4951         /*
4952          * Get the target context (task or percpu):
4953          */
4954
4955         ctx = find_get_context(pid, cpu);
4956         if (IS_ERR(ctx)) {
4957                 err = PTR_ERR(ctx);
4958                 goto err_exit;
4959         }
4960
4961         event = perf_event_alloc(attr, cpu, ctx, NULL,
4962                                  NULL, overflow_handler, GFP_KERNEL);
4963         if (IS_ERR(event)) {
4964                 err = PTR_ERR(event);
4965                 goto err_put_context;
4966         }
4967
4968         event->filp = NULL;
4969         WARN_ON_ONCE(ctx->parent_ctx);
4970         mutex_lock(&ctx->mutex);
4971         perf_install_in_context(ctx, event, cpu);
4972         ++ctx->generation;
4973         mutex_unlock(&ctx->mutex);
4974
4975         event->owner = current;
4976         get_task_struct(current);
4977         mutex_lock(&current->perf_event_mutex);
4978         list_add_tail(&event->owner_entry, &current->perf_event_list);
4979         mutex_unlock(&current->perf_event_mutex);
4980
4981         return event;
4982
4983  err_put_context:
4984         put_ctx(ctx);
4985  err_exit:
4986         return ERR_PTR(err);
4987 }
4988 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
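
/*
 * A minimal in-kernel usage sketch (hypothetical caller, error handling
 * and cleanup elided):
 *
 *      struct perf_event_attr attr = {
 *              .type   = PERF_TYPE_HARDWARE,
 *              .config = PERF_COUNT_HW_CPU_CYCLES,
 *              .size   = sizeof(attr),
 *      };
 *      struct perf_event *event;
 *
 *      event = perf_event_create_kernel_counter(&attr, cpu, -1, NULL);
 *      if (IS_ERR(event))
 *              return PTR_ERR(event);
 */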
4989
4990 /*
4991  * inherit an event from parent task to child task:
4992  */
4993 static struct perf_event *
4994 inherit_event(struct perf_event *parent_event,
4995               struct task_struct *parent,
4996               struct perf_event_context *parent_ctx,
4997               struct task_struct *child,
4998               struct perf_event *group_leader,
4999               struct perf_event_context *child_ctx)
5000 {
5001         struct perf_event *child_event;
5002
5003         /*
5004          * Instead of creating recursive hierarchies of events,
5005          * we link inherited events back to the original parent,
5006          * which is guaranteed to have a filp that we use as the
5007          * reference count:
5008          */
5009         if (parent_event->parent)
5010                 parent_event = parent_event->parent;
5011
5012         child_event = perf_event_alloc(&parent_event->attr,
5013                                            parent_event->cpu, child_ctx,
5014                                            group_leader, parent_event,
5015                                            NULL, GFP_KERNEL);
5016         if (IS_ERR(child_event))
5017                 return child_event;
5018         get_ctx(child_ctx);
5019
5020         /*
5021          * Make the child state follow the state of the parent event,
5022          * not its attr.disabled bit.  We hold the parent's mutex,
5023          * so we won't race with perf_event_{en, dis}able_family.
5024          */
5025         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5026                 child_event->state = PERF_EVENT_STATE_INACTIVE;
5027         else
5028                 child_event->state = PERF_EVENT_STATE_OFF;
5029
5030         if (parent_event->attr.freq) {
5031                 u64 sample_period = parent_event->hw.sample_period;
5032                 struct hw_perf_event *hwc = &child_event->hw;
5033
5034                 hwc->sample_period = sample_period;
5035                 hwc->last_period   = sample_period;
5036
5037                 atomic64_set(&hwc->period_left, sample_period);
5038         }
5039
5040         child_event->overflow_handler = parent_event->overflow_handler;
5041
5042         /*
5043          * Link it up in the child's context:
5044          */
5045         add_event_to_ctx(child_event, child_ctx);
5046
5047         /*
5048          * Get a reference to the parent filp - we will fput it
5049          * when the child event exits. This is safe to do because
5050          * we are in the parent and we know that the filp still
5051          * exists and has a nonzero count:
5052          */
5053         atomic_long_inc(&parent_event->filp->f_count);
5054
5055         /*
5056          * Link this into the parent event's child list
5057          */
5058         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5059         mutex_lock(&parent_event->child_mutex);
5060         list_add_tail(&child_event->child_list, &parent_event->child_list);
5061         mutex_unlock(&parent_event->child_mutex);
5062
5063         return child_event;
5064 }
5065
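/*
 * Inherit a whole group: inherit the group leader first, then each of
 * its siblings into the child context under that new leader.
 */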
5066 static int inherit_group(struct perf_event *parent_event,
5067               struct task_struct *parent,
5068               struct perf_event_context *parent_ctx,
5069               struct task_struct *child,
5070               struct perf_event_context *child_ctx)
5071 {
5072         struct perf_event *leader;
5073         struct perf_event *sub;
5074         struct perf_event *child_ctr;
5075
5076         leader = inherit_event(parent_event, parent, parent_ctx,
5077                                  child, NULL, child_ctx);
5078         if (IS_ERR(leader))
5079                 return PTR_ERR(leader);
5080         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5081                 child_ctr = inherit_event(sub, parent, parent_ctx,
5082                                             child, leader, child_ctx);
5083                 if (IS_ERR(child_ctr))
5084                         return PTR_ERR(child_ctr);
5085         }
5086         return 0;
5087 }
5088
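/*
 * Fold a child event's count and times back into its parent event,
 * unlink it from the parent's child list and drop the reference we
 * held on the parent's filp.
 */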
5089 static void sync_child_event(struct perf_event *child_event,
5090                                struct task_struct *child)
5091 {
5092         struct perf_event *parent_event = child_event->parent;
5093         u64 child_val;
5094
5095         if (child_event->attr.inherit_stat)
5096                 perf_event_read_event(child_event, child);
5097
5098         child_val = atomic64_read(&child_event->count);
5099
5100         /*
5101          * Add back the child's count to the parent's count:
5102          */
5103         atomic64_add(child_val, &parent_event->count);
5104         atomic64_add(child_event->total_time_enabled,
5105                      &parent_event->child_total_time_enabled);
5106         atomic64_add(child_event->total_time_running,
5107                      &parent_event->child_total_time_running);
5108
5109         /*
5110          * Remove this event from the parent's list
5111          */
5112         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5113         mutex_lock(&parent_event->child_mutex);
5114         list_del_init(&child_event->child_list);
5115         mutex_unlock(&parent_event->child_mutex);
5116
5117         /*
5118          * Release the parent event, if this was the last
5119          * reference to it.
5120          */
5121         fput(parent_event->filp);
5122 }
5123
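/*
 * Detach one event from the exiting child's context; if it was
 * inherited, sync its counts back into the parent and free it.
 */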
5124 static void
5125 __perf_event_exit_task(struct perf_event *child_event,
5126                          struct perf_event_context *child_ctx,
5127                          struct task_struct *child)
5128 {
5129         struct perf_event *parent_event;
5130
5131         perf_event_remove_from_context(child_event);
5132
5133         parent_event = child_event->parent;
5134         /*
5135          * It can happen that the parent exits first, and has events
5136          * that are still around due to the child reference. These
5137          * events need to be zapped - but otherwise they linger.
5138          */
5139         if (parent_event) {
5140                 sync_child_event(child_event, child);
5141                 free_event(child_event);
5142         }
5143 }
5144
5145 /*
5146  * When a child task exits, feed back event values to parent events.
5147  */
5148 void perf_event_exit_task(struct task_struct *child)
5149 {
5150         struct perf_event *child_event, *tmp;
5151         struct perf_event_context *child_ctx;
5152         unsigned long flags;
5153
5154         if (likely(!child->perf_event_ctxp)) {
5155                 perf_event_task(child, NULL, 0);
5156                 return;
5157         }
5158
5159         local_irq_save(flags);
5160         /*
5161          * We can't reschedule here because interrupts are disabled,
5162          * and either the child is current or it is a task that can't
5163          * be scheduled, so we are now safe from a reschedule changing
5164          * our context.
5165          */
5166         child_ctx = child->perf_event_ctxp;
5167         __perf_event_task_sched_out(child_ctx);
5168
5169         /*
5170          * Take the context lock here so that if find_get_context is
5171          * reading child->perf_event_ctxp, we wait until it has
5172          * incremented the context's refcount before we do put_ctx below.
5173          */
5174         raw_spin_lock(&child_ctx->lock);
5175         child->perf_event_ctxp = NULL;
5176         /*
5177          * If this context is a clone, unclone it so it can't get
5178          * swapped to another process while we're removing all
5179          * the events from it.
5180          */
5181         unclone_ctx(child_ctx);
5182         update_context_time(child_ctx);
5183         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
5184
5185         /*
5186          * Report the task dead after unscheduling the events so that we
5187          * won't get any samples after PERF_RECORD_EXIT. We can however still
5188          * won't get any samples after PERF_RECORD_EXIT. We can, however, still
5189          */
5190         perf_event_task(child, child_ctx, 0);
5191
5192         /*
5193          * We can recurse on the same lock type through:
5194          *
5195          *   __perf_event_exit_task()
5196          *     sync_child_event()
5197          *       fput(parent_event->filp)
5198          *         perf_release()
5199          *           mutex_lock(&ctx->mutex)
5200          *
5201          * But since it's the parent context it won't be the same instance.
5202          */
5203         mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5204
5205 again:
5206         list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5207                                  group_entry)
5208                 __perf_event_exit_task(child_event, child_ctx, child);
5209
5210         list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5211                                  group_entry)
5212                 __perf_event_exit_task(child_event, child_ctx, child);
5213
5214         /*
5215          * If the last event was a group event, it will have appended all
5216          * its siblings to the list, but we obtained 'tmp' before that, so
5217          * it will still point to the list head terminating the iteration.
5218          */
5219         if (!list_empty(&child_ctx->pinned_groups) ||
5220             !list_empty(&child_ctx->flexible_groups))
5221                 goto again;
5222
5223         mutex_unlock(&child_ctx->mutex);
5224
5225         put_ctx(child_ctx);
5226 }
5227
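/*
 * Release an inherited event that was never exposed via a file
 * descriptor: unlink it from its parent, drop the parent's filp
 * reference and free it.
 */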
5228 static void perf_free_event(struct perf_event *event,
5229                             struct perf_event_context *ctx)
5230 {
5231         struct perf_event *parent = event->parent;
5232
5233         if (WARN_ON_ONCE(!parent))
5234                 return;
5235
5236         mutex_lock(&parent->child_mutex);
5237         list_del_init(&event->child_list);
5238         mutex_unlock(&parent->child_mutex);
5239
5240         fput(parent->filp);
5241
5242         list_del_event(event, ctx);
5243         free_event(event);
5244 }
5245
5246 /*
5247  * free an unexposed, unused context as created by inheritance by
5248  * perf_event_init_task below, used by fork() in case of failure.
5249  */
5250 void perf_event_free_task(struct task_struct *task)
5251 {
5252         struct perf_event_context *ctx = task->perf_event_ctxp;
5253         struct perf_event *event, *tmp;
5254
5255         if (!ctx)
5256                 return;
5257
5258         mutex_lock(&ctx->mutex);
5259 again:
5260         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5261                 perf_free_event(event, ctx);
5262
5263         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5264                                  group_entry)
5265                 perf_free_event(event, ctx);
5266
5267         if (!list_empty(&ctx->pinned_groups) ||
5268             !list_empty(&ctx->flexible_groups))
5269                 goto again;
5270
5271         mutex_unlock(&ctx->mutex);
5272
5273         put_ctx(ctx);
5274 }
5275
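/*
 * Inherit one group from the parent context into @child, allocating
 * the child's perf_event_ctxp on first use. Clears *inherited_all if
 * the event is not marked for inheritance or if inheritance fails.
 */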
5276 static int
5277 inherit_task_group(struct perf_event *event, struct task_struct *parent,
5278                    struct perf_event_context *parent_ctx,
5279                    struct task_struct *child,
5280                    int *inherited_all)
5281 {
5282         int ret;
5283         struct perf_event_context *child_ctx = child->perf_event_ctxp;
5284
5285         if (!event->attr.inherit) {
5286                 *inherited_all = 0;
5287                 return 0;
5288         }
5289
5290         if (!child_ctx) {
5291                 /*
5292                  * This is executed from the parent task context, so
5293                  * inherit events that have been marked for cloning.
5294                  * First allocate and initialize a context for the
5295                  * child.
5296                  */
5297
5298                 child_ctx = kzalloc(sizeof(struct perf_event_context),
5299                                     GFP_KERNEL);
5300                 if (!child_ctx)
5301                         return -ENOMEM;
5302
5303                 __perf_event_init_context(child_ctx, child);
5304                 child->perf_event_ctxp = child_ctx;
5305                 get_task_struct(child);
5306         }
5307
5308         ret = inherit_group(event, parent, parent_ctx,
5309                             child, child_ctx);
5310
5311         if (ret)
5312                 *inherited_all = 0;
5313
5314         return ret;
5315 }
5316
5317
5318 /*
5319  * Initialize the perf_event context in task_struct
5320  */
5321 int perf_event_init_task(struct task_struct *child)
5322 {
5323         struct perf_event_context *child_ctx, *parent_ctx;
5324         struct perf_event_context *cloned_ctx;
5325         struct perf_event *event;
5326         struct task_struct *parent = current;
5327         int inherited_all = 1;
5328         int ret = 0;
5329
5330         child->perf_event_ctxp = NULL;
5331
5332         mutex_init(&child->perf_event_mutex);
5333         INIT_LIST_HEAD(&child->perf_event_list);
5334
5335         if (likely(!parent->perf_event_ctxp))
5336                 return 0;
5337
5338         /*
5339          * If the parent's context is a clone, pin it so it won't get
5340          * swapped under us.
5341          */
5342         parent_ctx = perf_pin_task_context(parent);
5343
5344         /*
5345          * No need to check if parent_ctx != NULL here; since we saw
5346          * it non-NULL earlier, the only reason for it to become NULL
5347          * is if we exit, and since we're currently in the middle of
5348          * a fork we can't be exiting at the same time.
5349          */
5350
5351         /*
5352          * Lock the parent list. No need to lock the child - it is not
5353          * PID-hashed yet and not running, so nobody else can access it.
5354          */
5355         mutex_lock(&parent_ctx->mutex);
5356
5357         /*
5358          * We don't have to disable NMIs - we are only looking at
5359          * the list, not manipulating it:
5360          */
5361         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5362                 ret = inherit_task_group(event, parent, parent_ctx, child,
5363                                          &inherited_all);
5364                 if (ret)
5365                         break;
5366         }
5367
5368         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5369                 ret = inherit_task_group(event, parent, parent_ctx, child,
5370                                          &inherited_all);
5371                 if (ret)
5372                         break;
5373         }
5374
5375         child_ctx = child->perf_event_ctxp;
5376
5377         if (child_ctx && inherited_all) {
5378                 /*
5379                  * Mark the child context as a clone of the parent
5380                  * context, or of whatever the parent is a clone of.
5381                  * Note that if the parent is a clone, it could get
5382                  * uncloned at any point, but that doesn't matter
5383                  * because the list of events and the generation
5384                  * count can't have changed since we took the mutex.
5385                  */
5386                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
5387                 if (cloned_ctx) {
5388                         child_ctx->parent_ctx = cloned_ctx;
5389                         child_ctx->parent_gen = parent_ctx->parent_gen;
5390                 } else {
5391                         child_ctx->parent_ctx = parent_ctx;
5392                         child_ctx->parent_gen = parent_ctx->generation;
5393                 }
5394                 get_ctx(child_ctx->parent_ctx);
5395         }
5396
5397         mutex_unlock(&parent_ctx->mutex);
5398
5399         perf_unpin_context(parent_ctx);
5400
5401         return ret;
5402 }
5403
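/*
 * Set up the per-cpu context for a cpu that is being brought up and
 * give the architecture code a chance to set up its PMU state.
 */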
5404 static void __cpuinit perf_event_init_cpu(int cpu)
5405 {
5406         struct perf_cpu_context *cpuctx;
5407
5408         cpuctx = &per_cpu(perf_cpu_context, cpu);
5409         __perf_event_init_context(&cpuctx->ctx, NULL);
5410
5411         spin_lock(&perf_resource_lock);
5412         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5413         spin_unlock(&perf_resource_lock);
5414
5415         hw_perf_event_setup(cpu);
5416 }
5417
5418 #ifdef CONFIG_HOTPLUG_CPU
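/*
 * Tear down the per-cpu context when a cpu goes down: the events are
 * removed on the target cpu itself via smp_call_function_single().
 */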
5419 static void __perf_event_exit_cpu(void *info)
5420 {
5421         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5422         struct perf_event_context *ctx = &cpuctx->ctx;
5423         struct perf_event *event, *tmp;
5424
5425         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5426                 __perf_event_remove_from_context(event);
5427         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5428                 __perf_event_remove_from_context(event);
5429 }
5430 static void perf_event_exit_cpu(int cpu)
5431 {
5432         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5433         struct perf_event_context *ctx = &cpuctx->ctx;
5434
5435         mutex_lock(&ctx->mutex);
5436         smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5437         mutex_unlock(&ctx->mutex);
5438 }
5439 #else
5440 static inline void perf_event_exit_cpu(int cpu) { }
5441 #endif
5442
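/*
 * CPU hotplug callback: initialize the per-cpu context before a cpu
 * comes up, let the architecture code finish its setup once the cpu is
 * online, and tear the context down before the cpu goes away.
 */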
5443 static int __cpuinit
5444 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5445 {
5446         unsigned int cpu = (long)hcpu;
5447
5448         switch (action) {
5449
5450         case CPU_UP_PREPARE:
5451         case CPU_UP_PREPARE_FROZEN:
5452                 perf_event_init_cpu(cpu);
5453                 break;
5454
5455         case CPU_ONLINE:
5456         case CPU_ONLINE_FROZEN:
5457                 hw_perf_event_setup_online(cpu);
5458                 break;
5459
5460         case CPU_DOWN_PREPARE:
5461         case CPU_DOWN_PREPARE_FROZEN:
5462                 perf_event_exit_cpu(cpu);
5463                 break;
5464
5465         default:
5466                 break;
5467         }
5468
5469         return NOTIFY_OK;
5470 }
5471
5472 /*
5473  * This has to have a higher priority than migration_notifier in sched.c.
5474  */
5475 static struct notifier_block __cpuinitdata perf_cpu_nb = {
5476         .notifier_call          = perf_cpu_notify,
5477         .priority               = 20,
5478 };
5479
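/*
 * Called once at boot: set up the boot cpu by replaying the
 * UP_PREPARE/ONLINE notifications for it, then register the hotplug
 * notifier for the remaining cpus.
 */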
5480 void __init perf_event_init(void)
5481 {
5482         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5483                         (void *)(long)smp_processor_id());
5484         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5485                         (void *)(long)smp_processor_id());
5486         register_cpu_notifier(&perf_cpu_nb);
5487 }
5488
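/*
 * sysfs accessors for 'reserve_percpu': the store side rejects values
 * larger than perf_max_events and recomputes each online cpu's
 * max_pertask under its context lock.
 */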
5489 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5490 {
5491         return sprintf(buf, "%d\n", perf_reserved_percpu);
5492 }
5493
5494 static ssize_t
5495 perf_set_reserve_percpu(struct sysdev_class *class,
5496                         const char *buf,
5497                         size_t count)
5498 {
5499         struct perf_cpu_context *cpuctx;
5500         unsigned long val;
5501         int err, cpu, mpt;
5502
5503         err = strict_strtoul(buf, 10, &val);
5504         if (err)
5505                 return err;
5506         if (val > perf_max_events)
5507                 return -EINVAL;
5508
5509         spin_lock(&perf_resource_lock);
5510         perf_reserved_percpu = val;
5511         for_each_online_cpu(cpu) {
5512                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5513                 raw_spin_lock_irq(&cpuctx->ctx.lock);
5514                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5515                           perf_max_events - perf_reserved_percpu);
5516                 cpuctx->max_pertask = mpt;
5517                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5518         }
5519         spin_unlock(&perf_resource_lock);
5520
5521         return count;
5522 }
5523
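/*
 * sysfs accessors for 'overcommit': a simple boolean, so the store
 * side only accepts 0 or 1.
 */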
5524 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5525 {
5526         return sprintf(buf, "%d\n", perf_overcommit);
5527 }
5528
5529 static ssize_t
5530 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5531 {
5532         unsigned long val;
5533         int err;
5534
5535         err = strict_strtoul(buf, 10, &val);
5536         if (err)
5537                 return err;
5538         if (val > 1)
5539                 return -EINVAL;
5540
5541         spin_lock(&perf_resource_lock);
5542         perf_overcommit = val;
5543         spin_unlock(&perf_resource_lock);
5544
5545         return count;
5546 }
5547
5548 static SYSDEV_CLASS_ATTR(
5549                                 reserve_percpu,
5550                                 0644,
5551                                 perf_show_reserve_percpu,
5552                                 perf_set_reserve_percpu
5553                         );
5554
5555 static SYSDEV_CLASS_ATTR(
5556                                 overcommit,
5557                                 0644,
5558                                 perf_show_overcommit,
5559                                 perf_set_overcommit
5560                         );
5561
5562 static struct attribute *perfclass_attrs[] = {
5563         &attr_reserve_percpu.attr,
5564         &attr_overcommit.attr,
5565         NULL
5566 };
5567
5568 static struct attribute_group perfclass_attr_group = {
5569         .attrs                  = perfclass_attrs,
5570         .name                   = "perf_events",
5571 };
5572
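/*
 * Register the attributes above as a "perf_events" group on the cpu
 * sysdev class; they typically show up under
 * /sys/devices/system/cpu/perf_events/{reserve_percpu,overcommit}.
 */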
5573 static int __init perf_event_sysfs_init(void)
5574 {
5575         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5576                                   &perfclass_attr_group);
5577 }
5578 device_initcall(perf_event_sysfs_init);