kernel/perf_counter.c
1 /*
2  * Performance counter core code
3  *
4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6  *
7  *  For licensing details see kernel-base/COPYING
8  */
9
10 #include <linux/fs.h>
11 #include <linux/cpu.h>
12 #include <linux/smp.h>
13 #include <linux/file.h>
14 #include <linux/poll.h>
15 #include <linux/sysfs.h>
16 #include <linux/ptrace.h>
17 #include <linux/percpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/syscalls.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/kernel_stat.h>
22 #include <linux/perf_counter.h>
23
24 /*
25  * Each CPU has a list of per CPU counters:
26  */
27 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
28
29 int perf_max_counters __read_mostly = 1;
30 static int perf_reserved_percpu __read_mostly;
31 static int perf_overcommit __read_mostly = 1;
32
33 /*
34  * Mutex for (sysadmin-configurable) counter reservations:
35  */
36 static DEFINE_MUTEX(perf_resource_mutex);
37
38 /*
39  * Architecture provided APIs - weak aliases:
40  */
41 extern __weak const struct hw_perf_counter_ops *
42 hw_perf_counter_init(struct perf_counter *counter)
43 {
44         return NULL;
45 }
46
47 u64 __weak hw_perf_save_disable(void)           { return 0; }
48 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
49 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
50 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
51                struct perf_cpu_context *cpuctx,
52                struct perf_counter_context *ctx, int cpu)
53 {
54         return 0;
55 }
56
57 void __weak perf_counter_print_debug(void)      { }
58
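/*
 * Informal sketch (not part of this file): an architecture overrides the
 * weak stubs above simply by providing non-weak definitions with the same
 * signatures in its arch code. The helper names used below are made up
 * for illustration only:
 *
 *	const struct hw_perf_counter_ops *
 *	hw_perf_counter_init(struct perf_counter *counter)
 *	{
 *		if (!arch_pmu_supports(counter->hw_event.type))
 *			return NULL;
 *		return &arch_pmu_counter_ops;
 *	}
 *
 *	u64 hw_perf_save_disable(void)
 *	{
 *		return arch_pmu_disable_all();
 *	}
 */
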
59 static void
60 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
61 {
62         struct perf_counter *group_leader = counter->group_leader;
63
64         /*
65          * Depending on whether it is a standalone or sibling counter,
66          * add it straight to the context's counter list, or to the group
67          * leader's sibling list:
68          */
69         if (counter->group_leader == counter)
70                 list_add_tail(&counter->list_entry, &ctx->counter_list);
71         else
72                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
73 }
74
75 static void
76 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
77 {
78         struct perf_counter *sibling, *tmp;
79
80         list_del_init(&counter->list_entry);
81
82         /*
83          * If this was a group counter with sibling counters then
84          * upgrade the siblings to singleton counters by adding them
85          * to the context list directly:
86          */
87         list_for_each_entry_safe(sibling, tmp,
88                                  &counter->sibling_list, list_entry) {
89
90                 list_del_init(&sibling->list_entry);
91                 list_add_tail(&sibling->list_entry, &ctx->counter_list);
92                 sibling->group_leader = sibling;
93         }
94 }
95
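/*
 * Informal example of how a two-counter group ends up linked by the helpers
 * above (sketch only; 'leader' and 'sibling' are illustrative names):
 *
 *	list_add_counter(leader, ctx);    leader->group_leader == leader,
 *	                                  so it goes on ctx->counter_list
 *	list_add_counter(sibling, ctx);   sibling->group_leader == leader,
 *	                                  so it goes on leader->sibling_list
 *
 * list_del_counter(leader, ctx) later moves the sibling onto
 * ctx->counter_list and makes it its own group leader.
 */
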
96 static void
97 counter_sched_out(struct perf_counter *counter,
98                   struct perf_cpu_context *cpuctx,
99                   struct perf_counter_context *ctx)
100 {
101         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
102                 return;
103
104         counter->state = PERF_COUNTER_STATE_INACTIVE;
105         counter->hw_ops->disable(counter);
106         counter->oncpu = -1;
107
108         if (!is_software_counter(counter))
109                 cpuctx->active_oncpu--;
110         ctx->nr_active--;
111         if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
112                 cpuctx->exclusive = 0;
113 }
114
115 static void
116 group_sched_out(struct perf_counter *group_counter,
117                 struct perf_cpu_context *cpuctx,
118                 struct perf_counter_context *ctx)
119 {
120         struct perf_counter *counter;
121
122         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
123                 return;
124
125         counter_sched_out(group_counter, cpuctx, ctx);
126
127         /*
128          * Schedule out siblings (if any):
129          */
130         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
131                 counter_sched_out(counter, cpuctx, ctx);
132
133         if (group_counter->hw_event.exclusive)
134                 cpuctx->exclusive = 0;
135 }
136
137 /*
138  * Cross CPU call to remove a performance counter
139  *
140  * We disable the counter on the hardware level first. After that we
141  * remove it from the context list.
142  */
143 static void __perf_counter_remove_from_context(void *info)
144 {
145         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
146         struct perf_counter *counter = info;
147         struct perf_counter_context *ctx = counter->ctx;
148         unsigned long flags;
149         u64 perf_flags;
150
151         /*
152          * If this is a task context, we need to check whether it is
153          * the current task context of this cpu. If not it has been
154          * scheduled out before the smp call arrived.
155          */
156         if (ctx->task && cpuctx->task_ctx != ctx)
157                 return;
158
159         curr_rq_lock_irq_save(&flags);
160         spin_lock(&ctx->lock);
161
162         counter_sched_out(counter, cpuctx, ctx);
163
164         counter->task = NULL;
165         ctx->nr_counters--;
166
167         /*
168          * Protect the list operation against NMI by disabling the
169          * counters on a global level. NOP for non NMI based counters.
170          */
171         perf_flags = hw_perf_save_disable();
172         list_del_counter(counter, ctx);
173         hw_perf_restore(perf_flags);
174
175         if (!ctx->task) {
176                 /*
177                  * Allow more per task counters with respect to the
178                  * reservation:
179                  */
180                 cpuctx->max_pertask =
181                         min(perf_max_counters - ctx->nr_counters,
182                             perf_max_counters - perf_reserved_percpu);
183         }
184
185         spin_unlock(&ctx->lock);
186         curr_rq_unlock_irq_restore(&flags);
187 }
188
189
190 /*
191  * Remove the counter from a task's (or a CPU's) list of counters.
192  *
193  * Must be called with counter->mutex and ctx->mutex held.
194  *
195  * CPU counters are removed with an smp call. For task counters we only
196  * call when the task is on a CPU.
197  */
198 static void perf_counter_remove_from_context(struct perf_counter *counter)
199 {
200         struct perf_counter_context *ctx = counter->ctx;
201         struct task_struct *task = ctx->task;
202
203         if (!task) {
204                 /*
205                  * Per cpu counters are removed via an smp call and
206          * the removal is always successful.
207                  */
208                 smp_call_function_single(counter->cpu,
209                                          __perf_counter_remove_from_context,
210                                          counter, 1);
211                 return;
212         }
213
214 retry:
215         task_oncpu_function_call(task, __perf_counter_remove_from_context,
216                                  counter);
217
218         spin_lock_irq(&ctx->lock);
219         /*
220          * If the context is active we need to retry the smp call.
221          */
222         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
223                 spin_unlock_irq(&ctx->lock);
224                 goto retry;
225         }
226
227         /*
228          * The lock prevents this context from being scheduled in, so we
229          * can remove the counter safely if the call above did not
230          * succeed.
231          */
232         if (!list_empty(&counter->list_entry)) {
233                 ctx->nr_counters--;
234                 list_del_counter(counter, ctx);
235                 counter->task = NULL;
236         }
237         spin_unlock_irq(&ctx->lock);
238 }
239
240 /*
241  * Cross CPU call to disable a performance counter
242  */
243 static void __perf_counter_disable(void *info)
244 {
245         struct perf_counter *counter = info;
246         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
247         struct perf_counter_context *ctx = counter->ctx;
248         unsigned long flags;
249
250         /*
251          * If this is a per-task counter, we need to check whether this
252          * counter's task is the current task on this cpu.
253          */
254         if (ctx->task && cpuctx->task_ctx != ctx)
255                 return;
256
257         curr_rq_lock_irq_save(&flags);
258         spin_lock(&ctx->lock);
259
260         /*
261          * If the counter is on, turn it off.
262          * If it is in error state, leave it in error state.
263          */
264         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
265                 if (counter == counter->group_leader)
266                         group_sched_out(counter, cpuctx, ctx);
267                 else
268                         counter_sched_out(counter, cpuctx, ctx);
269                 counter->state = PERF_COUNTER_STATE_OFF;
270         }
271
272         spin_unlock(&ctx->lock);
273         curr_rq_unlock_irq_restore(&flags);
274 }
275
276 /*
277  * Disable a counter.
278  */
279 static void perf_counter_disable(struct perf_counter *counter)
280 {
281         struct perf_counter_context *ctx = counter->ctx;
282         struct task_struct *task = ctx->task;
283
284         if (!task) {
285                 /*
286                  * Disable the counter on the cpu that it's on
287                  */
288                 smp_call_function_single(counter->cpu, __perf_counter_disable,
289                                          counter, 1);
290                 return;
291         }
292
293  retry:
294         task_oncpu_function_call(task, __perf_counter_disable, counter);
295
296         spin_lock_irq(&ctx->lock);
297         /*
298          * If the counter is still active, we need to retry the cross-call.
299          */
300         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
301                 spin_unlock_irq(&ctx->lock);
302                 goto retry;
303         }
304
305         /*
306          * Since we have the lock this context can't be scheduled
307          * in, so we can change the state safely.
308          */
309         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
310                 counter->state = PERF_COUNTER_STATE_OFF;
311
312         spin_unlock_irq(&ctx->lock);
313 }
314
315 /*
316  * Disable a counter and all its children.
317  */
318 static void perf_counter_disable_family(struct perf_counter *counter)
319 {
320         struct perf_counter *child;
321
322         perf_counter_disable(counter);
323
324         /*
325          * Lock the mutex to protect the list of children
326          */
327         mutex_lock(&counter->mutex);
328         list_for_each_entry(child, &counter->child_list, child_list)
329                 perf_counter_disable(child);
330         mutex_unlock(&counter->mutex);
331 }
332
333 static int
334 counter_sched_in(struct perf_counter *counter,
335                  struct perf_cpu_context *cpuctx,
336                  struct perf_counter_context *ctx,
337                  int cpu)
338 {
339         if (counter->state <= PERF_COUNTER_STATE_OFF)
340                 return 0;
341
342         counter->state = PERF_COUNTER_STATE_ACTIVE;
343         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
344         /*
345          * The new state must be visible before we turn it on in the hardware:
346          */
347         smp_wmb();
348
349         if (counter->hw_ops->enable(counter)) {
350                 counter->state = PERF_COUNTER_STATE_INACTIVE;
351                 counter->oncpu = -1;
352                 return -EAGAIN;
353         }
354
355         if (!is_software_counter(counter))
356                 cpuctx->active_oncpu++;
357         ctx->nr_active++;
358
359         if (counter->hw_event.exclusive)
360                 cpuctx->exclusive = 1;
361
362         return 0;
363 }
364
365 /*
366  * Return 1 for a group consisting entirely of software counters,
367  * 0 if the group contains any hardware counters.
368  */
369 static int is_software_only_group(struct perf_counter *leader)
370 {
371         struct perf_counter *counter;
372
373         if (!is_software_counter(leader))
374                 return 0;
375         list_for_each_entry(counter, &leader->sibling_list, list_entry)
376                 if (!is_software_counter(counter))
377                         return 0;
378         return 1;
379 }
380
381 /*
382  * Work out whether we can put this counter group on the CPU now.
383  */
384 static int group_can_go_on(struct perf_counter *counter,
385                            struct perf_cpu_context *cpuctx,
386                            int can_add_hw)
387 {
388         /*
389          * Groups consisting entirely of software counters can always go on.
390          */
391         if (is_software_only_group(counter))
392                 return 1;
393         /*
394          * If an exclusive group is already on, no other hardware
395          * counters can go on.
396          */
397         if (cpuctx->exclusive)
398                 return 0;
399         /*
400          * If this group is exclusive and there are already
401          * counters on the CPU, it can't go on.
402          */
403         if (counter->hw_event.exclusive && cpuctx->active_oncpu)
404                 return 0;
405         /*
406          * Otherwise, try to add it if all previous groups were able
407          * to go on.
408          */
409         return can_add_hw;
410 }
411
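/*
 * Worked example (informal): once an exclusive hardware group is on,
 * cpuctx->exclusive is set and any further hardware group is refused here;
 * a group consisting only of software counters still goes on, because the
 * is_software_only_group() check short-circuits the exclusive test.
 */
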
412 /*
413  * Cross CPU call to install and enable a performance counter
414  */
415 static void __perf_install_in_context(void *info)
416 {
417         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
418         struct perf_counter *counter = info;
419         struct perf_counter_context *ctx = counter->ctx;
420         struct perf_counter *leader = counter->group_leader;
421         int cpu = smp_processor_id();
422         unsigned long flags;
423         u64 perf_flags;
424         int err;
425
426         /*
427          * If this is a task context, we need to check whether it is
428          * the current task context of this cpu. If not it has been
429          * scheduled out before the smp call arrived.
430          */
431         if (ctx->task && cpuctx->task_ctx != ctx)
432                 return;
433
434         curr_rq_lock_irq_save(&flags);
435         spin_lock(&ctx->lock);
436
437         /*
438          * Protect the list operation against NMI by disabling the
439          * counters on a global level. NOP for non NMI based counters.
440          */
441         perf_flags = hw_perf_save_disable();
442
443         list_add_counter(counter, ctx);
444         ctx->nr_counters++;
445
446         /*
447          * Don't put the counter on if it is disabled or if
448          * it is in a group and the group isn't on.
449          */
450         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
451             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
452                 goto unlock;
453
454         /*
455          * An exclusive counter can't go on if there are already active
456          * hardware counters, and no hardware counter can go on if there
457          * is already an exclusive counter on.
458          */
459         if (!group_can_go_on(counter, cpuctx, 1))
460                 err = -EEXIST;
461         else
462                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
463
464         if (err) {
465                 /*
466                  * This counter couldn't go on.  If it is in a group
467                  * then we have to pull the whole group off.
468                  * If the counter group is pinned then put it in error state.
469                  */
470                 if (leader != counter)
471                         group_sched_out(leader, cpuctx, ctx);
472                 if (leader->hw_event.pinned)
473                         leader->state = PERF_COUNTER_STATE_ERROR;
474         }
475
476         if (!err && !ctx->task && cpuctx->max_pertask)
477                 cpuctx->max_pertask--;
478
479  unlock:
480         hw_perf_restore(perf_flags);
481
482         spin_unlock(&ctx->lock);
483         curr_rq_unlock_irq_restore(&flags);
484 }
485
486 /*
487  * Attach a performance counter to a context
488  *
489  * First we add the counter to the list with the hardware enable bit
490  * in counter->hw_config cleared.
491  *
492  * If the counter is attached to a task which is on a CPU we use an smp
493  * call to enable it in the task context. The task might have been
494  * scheduled away, but we check this in the smp call again.
495  *
496  * Must be called with ctx->mutex held.
497  */
498 static void
499 perf_install_in_context(struct perf_counter_context *ctx,
500                         struct perf_counter *counter,
501                         int cpu)
502 {
503         struct task_struct *task = ctx->task;
504
505         counter->ctx = ctx;
506         if (!task) {
507                 /*
508                  * Per cpu counters are installed via an smp call and
509          * the install is always successful.
510                  */
511                 smp_call_function_single(cpu, __perf_install_in_context,
512                                          counter, 1);
513                 return;
514         }
515
516         counter->task = task;
517 retry:
518         task_oncpu_function_call(task, __perf_install_in_context,
519                                  counter);
520
521         spin_lock_irq(&ctx->lock);
522         /*
523          * If the context is active we need to retry the smp call.
524          */
525         if (ctx->is_active && list_empty(&counter->list_entry)) {
526                 spin_unlock_irq(&ctx->lock);
527                 goto retry;
528         }
529
530         /*
531          * The lock prevents this context from being scheduled in, so we
532          * can add the counter safely if the call above did not
533          * succeed.
534          */
535         if (list_empty(&counter->list_entry)) {
536                 list_add_counter(counter, ctx);
537                 ctx->nr_counters++;
538         }
539         spin_unlock_irq(&ctx->lock);
540 }
541
542 /*
543  * Cross CPU call to enable a performance counter
544  */
545 static void __perf_counter_enable(void *info)
546 {
547         struct perf_counter *counter = info;
548         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
549         struct perf_counter_context *ctx = counter->ctx;
550         struct perf_counter *leader = counter->group_leader;
551         unsigned long flags;
552         int err;
553
554         /*
555          * If this is a per-task counter, we need to check whether this
556          * counter's task is the current task on this cpu.
557          */
558         if (ctx->task && cpuctx->task_ctx != ctx)
559                 return;
560
561         curr_rq_lock_irq_save(&flags);
562         spin_lock(&ctx->lock);
563
564         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
565                 goto unlock;
566         counter->state = PERF_COUNTER_STATE_INACTIVE;
567
568         /*
569          * If the counter is in a group and isn't the group leader,
570          * then don't put it on unless the group is on.
571          */
572         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
573                 goto unlock;
574
575         if (!group_can_go_on(counter, cpuctx, 1))
576                 err = -EEXIST;
577         else
578                 err = counter_sched_in(counter, cpuctx, ctx,
579                                        smp_processor_id());
580
581         if (err) {
582                 /*
583                  * If this counter can't go on and it's part of a
584                  * group, then the whole group has to come off.
585                  */
586                 if (leader != counter)
587                         group_sched_out(leader, cpuctx, ctx);
588                 if (leader->hw_event.pinned)
589                         leader->state = PERF_COUNTER_STATE_ERROR;
590         }
591
592  unlock:
593         spin_unlock(&ctx->lock);
594         curr_rq_unlock_irq_restore(&flags);
595 }
596
597 /*
598  * Enable a counter.
599  */
600 static void perf_counter_enable(struct perf_counter *counter)
601 {
602         struct perf_counter_context *ctx = counter->ctx;
603         struct task_struct *task = ctx->task;
604
605         if (!task) {
606                 /*
607                  * Enable the counter on the cpu that it's on
608                  */
609                 smp_call_function_single(counter->cpu, __perf_counter_enable,
610                                          counter, 1);
611                 return;
612         }
613
614         spin_lock_irq(&ctx->lock);
615         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
616                 goto out;
617
618         /*
619          * If the counter is in error state, clear that first.
620          * That way, if we see the counter in error state below, we
621          * know that it has gone back into error state, as distinct
622          * from the task having been scheduled away before the
623          * cross-call arrived.
624          */
625         if (counter->state == PERF_COUNTER_STATE_ERROR)
626                 counter->state = PERF_COUNTER_STATE_OFF;
627
628  retry:
629         spin_unlock_irq(&ctx->lock);
630         task_oncpu_function_call(task, __perf_counter_enable, counter);
631
632         spin_lock_irq(&ctx->lock);
633
634         /*
635          * If the context is active and the counter is still off,
636          * we need to retry the cross-call.
637          */
638         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
639                 goto retry;
640
641         /*
642          * Since we have the lock this context can't be scheduled
643          * in, so we can change the state safely.
644          */
645         if (counter->state == PERF_COUNTER_STATE_OFF)
646                 counter->state = PERF_COUNTER_STATE_INACTIVE;
647  out:
648         spin_unlock_irq(&ctx->lock);
649 }
650
651 /*
652  * Enable a counter and all its children.
653  */
654 static void perf_counter_enable_family(struct perf_counter *counter)
655 {
656         struct perf_counter *child;
657
658         perf_counter_enable(counter);
659
660         /*
661          * Lock the mutex to protect the list of children
662          */
663         mutex_lock(&counter->mutex);
664         list_for_each_entry(child, &counter->child_list, child_list)
665                 perf_counter_enable(child);
666         mutex_unlock(&counter->mutex);
667 }
668
669 void __perf_counter_sched_out(struct perf_counter_context *ctx,
670                               struct perf_cpu_context *cpuctx)
671 {
672         struct perf_counter *counter;
673         u64 flags;
674
675         spin_lock(&ctx->lock);
676         ctx->is_active = 0;
677         if (likely(!ctx->nr_counters))
678                 goto out;
679
680         flags = hw_perf_save_disable();
681         if (ctx->nr_active) {
682                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
683                         group_sched_out(counter, cpuctx, ctx);
684         }
685         hw_perf_restore(flags);
686  out:
687         spin_unlock(&ctx->lock);
688 }
689
690 /*
691  * Called from scheduler to remove the counters of the current task,
692  * with interrupts disabled.
693  *
694  * We stop each counter and update the counter value in counter->count.
695  *
696  * This does not protect us against NMI, but disable()
697  * sets the disabled bit in the control field of counter _before_
698  * accessing the counter control register. If an NMI hits, then it will
699  * not restart the counter.
700  */
701 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
702 {
703         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
704         struct perf_counter_context *ctx = &task->perf_counter_ctx;
705
706         if (likely(!cpuctx->task_ctx))
707                 return;
708
709         __perf_counter_sched_out(ctx, cpuctx);
710
711         cpuctx->task_ctx = NULL;
712 }
713
714 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
715 {
716         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
717 }
718
719 static int
720 group_sched_in(struct perf_counter *group_counter,
721                struct perf_cpu_context *cpuctx,
722                struct perf_counter_context *ctx,
723                int cpu)
724 {
725         struct perf_counter *counter, *partial_group;
726         int ret;
727
728         if (group_counter->state == PERF_COUNTER_STATE_OFF)
729                 return 0;
730
731         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
732         if (ret)
733                 return ret < 0 ? ret : 0;
734
735         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
736                 return -EAGAIN;
737
738         /*
739          * Schedule in siblings as one group (if any):
740          */
741         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
742                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
743                         partial_group = counter;
744                         goto group_error;
745                 }
746         }
747
748         return 0;
749
750 group_error:
751         /*
752          * Groups can be scheduled in as one unit only, so undo any
753          * partial group before returning:
754          */
755         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
756                 if (counter == partial_group)
757                         break;
758                 counter_sched_out(counter, cpuctx, ctx);
759         }
760         counter_sched_out(group_counter, cpuctx, ctx);
761
762         return -EAGAIN;
763 }
764
765 static void
766 __perf_counter_sched_in(struct perf_counter_context *ctx,
767                         struct perf_cpu_context *cpuctx, int cpu)
768 {
769         struct perf_counter *counter;
770         u64 flags;
771         int can_add_hw = 1;
772
773         spin_lock(&ctx->lock);
774         ctx->is_active = 1;
775         if (likely(!ctx->nr_counters))
776                 goto out;
777
778         flags = hw_perf_save_disable();
779
780         /*
781          * First go through the list and put on any pinned groups
782          * in order to give them the best chance of going on.
783          */
784         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
785                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
786                     !counter->hw_event.pinned)
787                         continue;
788                 if (counter->cpu != -1 && counter->cpu != cpu)
789                         continue;
790
791                 if (group_can_go_on(counter, cpuctx, 1))
792                         group_sched_in(counter, cpuctx, ctx, cpu);
793
794                 /*
795                  * If this pinned group hasn't been scheduled,
796                  * put it in error state.
797                  */
798                 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
799                         counter->state = PERF_COUNTER_STATE_ERROR;
800         }
801
802         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
803                 /*
804                  * Ignore counters in OFF or ERROR state, and
805                  * ignore pinned counters since we did them already.
806                  */
807                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
808                     counter->hw_event.pinned)
809                         continue;
810
811                 /*
812          * Honor the 'cpu' scheduling filter constraint
813                  * of counters:
814                  */
815                 if (counter->cpu != -1 && counter->cpu != cpu)
816                         continue;
817
818                 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
819                         if (group_sched_in(counter, cpuctx, ctx, cpu))
820                                 can_add_hw = 0;
821                 }
822         }
823         hw_perf_restore(flags);
824  out:
825         spin_unlock(&ctx->lock);
826 }
827
828 /*
829  * Called from scheduler to add the counters of the current task
830  * with interrupts disabled.
831  *
832  * We restore the counter value and then enable it.
833  *
834  * This does not protect us against NMI, but enable()
835  * sets the enabled bit in the control field of counter _before_
836  * accessing the counter control register. If an NMI hits, then it will
837  * keep the counter running.
838  */
839 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
840 {
841         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
842         struct perf_counter_context *ctx = &task->perf_counter_ctx;
843
844         __perf_counter_sched_in(ctx, cpuctx, cpu);
845         cpuctx->task_ctx = ctx;
846 }
847
848 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
849 {
850         struct perf_counter_context *ctx = &cpuctx->ctx;
851
852         __perf_counter_sched_in(ctx, cpuctx, cpu);
853 }
854
855 int perf_counter_task_disable(void)
856 {
857         struct task_struct *curr = current;
858         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
859         struct perf_counter *counter;
860         unsigned long flags;
861         u64 perf_flags;
862         int cpu;
863
864         if (likely(!ctx->nr_counters))
865                 return 0;
866
867         curr_rq_lock_irq_save(&flags);
868         cpu = smp_processor_id();
869
870         /* force the update of the task clock: */
871         __task_delta_exec(curr, 1);
872
873         perf_counter_task_sched_out(curr, cpu);
874
875         spin_lock(&ctx->lock);
876
877         /*
878          * Disable all the counters:
879          */
880         perf_flags = hw_perf_save_disable();
881
882         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
883                 if (counter->state != PERF_COUNTER_STATE_ERROR)
884                         counter->state = PERF_COUNTER_STATE_OFF;
885         }
886
887         hw_perf_restore(perf_flags);
888
889         spin_unlock(&ctx->lock);
890
891         curr_rq_unlock_irq_restore(&flags);
892
893         return 0;
894 }
895
896 int perf_counter_task_enable(void)
897 {
898         struct task_struct *curr = current;
899         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
900         struct perf_counter *counter;
901         unsigned long flags;
902         u64 perf_flags;
903         int cpu;
904
905         if (likely(!ctx->nr_counters))
906                 return 0;
907
908         curr_rq_lock_irq_save(&flags);
909         cpu = smp_processor_id();
910
911         /* force the update of the task clock: */
912         __task_delta_exec(curr, 1);
913
914         perf_counter_task_sched_out(curr, cpu);
915
916         spin_lock(&ctx->lock);
917
918         /*
919          * Enable all the counters:
920          */
921         perf_flags = hw_perf_save_disable();
922
923         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
924                 if (counter->state > PERF_COUNTER_STATE_OFF)
925                         continue;
926                 counter->state = PERF_COUNTER_STATE_INACTIVE;
927                 counter->hw_event.disabled = 0;
928         }
929         hw_perf_restore(perf_flags);
930
931         spin_unlock(&ctx->lock);
932
933         perf_counter_task_sched_in(curr, cpu);
934
935         curr_rq_unlock_irq_restore(&flags);
936
937         return 0;
938 }
939
940 /*
941  * Round-robin a context's counters:
942  */
943 static void rotate_ctx(struct perf_counter_context *ctx)
944 {
945         struct perf_counter *counter;
946         u64 perf_flags;
947
948         if (!ctx->nr_counters)
949                 return;
950
951         spin_lock(&ctx->lock);
952         /*
953          * Rotate the first entry last (works just fine for group counters too):
954          */
955         perf_flags = hw_perf_save_disable();
956         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
957                 list_del(&counter->list_entry);
958                 list_add_tail(&counter->list_entry, &ctx->counter_list);
959                 break;
960         }
961         hw_perf_restore(perf_flags);
962
963         spin_unlock(&ctx->lock);
964 }
965
966 void perf_counter_task_tick(struct task_struct *curr, int cpu)
967 {
968         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
969         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
970         const int rotate_percpu = 0;
971
972         if (rotate_percpu)
973                 perf_counter_cpu_sched_out(cpuctx);
974         perf_counter_task_sched_out(curr, cpu);
975
976         if (rotate_percpu)
977                 rotate_ctx(&cpuctx->ctx);
978         rotate_ctx(ctx);
979
980         if (rotate_percpu)
981                 perf_counter_cpu_sched_in(cpuctx, cpu);
982         perf_counter_task_sched_in(curr, cpu);
983 }
984
985 /*
986  * Cross CPU call to read the hardware counter
987  */
988 static void __read(void *info)
989 {
990         struct perf_counter *counter = info;
991         unsigned long flags;
992
993         curr_rq_lock_irq_save(&flags);
994         counter->hw_ops->read(counter);
995         curr_rq_unlock_irq_restore(&flags);
996 }
997
998 static u64 perf_counter_read(struct perf_counter *counter)
999 {
1000         /*
1001          * If counter is enabled and currently active on a CPU, update the
1002          * value in the counter structure:
1003          */
1004         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1005                 smp_call_function_single(counter->oncpu,
1006                                          __read, counter, 1);
1007         }
1008
1009         return atomic64_read(&counter->count);
1010 }
1011
1012 /*
1013  * Cross CPU call to switch performance data pointers
1014  */
1015 static void __perf_switch_irq_data(void *info)
1016 {
1017         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1018         struct perf_counter *counter = info;
1019         struct perf_counter_context *ctx = counter->ctx;
1020         struct perf_data *oldirqdata = counter->irqdata;
1021
1022         /*
1023          * If this is a task context, we need to check whether it is
1024          * the current task context of this cpu. If not it has been
1025          * scheduled out before the smp call arrived.
1026          */
1027         if (ctx->task) {
1028                 if (cpuctx->task_ctx != ctx)
1029                         return;
1030                 spin_lock(&ctx->lock);
1031         }
1032
1033         /* Change the pointer in an NMI-safe way */
1034         atomic_long_set((atomic_long_t *)&counter->irqdata,
1035                         (unsigned long) counter->usrdata);
1036         counter->usrdata = oldirqdata;
1037
1038         if (ctx->task)
1039                 spin_unlock(&ctx->lock);
1040 }
1041
1042 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1043 {
1044         struct perf_counter_context *ctx = counter->ctx;
1045         struct perf_data *oldirqdata = counter->irqdata;
1046         struct task_struct *task = ctx->task;
1047
1048         if (!task) {
1049                 smp_call_function_single(counter->cpu,
1050                                          __perf_switch_irq_data,
1051                                          counter, 1);
1052                 return counter->usrdata;
1053         }
1054
1055 retry:
1056         spin_lock_irq(&ctx->lock);
1057         if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1058                 counter->irqdata = counter->usrdata;
1059                 counter->usrdata = oldirqdata;
1060                 spin_unlock_irq(&ctx->lock);
1061                 return oldirqdata;
1062         }
1063         spin_unlock_irq(&ctx->lock);
1064         task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1065         /* Might have failed, because task was scheduled out */
1066         if (counter->irqdata == oldirqdata)
1067                 goto retry;
1068
1069         return counter->usrdata;
1070 }
1071
1072 static void put_context(struct perf_counter_context *ctx)
1073 {
1074         if (ctx->task)
1075                 put_task_struct(ctx->task);
1076 }
1077
1078 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1079 {
1080         struct perf_cpu_context *cpuctx;
1081         struct perf_counter_context *ctx;
1082         struct task_struct *task;
1083
1084         /*
1085          * If cpu is not a wildcard then this is a percpu counter:
1086          */
1087         if (cpu != -1) {
1088                 /* Must be root to operate on a CPU counter: */
1089                 if (!capable(CAP_SYS_ADMIN))
1090                         return ERR_PTR(-EACCES);
1091
1092                 if (cpu < 0 || cpu > num_possible_cpus())
1093                         return ERR_PTR(-EINVAL);
1094
1095                 /*
1096                  * We could be clever and allow to attach a counter to an
1097                  * offline CPU and activate it when the CPU comes up, but
1098                  * that's for later.
1099                  */
1100                 if (!cpu_isset(cpu, cpu_online_map))
1101                         return ERR_PTR(-ENODEV);
1102
1103                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1104                 ctx = &cpuctx->ctx;
1105
1106                 return ctx;
1107         }
1108
1109         rcu_read_lock();
1110         if (!pid)
1111                 task = current;
1112         else
1113                 task = find_task_by_vpid(pid);
1114         if (task)
1115                 get_task_struct(task);
1116         rcu_read_unlock();
1117
1118         if (!task)
1119                 return ERR_PTR(-ESRCH);
1120
1121         ctx = &task->perf_counter_ctx;
1122         ctx->task = task;
1123
1124         /* Reuse ptrace permission checks for now. */
1125         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1126                 put_context(ctx);
1127                 return ERR_PTR(-EACCES);
1128         }
1129
1130         return ctx;
1131 }
1132
1133 /*
1134  * Called when the last reference to the file is gone.
1135  */
1136 static int perf_release(struct inode *inode, struct file *file)
1137 {
1138         struct perf_counter *counter = file->private_data;
1139         struct perf_counter_context *ctx = counter->ctx;
1140
1141         file->private_data = NULL;
1142
1143         mutex_lock(&ctx->mutex);
1144         mutex_lock(&counter->mutex);
1145
1146         perf_counter_remove_from_context(counter);
1147         put_context(ctx);
1148
1149         mutex_unlock(&counter->mutex);
1150         mutex_unlock(&ctx->mutex);
1151
1152         kfree(counter);
1153
1154         return 0;
1155 }
1156
1157 /*
1158  * Read the performance counter - simple non-blocking version for now
1159  */
1160 static ssize_t
1161 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1162 {
1163         u64 cntval;
1164
1165         if (count != sizeof(cntval))
1166                 return -EINVAL;
1167
1168         /*
1169          * Return end-of-file for a read on a counter that is in
1170          * error state (i.e. because it was pinned but it couldn't be
1171          * scheduled on to the CPU at some point).
1172          */
1173         if (counter->state == PERF_COUNTER_STATE_ERROR)
1174                 return 0;
1175
1176         mutex_lock(&counter->mutex);
1177         cntval = perf_counter_read(counter);
1178         mutex_unlock(&counter->mutex);
1179
1180         return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1181 }
1182
1183 static ssize_t
1184 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1185 {
1186         if (!usrdata->len)
1187                 return 0;
1188
1189         count = min(count, (size_t)usrdata->len);
1190         if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1191                 return -EFAULT;
1192
1193         /* Adjust the counters */
1194         usrdata->len -= count;
1195         if (!usrdata->len)
1196                 usrdata->rd_idx = 0;
1197         else
1198                 usrdata->rd_idx += count;
1199
1200         return count;
1201 }
1202
1203 static ssize_t
1204 perf_read_irq_data(struct perf_counter  *counter,
1205                    char __user          *buf,
1206                    size_t               count,
1207                    int                  nonblocking)
1208 {
1209         struct perf_data *irqdata, *usrdata;
1210         DECLARE_WAITQUEUE(wait, current);
1211         ssize_t res, res2;
1212
1213         irqdata = counter->irqdata;
1214         usrdata = counter->usrdata;
1215
1216         if (usrdata->len + irqdata->len >= count)
1217                 goto read_pending;
1218
1219         if (nonblocking)
1220                 return -EAGAIN;
1221
1222         spin_lock_irq(&counter->waitq.lock);
1223         __add_wait_queue(&counter->waitq, &wait);
1224         for (;;) {
1225                 set_current_state(TASK_INTERRUPTIBLE);
1226                 if (usrdata->len + irqdata->len >= count)
1227                         break;
1228
1229                 if (signal_pending(current))
1230                         break;
1231
1232                 if (counter->state == PERF_COUNTER_STATE_ERROR)
1233                         break;
1234
1235                 spin_unlock_irq(&counter->waitq.lock);
1236                 schedule();
1237                 spin_lock_irq(&counter->waitq.lock);
1238         }
1239         __remove_wait_queue(&counter->waitq, &wait);
1240         __set_current_state(TASK_RUNNING);
1241         spin_unlock_irq(&counter->waitq.lock);
1242
1243         if (usrdata->len + irqdata->len < count &&
1244             counter->state != PERF_COUNTER_STATE_ERROR)
1245                 return -ERESTARTSYS;
1246 read_pending:
1247         mutex_lock(&counter->mutex);
1248
1249         /* Drain pending data first: */
1250         res = perf_copy_usrdata(usrdata, buf, count);
1251         if (res < 0 || res == count)
1252                 goto out;
1253
1254         /* Switch irq buffer: */
1255         usrdata = perf_switch_irq_data(counter);
1256         res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1257         if (res2 < 0) {
1258                 if (!res)
1259                         res = -EFAULT;
1260         } else {
1261                 res += res2;
1262         }
1263 out:
1264         mutex_unlock(&counter->mutex);
1265
1266         return res;
1267 }
1268
1269 static ssize_t
1270 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1271 {
1272         struct perf_counter *counter = file->private_data;
1273
1274         switch (counter->hw_event.record_type) {
1275         case PERF_RECORD_SIMPLE:
1276                 return perf_read_hw(counter, buf, count);
1277
1278         case PERF_RECORD_IRQ:
1279         case PERF_RECORD_GROUP:
1280                 return perf_read_irq_data(counter, buf, count,
1281                                           file->f_flags & O_NONBLOCK);
1282         }
1283         return -EINVAL;
1284 }
1285
1286 static unsigned int perf_poll(struct file *file, poll_table *wait)
1287 {
1288         struct perf_counter *counter = file->private_data;
1289         unsigned int events = 0;
1290         unsigned long flags;
1291
1292         poll_wait(file, &counter->waitq, wait);
1293
1294         spin_lock_irqsave(&counter->waitq.lock, flags);
1295         if (counter->usrdata->len || counter->irqdata->len)
1296                 events |= POLLIN;
1297         spin_unlock_irqrestore(&counter->waitq.lock, flags);
1298
1299         return events;
1300 }
1301
1302 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1303 {
1304         struct perf_counter *counter = file->private_data;
1305         int err = 0;
1306
1307         switch (cmd) {
1308         case PERF_COUNTER_IOC_ENABLE:
1309                 perf_counter_enable_family(counter);
1310                 break;
1311         case PERF_COUNTER_IOC_DISABLE:
1312                 perf_counter_disable_family(counter);
1313                 break;
1314         default:
1315                 err = -ENOTTY;
1316         }
1317         return err;
1318 }
1319
1320 static const struct file_operations perf_fops = {
1321         .release                = perf_release,
1322         .read                   = perf_read,
1323         .poll                   = perf_poll,
1324         .unlocked_ioctl         = perf_ioctl,
1325         .compat_ioctl           = perf_ioctl,
1326 };
1327
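/*
 * Informal user-space sketch (not part of this file) of driving a counter
 * fd through the file operations above, assuming a PERF_RECORD_SIMPLE
 * counter and the ioctl commands from <linux/perf_counter.h>:
 *
 *	u64 value;
 *
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
 *	... run the workload to be measured ...
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
 *
 *	if (read(fd, &value, sizeof(value)) == sizeof(value))
 *		printf("count: %llu\n", (unsigned long long)value);
 */
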
1328 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1329 {
1330         int cpu = raw_smp_processor_id();
1331
1332         atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
1333         return 0;
1334 }
1335
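/*
 * All of the software counters below share one pattern: hw.prev_count holds
 * the last observed value of the underlying source (cpu_clock(), the task
 * clock, fault/switch/migration counts), and each update adds the delta
 * since that snapshot to counter->count, e.g. for the cpu-clock counter:
 *
 *	delta = cpu_clock(cpu) - prev;		(nanoseconds)
 *	counter->count += delta;
 */
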
1336 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1337 {
1338         int cpu = raw_smp_processor_id();
1339         s64 prev;
1340         u64 now;
1341
1342         now = cpu_clock(cpu);
1343         prev = atomic64_read(&counter->hw.prev_count);
1344         atomic64_set(&counter->hw.prev_count, now);
1345         atomic64_add(now - prev, &counter->count);
1346 }
1347
1348 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1349 {
1350         cpu_clock_perf_counter_update(counter);
1351 }
1352
1353 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
1354 {
1355         cpu_clock_perf_counter_update(counter);
1356 }
1357
1358 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1359         .enable         = cpu_clock_perf_counter_enable,
1360         .disable        = cpu_clock_perf_counter_disable,
1361         .read           = cpu_clock_perf_counter_read,
1362 };
1363
1364 /*
1365  * Called from within the scheduler:
1366  */
1367 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1368 {
1369         struct task_struct *curr = counter->task;
1370         u64 delta;
1371
1372         delta = __task_delta_exec(curr, update);
1373
1374         return curr->se.sum_exec_runtime + delta;
1375 }
1376
1377 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1378 {
1379         u64 prev;
1380         s64 delta;
1381
1382         prev = atomic64_read(&counter->hw.prev_count);
1383
1384         atomic64_set(&counter->hw.prev_count, now);
1385
1386         delta = now - prev;
1387
1388         atomic64_add(delta, &counter->count);
1389 }
1390
1391 static void task_clock_perf_counter_read(struct perf_counter *counter)
1392 {
1393         u64 now = task_clock_perf_counter_val(counter, 1);
1394
1395         task_clock_perf_counter_update(counter, now);
1396 }
1397
1398 static int task_clock_perf_counter_enable(struct perf_counter *counter)
1399 {
1400         u64 now = task_clock_perf_counter_val(counter, 0);
1401
1402         atomic64_set(&counter->hw.prev_count, now);
1403
1404         return 0;
1405 }
1406
1407 static void task_clock_perf_counter_disable(struct perf_counter *counter)
1408 {
1409         u64 now = task_clock_perf_counter_val(counter, 0);
1410
1411         task_clock_perf_counter_update(counter, now);
1412 }
1413
1414 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1415         .enable         = task_clock_perf_counter_enable,
1416         .disable        = task_clock_perf_counter_disable,
1417         .read           = task_clock_perf_counter_read,
1418 };
1419
1420 static u64 get_page_faults(void)
1421 {
1422         struct task_struct *curr = current;
1423
1424         return curr->maj_flt + curr->min_flt;
1425 }
1426
1427 static void page_faults_perf_counter_update(struct perf_counter *counter)
1428 {
1429         u64 prev, now;
1430         s64 delta;
1431
1432         prev = atomic64_read(&counter->hw.prev_count);
1433         now = get_page_faults();
1434
1435         atomic64_set(&counter->hw.prev_count, now);
1436
1437         delta = now - prev;
1438
1439         atomic64_add(delta, &counter->count);
1440 }
1441
1442 static void page_faults_perf_counter_read(struct perf_counter *counter)
1443 {
1444         page_faults_perf_counter_update(counter);
1445 }
1446
1447 static int page_faults_perf_counter_enable(struct perf_counter *counter)
1448 {
1449         /*
1450          * page-faults is a per-task value already,
1451          * so we don't have to clear it on switch-in.
1452          */
1453
1454         return 0;
1455 }
1456
1457 static void page_faults_perf_counter_disable(struct perf_counter *counter)
1458 {
1459         page_faults_perf_counter_update(counter);
1460 }
1461
1462 static const struct hw_perf_counter_ops perf_ops_page_faults = {
1463         .enable         = page_faults_perf_counter_enable,
1464         .disable        = page_faults_perf_counter_disable,
1465         .read           = page_faults_perf_counter_read,
1466 };
1467
1468 static u64 get_context_switches(void)
1469 {
1470         struct task_struct *curr = current;
1471
1472         return curr->nvcsw + curr->nivcsw;
1473 }
1474
1475 static void context_switches_perf_counter_update(struct perf_counter *counter)
1476 {
1477         u64 prev, now;
1478         s64 delta;
1479
1480         prev = atomic64_read(&counter->hw.prev_count);
1481         now = get_context_switches();
1482
1483         atomic64_set(&counter->hw.prev_count, now);
1484
1485         delta = now - prev;
1486
1487         atomic64_add(delta, &counter->count);
1488 }
1489
1490 static void context_switches_perf_counter_read(struct perf_counter *counter)
1491 {
1492         context_switches_perf_counter_update(counter);
1493 }
1494
1495 static int context_switches_perf_counter_enable(struct perf_counter *counter)
1496 {
1497         /*
1498          * curr->nvcsw + curr->nivcsw is a per-task value already,
1499          * so we don't have to clear it on switch-in.
1500          */
1501
1502         return 0;
1503 }
1504
1505 static void context_switches_perf_counter_disable(struct perf_counter *counter)
1506 {
1507         context_switches_perf_counter_update(counter);
1508 }
1509
1510 static const struct hw_perf_counter_ops perf_ops_context_switches = {
1511         .enable         = context_switches_perf_counter_enable,
1512         .disable        = context_switches_perf_counter_disable,
1513         .read           = context_switches_perf_counter_read,
1514 };
1515
1516 static inline u64 get_cpu_migrations(void)
1517 {
1518         return current->se.nr_migrations;
1519 }
1520
1521 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1522 {
1523         u64 prev, now;
1524         s64 delta;
1525
1526         prev = atomic64_read(&counter->hw.prev_count);
1527         now = get_cpu_migrations();
1528
1529         atomic64_set(&counter->hw.prev_count, now);
1530
1531         delta = now - prev;
1532
1533         atomic64_add(delta, &counter->count);
1534 }
1535
1536 static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1537 {
1538         cpu_migrations_perf_counter_update(counter);
1539 }
1540
1541 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1542 {
1543         /*
1544          * se.nr_migrations is a per-task value already,
1545          * so we don't have to clear it on switch-in.
1546          */
1547
1548         return 0;
1549 }
1550
1551 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1552 {
1553         cpu_migrations_perf_counter_update(counter);
1554 }
1555
1556 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1557         .enable         = cpu_migrations_perf_counter_enable,
1558         .disable        = cpu_migrations_perf_counter_disable,
1559         .read           = cpu_migrations_perf_counter_read,
1560 };
1561
1562 static const struct hw_perf_counter_ops *
1563 sw_perf_counter_init(struct perf_counter *counter)
1564 {
1565         const struct hw_perf_counter_ops *hw_ops = NULL;
1566
1567         switch (counter->hw_event.type) {
1568         case PERF_COUNT_CPU_CLOCK:
1569                 hw_ops = &perf_ops_cpu_clock;
1570                 break;
1571         case PERF_COUNT_TASK_CLOCK:
1572                 hw_ops = &perf_ops_task_clock;
1573                 break;
1574         case PERF_COUNT_PAGE_FAULTS:
1575                 hw_ops = &perf_ops_page_faults;
1576                 break;
1577         case PERF_COUNT_CONTEXT_SWITCHES:
1578                 hw_ops = &perf_ops_context_switches;
1579                 break;
1580         case PERF_COUNT_CPU_MIGRATIONS:
1581                 hw_ops = &perf_ops_cpu_migrations;
1582                 break;
1583         default:
1584                 break;
1585         }
1586         return hw_ops;
1587 }
1588
1589 /*
1590  * Allocate and initialize a counter structure
1591  */
1592 static struct perf_counter *
1593 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1594                    int cpu,
1595                    struct perf_counter *group_leader,
1596                    gfp_t gfpflags)
1597 {
1598         const struct hw_perf_counter_ops *hw_ops;
1599         struct perf_counter *counter;
1600
1601         counter = kzalloc(sizeof(*counter), gfpflags);
1602         if (!counter)
1603                 return NULL;
1604
1605         /*
1606          * Single counters are their own group leaders, with an
1607          * empty sibling list:
1608          */
1609         if (!group_leader)
1610                 group_leader = counter;
1611
1612         mutex_init(&counter->mutex);
1613         INIT_LIST_HEAD(&counter->list_entry);
1614         INIT_LIST_HEAD(&counter->sibling_list);
1615         init_waitqueue_head(&counter->waitq);
1616
1617         INIT_LIST_HEAD(&counter->child_list);
1618
1619         counter->irqdata                = &counter->data[0];
1620         counter->usrdata                = &counter->data[1];
1621         counter->cpu                    = cpu;
1622         counter->hw_event               = *hw_event;
1623         counter->wakeup_pending         = 0;
1624         counter->group_leader           = group_leader;
1625         counter->hw_ops                 = NULL;
1626
1627         counter->state = PERF_COUNTER_STATE_INACTIVE;
1628         if (hw_event->disabled)
1629                 counter->state = PERF_COUNTER_STATE_OFF;
1630
1631         hw_ops = NULL;
1632         if (!hw_event->raw && hw_event->type < 0)
1633                 hw_ops = sw_perf_counter_init(counter);
1634         if (!hw_ops)
1635                 hw_ops = hw_perf_counter_init(counter);
1636
1637         if (!hw_ops) {
1638                 kfree(counter);
1639                 return NULL;
1640         }
1641         counter->hw_ops = hw_ops;
1642
1643         return counter;
1644 }
1645
1646 /**
1647  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
1648  *
1649  * @hw_event_uptr:      event type attributes for monitoring/sampling
1650  * @pid:                target pid
1651  * @cpu:                target cpu
1652  * @group_fd:           group leader counter fd
1653  */
1654 asmlinkage int
1655 sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1656                       pid_t pid, int cpu, int group_fd)
1657 {
1658         struct perf_counter *counter, *group_leader;
1659         struct perf_counter_hw_event hw_event;
1660         struct perf_counter_context *ctx;
1661         struct file *counter_file = NULL;
1662         struct file *group_file = NULL;
1663         int fput_needed = 0;
1664         int fput_needed2 = 0;
1665         int ret;
1666
1667         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1668                 return -EFAULT;
1669
1670         /*
1671          * Get the target context (task or percpu):
1672          */
1673         ctx = find_get_context(pid, cpu);
1674         if (IS_ERR(ctx))
1675                 return PTR_ERR(ctx);
1676
1677         /*
1678          * Look up the group leader (we will attach this counter to it):
1679          */
1680         group_leader = NULL;
1681         if (group_fd != -1) {
1682                 ret = -EINVAL;
1683                 group_file = fget_light(group_fd, &fput_needed);
1684                 if (!group_file)
1685                         goto err_put_context;
1686                 if (group_file->f_op != &perf_fops)
1687                         goto err_put_context;
1688
1689                 group_leader = group_file->private_data;
1690                 /*
1691                  * Do not allow a recursive hierarchy (the counter passed
1692                  * as group leader must not itself be a sibling of another group):
1693                  */
1694                 if (group_leader->group_leader != group_leader)
1695                         goto err_put_context;
1696                 /*
1697                  * Do not allow attaching to a group in a different
1698                  * task or CPU context:
1699                  */
1700                 if (group_leader->ctx != ctx)
1701                         goto err_put_context;
1702                 /*
1703                  * Only a group leader can be exclusive or pinned
1704                  */
1705                 if (hw_event.exclusive || hw_event.pinned)
1706                         goto err_put_context;
1707         }
1708
1709         ret = -EINVAL;
1710         counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
1711         if (!counter)
1712                 goto err_put_context;
1713
1714         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1715         if (ret < 0)
1716                 goto err_free_put_context;
1717
1718         counter_file = fget_light(ret, &fput_needed2);
1719         if (!counter_file)
1720                 goto err_free_put_context;
1721
1722         counter->filp = counter_file;
1723         mutex_lock(&ctx->mutex);
1724         perf_install_in_context(ctx, counter, cpu);
1725         mutex_unlock(&ctx->mutex);
1726
1727         fput_light(counter_file, fput_needed2);
1728
1729 out_fput:
1730         fput_light(group_file, fput_needed);
1731
1732         return ret;
1733
1734 err_free_put_context:
1735         kfree(counter);
1736
1737 err_put_context:
1738         put_context(ctx);
1739
1740         goto out_fput;
1741 }
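
/*
 * Illustrative sketch only (not part of the original source): roughly how
 * a user-space tool might invoke this system call. The syscall number
 * __NR_perf_counter_open and the exact uapi details are assumptions that
 * depend on the architecture and headers in use:
 *
 *	struct perf_counter_hw_event hw_event = {
 *		.type	= PERF_COUNT_CONTEXT_SWITCHES,
 *	};
 *	u64 count;
 *	int fd;
 *
 *	fd = syscall(__NR_perf_counter_open, &hw_event,
 *		     0,		// pid 0: count in the current task
 *		     -1,	// cpu -1: on whatever CPU the task runs
 *		     -1);	// group_fd -1: no group leader
 *	read(fd, &count, sizeof(count));
 *	close(fd);
 */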
1742
1743 /*
1744  * Initialize the perf_counter context in a task_struct:
1745  */
1746 static void
1747 __perf_counter_init_context(struct perf_counter_context *ctx,
1748                             struct task_struct *task)
1749 {
1750         memset(ctx, 0, sizeof(*ctx));
1751         spin_lock_init(&ctx->lock);
1752         mutex_init(&ctx->mutex);
1753         INIT_LIST_HEAD(&ctx->counter_list);
1754         ctx->task = task;
1755 }
1756
1757 /*
1758  * inherit a counter from parent task to child task:
1759  */
1760 static struct perf_counter *
1761 inherit_counter(struct perf_counter *parent_counter,
1762               struct task_struct *parent,
1763               struct perf_counter_context *parent_ctx,
1764               struct task_struct *child,
1765               struct perf_counter *group_leader,
1766               struct perf_counter_context *child_ctx)
1767 {
1768         struct perf_counter *child_counter;
1769
1770         /*
1771          * Instead of creating recursive hierarchies of counters,
1772          * we link inherited counters back to the original parent,
1773          * which is guaranteed to have a filp that we use for
1774          * reference counting:
1775          */
1776         if (parent_counter->parent)
1777                 parent_counter = parent_counter->parent;
1778
1779         child_counter = perf_counter_alloc(&parent_counter->hw_event,
1780                                             parent_counter->cpu, group_leader,
1781                                             GFP_KERNEL);
1782         if (!child_counter)
1783                 return NULL;
1784
1785         /*
1786          * Link it up in the child's context:
1787          */
1788         child_counter->ctx = child_ctx;
1789         child_counter->task = child;
1790         list_add_counter(child_counter, child_ctx);
1791         child_ctx->nr_counters++;
1792
1793         child_counter->parent = parent_counter;
1794         /*
1795          * inherit into child's child as well:
1796          */
1797         child_counter->hw_event.inherit = 1;
1798
1799         /*
1800          * Get a reference to the parent filp - we will fput it
1801          * when the child counter exits. This is safe to do because
1802          * we are in the parent and we know that the filp still
1803          * exists and has a nonzero count:
1804          */
1805         atomic_long_inc(&parent_counter->filp->f_count);
1806
1807         /*
1808          * Link this into the parent counter's child list
1809          */
1810         mutex_lock(&parent_counter->mutex);
1811         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
1812
1813         /*
1814          * Make the child state follow the state of the parent counter,
1815          * not its hw_event.disabled bit.  We hold the parent's mutex,
1816          * so we won't race with perf_counter_{en,dis}able_family.
1817          */
1818         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
1819                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1820         else
1821                 child_counter->state = PERF_COUNTER_STATE_OFF;
1822
1823         mutex_unlock(&parent_counter->mutex);
1824
1825         return child_counter;
1826 }
1827
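/*
 * Inherit a whole counter group: clone the group leader first, then
 * clone each sibling into the new leader's group.
 */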
1828 static int inherit_group(struct perf_counter *parent_counter,
1829               struct task_struct *parent,
1830               struct perf_counter_context *parent_ctx,
1831               struct task_struct *child,
1832               struct perf_counter_context *child_ctx)
1833 {
1834         struct perf_counter *leader;
1835         struct perf_counter *sub;
1836
1837         leader = inherit_counter(parent_counter, parent, parent_ctx,
1838                                  child, NULL, child_ctx);
1839         if (!leader)
1840                 return -ENOMEM;
1841         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
1842                 if (!inherit_counter(sub, parent, parent_ctx,
1843                                      child, leader, child_ctx))
1844                         return -ENOMEM;
1845         }
1846         return 0;
1847 }
1848
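/*
 * Fold a dead child counter's count back into its parent counter and
 * unlink it from the parent's child list, dropping the filp reference
 * taken in inherit_counter().
 */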
1849 static void sync_child_counter(struct perf_counter *child_counter,
1850                                struct perf_counter *parent_counter)
1851 {
1852         u64 parent_val, child_val;
1853
1854         parent_val = atomic64_read(&parent_counter->count);
1855         child_val = atomic64_read(&child_counter->count);
1856
1857         /*
1858          * Add back the child's count to the parent's count:
1859          */
1860         atomic64_add(child_val, &parent_counter->count);
1861
1862         /*
1863          * Remove this counter from the parent's list
1864          */
1865         mutex_lock(&parent_counter->mutex);
1866         list_del_init(&child_counter->child_list);
1867         mutex_unlock(&parent_counter->mutex);
1868
1869         /*
1870          * Release the parent counter, if this was the last
1871          * reference to it.
1872          */
1873         fput(parent_counter->filp);
1874 }
1875
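/*
 * Tear down one counter of an exiting child task: take it off the CPU,
 * unlink it from the child context, and - if it was inherited - fold
 * its count (and its siblings' counts) back into the parent before
 * freeing it.
 */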
1876 static void
1877 __perf_counter_exit_task(struct task_struct *child,
1878                          struct perf_counter *child_counter,
1879                          struct perf_counter_context *child_ctx)
1880 {
1881         struct perf_counter *parent_counter;
1882         struct perf_counter *sub, *tmp;
1883
1884         /*
1885          * If we do not self-reap then we have to wait for the
1886          * child task to unschedule (it will happen for sure),
1887          * so that its counter is at its final count. (This
1888          * condition triggers rarely - child tasks usually get
1889          * off their CPU before the parent has a chance to
1890          * get this far into the reaping action)
1891          */
1892         if (child != current) {
1893                 wait_task_inactive(child, 0);
1894                 list_del_init(&child_counter->list_entry);
1895         } else {
1896                 struct perf_cpu_context *cpuctx;
1897                 unsigned long flags;
1898                 u64 perf_flags;
1899
1900                 /*
1901                  * Disable and unlink this counter.
1902                  *
1903                  * Be careful about zapping the list - IRQ/NMI context
1904                  * could still be processing it:
1905                  */
1906                 curr_rq_lock_irq_save(&flags);
1907                 perf_flags = hw_perf_save_disable();
1908
1909                 cpuctx = &__get_cpu_var(perf_cpu_context);
1910
1911                 group_sched_out(child_counter, cpuctx, child_ctx);
1912
1913                 list_del_init(&child_counter->list_entry);
1914
1915                 child_ctx->nr_counters--;
1916
1917                 hw_perf_restore(perf_flags);
1918                 curr_rq_unlock_irq_restore(&flags);
1919         }
1920
1921         parent_counter = child_counter->parent;
1922         /*
1923          * It can happen that the parent exits first, and that its
1924          * counters are then kept alive only by the child's reference.
1925          * Fold the child's counts back into them and drop that
1926          * reference here.
1926          */
1927         if (parent_counter) {
1928                 sync_child_counter(child_counter, parent_counter);
1929                 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
1930                                          list_entry) {
1931                         if (sub->parent)
1932                                 sync_child_counter(sub, sub->parent);
1933                         kfree(sub);
1934                 }
1935         }
1936
1937         kfree(child_counter);
1938 }
1939
1940 /*
1941  * When a child task exits, feed back counter values to parent counters.
1942  *
1943  * Note: we may be running in child context, but the PID is not hashed
1944  * anymore so new counters will not be added.
1945  */
1946 void perf_counter_exit_task(struct task_struct *child)
1947 {
1948         struct perf_counter *child_counter, *tmp;
1949         struct perf_counter_context *child_ctx;
1950
1951         child_ctx = &child->perf_counter_ctx;
1952
1953         if (likely(!child_ctx->nr_counters))
1954                 return;
1955
1956         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1957                                  list_entry)
1958                 __perf_counter_exit_task(child, child_counter, child_ctx);
1959 }
1960
1961 /*
1962  * Initialize the perf_counter context in a task_struct
1963  */
1964 void perf_counter_init_task(struct task_struct *child)
1965 {
1966         struct perf_counter_context *child_ctx, *parent_ctx;
1967         struct perf_counter *counter;
1968         struct task_struct *parent = current;
1969
1970         child_ctx  = &child->perf_counter_ctx;
1971         parent_ctx = &parent->perf_counter_ctx;
1972
1973         __perf_counter_init_context(child_ctx, child);
1974
1975         /*
1976          * This is executed from the parent task context, so inherit
1977          * counters that have been marked for cloning:
1978          */
1979
1980         if (likely(!parent_ctx->nr_counters))
1981                 return;
1982
1983         /*
1984          * Lock the parent list. No need to lock the child - not PID
1985          * hashed yet and not running, so nobody can access it.
1986          */
1987         mutex_lock(&parent_ctx->mutex);
1988
1989         /*
1990          * We don't have to disable NMIs - we are only looking at
1991          * the list, not manipulating it:
1992          */
1993         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
1994                 if (!counter->hw_event.inherit)
1995                         continue;
1996
1997                 if (inherit_group(counter, parent,
1998                                   parent_ctx, child, child_ctx))
1999                         break;
2000         }
2001
2002         mutex_unlock(&parent_ctx->mutex);
2003 }
2004
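/*
 * Set up the per-CPU counter context when a CPU is brought up (also
 * called for the boot CPU from perf_counter_init() below).
 */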
2005 static void __cpuinit perf_counter_init_cpu(int cpu)
2006 {
2007         struct perf_cpu_context *cpuctx;
2008
2009         cpuctx = &per_cpu(perf_cpu_context, cpu);
2010         __perf_counter_init_context(&cpuctx->ctx, NULL);
2011
2012         mutex_lock(&perf_resource_mutex);
2013         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2014         mutex_unlock(&perf_resource_mutex);
2015
2016         hw_perf_counter_setup(cpu);
2017 }
2018
2019 #ifdef CONFIG_HOTPLUG_CPU
2020 static void __perf_counter_exit_cpu(void *info)
2021 {
2022         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2023         struct perf_counter_context *ctx = &cpuctx->ctx;
2024         struct perf_counter *counter, *tmp;
2025
2026         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2027                 __perf_counter_remove_from_context(counter);
2028 }
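
/*
 * Called on CPU_DOWN_PREPARE: the removal has to run on the target CPU
 * itself, so use a synchronous cross-CPU call:
 */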
2029 static void perf_counter_exit_cpu(int cpu)
2030 {
2031         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2032         struct perf_counter_context *ctx = &cpuctx->ctx;
2033
2034         mutex_lock(&ctx->mutex);
2035         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
2036         mutex_unlock(&ctx->mutex);
2037 }
2038 #else
2039 static inline void perf_counter_exit_cpu(int cpu) { }
2040 #endif
2041
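/*
 * CPU hotplug callback: initialize a CPU's context before it comes
 * online and tear it down before it goes offline.
 */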
2042 static int __cpuinit
2043 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2044 {
2045         unsigned int cpu = (long)hcpu;
2046
2047         switch (action) {
2048
2049         case CPU_UP_PREPARE:
2050         case CPU_UP_PREPARE_FROZEN:
2051                 perf_counter_init_cpu(cpu);
2052                 break;
2053
2054         case CPU_DOWN_PREPARE:
2055         case CPU_DOWN_PREPARE_FROZEN:
2056                 perf_counter_exit_cpu(cpu);
2057                 break;
2058
2059         default:
2060                 break;
2061         }
2062
2063         return NOTIFY_OK;
2064 }
2065
2066 static struct notifier_block __cpuinitdata perf_cpu_nb = {
2067         .notifier_call          = perf_cpu_notify,
2068 };
2069
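/*
 * Initialize the boot CPU's context directly (the notifier is not
 * registered yet at this point), then register for CPU hotplug events:
 */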
2070 static int __init perf_counter_init(void)
2071 {
2072         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2073                         (void *)(long)smp_processor_id());
2074         register_cpu_notifier(&perf_cpu_nb);
2075
2076         return 0;
2077 }
2078 early_initcall(perf_counter_init);
2079
2080 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2081 {
2082         return sprintf(buf, "%d\n", perf_reserved_percpu);
2083 }
2084
2085 static ssize_t
2086 perf_set_reserve_percpu(struct sysdev_class *class,
2087                         const char *buf,
2088                         size_t count)
2089 {
2090         struct perf_cpu_context *cpuctx;
2091         unsigned long val;
2092         int err, cpu, mpt;
2093
2094         err = strict_strtoul(buf, 10, &val);
2095         if (err)
2096                 return err;
2097         if (val > perf_max_counters)
2098                 return -EINVAL;
2099
2100         mutex_lock(&perf_resource_mutex);
2101         perf_reserved_percpu = val;
2102         for_each_online_cpu(cpu) {
2103                 cpuctx = &per_cpu(perf_cpu_context, cpu);
2104                 spin_lock_irq(&cpuctx->ctx.lock);
2105                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2106                           perf_max_counters - perf_reserved_percpu);
2107                 cpuctx->max_pertask = mpt;
2108                 spin_unlock_irq(&cpuctx->ctx.lock);
2109         }
2110         mutex_unlock(&perf_resource_mutex);
2111
2112         return count;
2113 }
2114
2115 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2116 {
2117         return sprintf(buf, "%d\n", perf_overcommit);
2118 }
2119
2120 static ssize_t
2121 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2122 {
2123         unsigned long val;
2124         int err;
2125
2126         err = strict_strtoul(buf, 10, &val);
2127         if (err)
2128                 return err;
2129         if (val > 1)
2130                 return -EINVAL;
2131
2132         mutex_lock(&perf_resource_mutex);
2133         perf_overcommit = val;
2134         mutex_unlock(&perf_resource_mutex);
2135
2136         return count;
2137 }
2138
2139 static SYSDEV_CLASS_ATTR(
2140                                 reserve_percpu,
2141                                 0644,
2142                                 perf_show_reserve_percpu,
2143                                 perf_set_reserve_percpu
2144                         );
2145
2146 static SYSDEV_CLASS_ATTR(
2147                                 overcommit,
2148                                 0644,
2149                                 perf_show_overcommit,
2150                                 perf_set_overcommit
2151                         );
2152
2153 static struct attribute *perfclass_attrs[] = {
2154         &attr_reserve_percpu.attr,
2155         &attr_overcommit.attr,
2156         NULL
2157 };
2158
2159 static struct attribute_group perfclass_attr_group = {
2160         .attrs                  = perfclass_attrs,
2161         .name                   = "perf_counters",
2162 };
2163
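/*
 * Once the group is registered below, these knobs typically appear as
 * /sys/devices/system/cpu/perf_counters/{reserve_percpu,overcommit}
 * (the exact path depends on where the cpu sysdev class is exposed).
 * For example, an administrator could reserve two counters on each CPU
 * for per-CPU (system-wide) use with:
 *
 *	echo 2 > /sys/devices/system/cpu/perf_counters/reserve_percpu
 *
 * which lowers cpuctx->max_pertask accordingly on every online CPU.
 */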
2164 static int __init perf_counter_sysfs_init(void)
2165 {
2166         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2167                                   &perfclass_attr_group);
2168 }
2169 device_initcall(perf_counter_sysfs_init);