perf: Fix hypervisor branch sampling permission check
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6b41c1899a8b00acc0ca48ae30b0e8dfbdd2ad9d..aca95bce34c89f6dde0f922bd8b53d5848133de3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -170,6 +170,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly =
        DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+
 int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
@@ -196,9 +198,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-                              struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void)       { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -658,6 +657,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 }
 #endif
 
+/*
+ * set the default to be dependent on the timer tick,
+ * just like the original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disabled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+       struct perf_cpu_context *cpuctx;
+       enum hrtimer_restart ret = HRTIMER_NORESTART;
+       int rotations = 0;
+
+       WARN_ON(!irqs_disabled());
+
+       cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+       rotations = perf_rotate_context(cpuctx);
+
+       /*
+        * arm timer if needed
+        */
+       if (rotations) {
+               hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+               ret = HRTIMER_RESTART;
+       }
+
+       return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+       struct perf_cpu_context *cpuctx;
+       struct pmu *pmu;
+       unsigned long flags;
+
+       if (WARN_ON(cpu != smp_processor_id()))
+               return;
+
+       local_irq_save(flags);
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+               if (pmu->task_ctx_nr == perf_sw_context)
+                       continue;
+
+               hrtimer_cancel(&cpuctx->hrtimer);
+       }
+
+       rcu_read_unlock();
+
+       local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct pmu *pmu = cpuctx->ctx.pmu;
+       int timer;
+
+       /* no multiplexing needed for SW PMU */
+       if (pmu->task_ctx_nr == perf_sw_context)
+               return;
+
+       /*
+        * check that the default is sane; if not set, force it to
+        * the default interval (1/tick)
+        */
+       timer = pmu->hrtimer_interval_ms;
+       if (timer < 1)
+               timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+       hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct pmu *pmu = cpuctx->ctx.pmu;
+
+       /* not for SW PMU */
+       if (pmu->task_ctx_nr == perf_sw_context)
+               return;
+
+       if (hrtimer_active(hr))
+               return;
+
+       if (!hrtimer_callback_running(hr))
+               __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+                                        0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
 void perf_pmu_disable(struct pmu *pmu)
 {
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1506,6 +1605,7 @@ group_sched_in(struct perf_event *group_event,
 
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
+               perf_cpu_hrtimer_restart(cpuctx);
                return -EAGAIN;
        }
 
@@ -1552,6 +1652,8 @@ group_error:
 
        pmu->cancel_txn(pmu);
 
+       perf_cpu_hrtimer_restart(cpuctx);
+
        return -EAGAIN;
 }
 
@@ -1807,8 +1909,10 @@ static int __perf_event_enable(void *info)
                 * If this event can't go on and it's part of a
                 * group, then the whole group has to come off.
                 */
-               if (leader != event)
+               if (leader != event) {
                        group_sched_out(leader, cpuctx, ctx);
+                       perf_cpu_hrtimer_restart(cpuctx);
+               }
                if (leader->attr.pinned) {
                        update_group_times(leader);
                        leader->state = PERF_EVENT_STATE_ERROR;
@@ -2555,7 +2659,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
  * because they're strictly cpu affine and rotate_start is called with IRQs
  * disabled, while rotate_context is called from IRQ context.
  */
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
        struct perf_event_context *ctx = NULL;
        int rotate = 0, remove = 1;
@@ -2594,6 +2698,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 done:
        if (remove)
                list_del_init(&cpuctx->rotation_list);
+
+       return rotate;
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -2625,10 +2731,6 @@ void perf_event_task_tick(void)
                ctx = cpuctx->task_ctx;
                if (ctx)
                        perf_adjust_freq_unthr_context(ctx, throttled);
-
-               if (cpuctx->jiffies_interval == 1 ||
-                               !(jiffies % cpuctx->jiffies_interval))
-                       perf_rotate_context(cpuctx);
        }
 }
 
@@ -2918,6 +3020,7 @@ static void free_event_rcu(struct rcu_head *head)
 }
 
 static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +3045,30 @@ static void free_event(struct perf_event *event)
                if (has_branch_stack(event)) {
                        static_key_slow_dec_deferred(&perf_sched_events);
                        /* is system-wide event */
-                       if (!(event->attach_state & PERF_ATTACH_TASK))
+                       if (!(event->attach_state & PERF_ATTACH_TASK)) {
                                atomic_dec(&per_cpu(perf_branch_stack_events,
                                                    event->cpu));
+                       }
                }
        }
 
        if (event->rb) {
-               ring_buffer_put(event->rb);
-               event->rb = NULL;
+               struct ring_buffer *rb;
+
+               /*
+                * Can happen when we close an event with re-directed output.
+                *
+                * Since we have a 0 refcount, perf_mmap_close() will skip
+                * over us; possibly making our ring_buffer_put() the last.
+                */
+               mutex_lock(&event->mmap_mutex);
+               rb = event->rb;
+               if (rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* could be last */
+               }
+               mutex_unlock(&event->mmap_mutex);
        }
 
        if (is_cgroup_event(event))
@@ -3188,30 +3306,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
        unsigned int events = POLL_HUP;
 
        /*
-        * Race between perf_event_set_output() and perf_poll(): perf_poll()
-        * grabs the rb reference but perf_event_set_output() overrides it.
-        * Here is the timeline for two threads T1, T2:
-        * t0: T1, rb = rcu_dereference(event->rb)
-        * t1: T2, old_rb = event->rb
-        * t2: T2, event->rb = new rb
-        * t3: T2, ring_buffer_detach(old_rb)
-        * t4: T1, ring_buffer_attach(rb1)
-        * t5: T1, poll_wait(event->waitq)
-        *
-        * To avoid this problem, we grab mmap_mutex in perf_poll()
-        * thereby ensuring that the assignment of the new ring buffer
-        * and the detachment of the old buffer appear atomic to perf_poll()
+        * Pin the event->rb by taking event->mmap_mutex; otherwise
+        * perf_event_set_output() can swizzle our rb and make us miss wakeups.
         */
        mutex_lock(&event->mmap_mutex);
-
-       rcu_read_lock();
-       rb = rcu_dereference(event->rb);
-       if (rb) {
-               ring_buffer_attach(event, rb);
+       rb = event->rb;
+       if (rb)
                events = atomic_xchg(&rb->poll, 0);
-       }
-       rcu_read_unlock();
-
        mutex_unlock(&event->mmap_mutex);
 
        poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3622,12 @@ static void ring_buffer_attach(struct perf_event *event,
                return;
 
        spin_lock_irqsave(&rb->event_lock, flags);
-       if (!list_empty(&event->rb_entry))
-               goto unlock;
-
-       list_add(&event->rb_entry, &rb->event_list);
-unlock:
+       if (list_empty(&event->rb_entry))
+               list_add(&event->rb_entry, &rb->event_list);
        spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-                              struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
        unsigned long flags;
 
@@ -3549,13 +3646,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
        rcu_read_lock();
        rb = rcu_dereference(event->rb);
-       if (!rb)
-               goto unlock;
-
-       list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-               wake_up_all(&event->waitq);
-
-unlock:
+       if (rb) {
+               list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+                       wake_up_all(&event->waitq);
+       }
        rcu_read_unlock();
 }
 
@@ -3584,18 +3678,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 
 static void ring_buffer_put(struct ring_buffer *rb)
 {
-       struct perf_event *event, *n;
-       unsigned long flags;
-
        if (!atomic_dec_and_test(&rb->refcount))
                return;
 
-       spin_lock_irqsave(&rb->event_lock, flags);
-       list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-               list_del_init(&event->rb_entry);
-               wake_up_all(&event->waitq);
-       }
-       spin_unlock_irqrestore(&rb->event_lock, flags);
+       WARN_ON_ONCE(!list_empty(&rb->event_list));
 
        call_rcu(&rb->rcu_head, rb_free_rcu);
 }
@@ -3605,26 +3691,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
        struct perf_event *event = vma->vm_file->private_data;
 
        atomic_inc(&event->mmap_count);
+       atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
        struct perf_event *event = vma->vm_file->private_data;
 
-       if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-               unsigned long size = perf_data_size(event->rb);
-               struct user_struct *user = event->mmap_user;
-               struct ring_buffer *rb = event->rb;
+       struct ring_buffer *rb = event->rb;
+       struct user_struct *mmap_user = rb->mmap_user;
+       int mmap_locked = rb->mmap_locked;
+       unsigned long size = perf_data_size(rb);
+
+       atomic_dec(&rb->mmap_count);
+
+       if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+               return;
+
+       /* Detach current event from the buffer. */
+       rcu_assign_pointer(event->rb, NULL);
+       ring_buffer_detach(event, rb);
+       mutex_unlock(&event->mmap_mutex);
+
+       /* If there are still other mmap()s of this buffer, we're done. */
+       if (atomic_read(&rb->mmap_count)) {
+               ring_buffer_put(rb); /* can't be last */
+               return;
+       }
+
+       /*
+        * No other mmap()s, detach from all other events that might redirect
+        * into the now unreachable buffer. Somewhat complicated by the
+        * fact that rb::event_lock otherwise nests inside mmap_mutex.
+        */
+again:
+       rcu_read_lock();
+       list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+               if (!atomic_long_inc_not_zero(&event->refcount)) {
+                       /*
+                        * This event is en-route to free_event() which will
+                        * detach it and remove it from the list.
+                        */
+                       continue;
+               }
+               rcu_read_unlock();
 
-               atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-               vma->vm_mm->pinned_vm -= event->mmap_locked;
-               rcu_assign_pointer(event->rb, NULL);
-               ring_buffer_detach(event, rb);
+               mutex_lock(&event->mmap_mutex);
+               /*
+                * Check we didn't race with perf_event_set_output() which can
+                * swizzle the rb from under us while we were waiting to
+                * acquire mmap_mutex.
+                *
+                * If we find a different rb, ignore this event; the next
+                * iteration will no longer find it on the list. We still have
+                * to restart the iteration to make sure we're not now
+                * iterating the wrong list.
+                */
+               if (event->rb == rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* can't be last, we still have one */
+               }
                mutex_unlock(&event->mmap_mutex);
+               put_event(event);
 
-               ring_buffer_put(rb);
-               free_uid(user);
+               /*
+                * Restart the iteration; either we're on the wrong list or
+                * we've destroyed its integrity by doing a deletion.
+                */
+               goto again;
        }
+       rcu_read_unlock();
+
+       /*
+        * It could be that there are still a few 0-ref events on the list;
+        * they'll get cleaned up by free_event() -- they'll also still have
+        * their ref on the rb and will free it whenever they are done with it.
+        *
+        * Aside from that, this buffer is 'fully' detached and unmapped,
+        * undo the VM accounting.
+        */
+
+       atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+       vma->vm_mm->pinned_vm -= mmap_locked;
+       free_uid(mmap_user);
+
+       ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3674,12 +3834,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                return -EINVAL;
 
        WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
        mutex_lock(&event->mmap_mutex);
        if (event->rb) {
-               if (event->rb->nr_pages == nr_pages)
-                       atomic_inc(&event->rb->refcount);
-               else
+               if (event->rb->nr_pages != nr_pages) {
                        ret = -EINVAL;
+                       goto unlock;
+               }
+
+               if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+                       /*
+                        * Raced against perf_mmap_close() through
+                        * perf_event_set_output(). Try again, hope for better
+                        * luck.
+                        */
+                       mutex_unlock(&event->mmap_mutex);
+                       goto again;
+               }
+
                goto unlock;
        }
 
@@ -3720,12 +3892,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                ret = -ENOMEM;
                goto unlock;
        }
-       rcu_assign_pointer(event->rb, rb);
+
+       atomic_set(&rb->mmap_count, 1);
+       rb->mmap_locked = extra;
+       rb->mmap_user = get_current_user();
 
        atomic_long_add(user_extra, &user->locked_vm);
-       event->mmap_locked = extra;
-       event->mmap_user = get_current_user();
-       vma->vm_mm->pinned_vm += event->mmap_locked;
+       vma->vm_mm->pinned_vm += extra;
+
+       ring_buffer_attach(event, rb);
+       rcu_assign_pointer(event->rb, rb);
 
        perf_event_update_userpage(event);
 
@@ -3734,7 +3910,11 @@ unlock:
                atomic_inc(&event->mmap_count);
        mutex_unlock(&event->mmap_mutex);
 
-       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+       /*
+        * Since pinned accounting is per vm we cannot allow fork() to copy our
+        * vma.
+        */
+       vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &perf_mmap_vmops;
 
        return ret;
@@ -4394,6 +4574,64 @@ perf_event_read_event(struct perf_event *event,
        perf_output_end(&handle);
 }
 
+typedef int  (perf_event_aux_match_cb)(struct perf_event *event, void *data);
+typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+
+static void
+perf_event_aux_ctx(struct perf_event_context *ctx,
+                  perf_event_aux_match_cb match,
+                  perf_event_aux_output_cb output,
+                  void *data)
+{
+       struct perf_event *event;
+
+       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+               if (event->state < PERF_EVENT_STATE_INACTIVE)
+                       continue;
+               if (!event_filter_match(event))
+                       continue;
+               if (match(event, data))
+                       output(event, data);
+       }
+}
+
+static void
+perf_event_aux(perf_event_aux_match_cb match,
+              perf_event_aux_output_cb output,
+              void *data,
+              struct perf_event_context *task_ctx)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_event_context *ctx;
+       struct pmu *pmu;
+       int ctxn;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+               if (cpuctx->unique_pmu != pmu)
+                       goto next;
+               perf_event_aux_ctx(&cpuctx->ctx, match, output, data);
+               if (task_ctx)
+                       goto next;
+               ctxn = pmu->task_ctx_nr;
+               if (ctxn < 0)
+                       goto next;
+               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               if (ctx)
+                       perf_event_aux_ctx(ctx, match, output, data);
+next:
+               put_cpu_ptr(pmu->pmu_cpu_context);
+       }
+
+       if (task_ctx) {
+               preempt_disable();
+               perf_event_aux_ctx(task_ctx, match, output, data);
+               preempt_enable();
+       }
+       rcu_read_unlock();
+}
+
 /*
  * task tracking -- fork/exit
  *
@@ -4416,8 +4654,9 @@ struct perf_task_event {
 };
 
 static void perf_event_task_output(struct perf_event *event,
-                                    struct perf_task_event *task_event)
+                                  void *data)
 {
+       struct perf_task_event *task_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct task_struct *task = task_event->task;
@@ -4445,62 +4684,11 @@ out:
        task_event->event_id.header.size = size;
 }
 
-static int perf_event_task_match(struct perf_event *event)
-{
-       if (event->state < PERF_EVENT_STATE_INACTIVE)
-               return 0;
-
-       if (!event_filter_match(event))
-               return 0;
-
-       if (event->attr.comm || event->attr.mmap ||
-           event->attr.mmap_data || event->attr.task)
-               return 1;
-
-       return 0;
-}
-
-static void perf_event_task_ctx(struct perf_event_context *ctx,
-                                 struct perf_task_event *task_event)
+static int perf_event_task_match(struct perf_event *event,
+                                void *data __maybe_unused)
 {
-       struct perf_event *event;
-
-       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-               if (perf_event_task_match(event))
-                       perf_event_task_output(event, task_event);
-       }
-}
-
-static void perf_event_task_event(struct perf_task_event *task_event)
-{
-       struct perf_cpu_context *cpuctx;
-       struct perf_event_context *ctx;
-       struct pmu *pmu;
-       int ctxn;
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->unique_pmu != pmu)
-                       goto next;
-               perf_event_task_ctx(&cpuctx->ctx, task_event);
-
-               ctx = task_event->task_ctx;
-               if (!ctx) {
-                       ctxn = pmu->task_ctx_nr;
-                       if (ctxn < 0)
-                               goto next;
-                       ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-                       if (ctx)
-                               perf_event_task_ctx(ctx, task_event);
-               }
-next:
-               put_cpu_ptr(pmu->pmu_cpu_context);
-       }
-       if (task_event->task_ctx)
-               perf_event_task_ctx(task_event->task_ctx, task_event);
-
-       rcu_read_unlock();
+       return event->attr.comm || event->attr.mmap ||
+              event->attr.mmap_data || event->attr.task;
 }
 
 static void perf_event_task(struct task_struct *task,
@@ -4531,7 +4719,10 @@ static void perf_event_task(struct task_struct *task,
                },
        };
 
-       perf_event_task_event(&task_event);
+       perf_event_aux(perf_event_task_match,
+                      perf_event_task_output,
+                      &task_event,
+                      task_ctx);
 }
 
 void perf_event_fork(struct task_struct *task)
@@ -4557,8 +4748,9 @@ struct perf_comm_event {
 };
 
 static void perf_event_comm_output(struct perf_event *event,
-                                    struct perf_comm_event *comm_event)
+                                  void *data)
 {
+       struct perf_comm_event *comm_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = comm_event->event_id.header.size;
@@ -4585,39 +4777,16 @@ out:
        comm_event->event_id.header.size = size;
 }
 
-static int perf_event_comm_match(struct perf_event *event)
+static int perf_event_comm_match(struct perf_event *event,
+                                void *data __maybe_unused)
 {
-       if (event->state < PERF_EVENT_STATE_INACTIVE)
-               return 0;
-
-       if (!event_filter_match(event))
-               return 0;
-
-       if (event->attr.comm)
-               return 1;
-
-       return 0;
-}
-
-static void perf_event_comm_ctx(struct perf_event_context *ctx,
-                                 struct perf_comm_event *comm_event)
-{
-       struct perf_event *event;
-
-       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-               if (perf_event_comm_match(event))
-                       perf_event_comm_output(event, comm_event);
-       }
+       return event->attr.comm;
 }
 
 static void perf_event_comm_event(struct perf_comm_event *comm_event)
 {
-       struct perf_cpu_context *cpuctx;
-       struct perf_event_context *ctx;
        char comm[TASK_COMM_LEN];
        unsigned int size;
-       struct pmu *pmu;
-       int ctxn;
 
        memset(comm, 0, sizeof(comm));
        strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -4627,24 +4796,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
        comm_event->comm_size = size;
 
        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
-       rcu_read_lock();
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->unique_pmu != pmu)
-                       goto next;
-               perf_event_comm_ctx(&cpuctx->ctx, comm_event);
 
-               ctxn = pmu->task_ctx_nr;
-               if (ctxn < 0)
-                       goto next;
-
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (ctx)
-                       perf_event_comm_ctx(ctx, comm_event);
-next:
-               put_cpu_ptr(pmu->pmu_cpu_context);
-       }
-       rcu_read_unlock();
+       perf_event_aux(perf_event_comm_match,
+                      perf_event_comm_output,
+                      comm_event,
+                      NULL);
 }
 
 void perf_event_comm(struct task_struct *task)
@@ -4706,8 +4862,9 @@ struct perf_mmap_event {
 };
 
 static void perf_event_mmap_output(struct perf_event *event,
-                                    struct perf_mmap_event *mmap_event)
+                                  void *data)
 {
+       struct perf_mmap_event *mmap_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = mmap_event->event_id.header.size;
@@ -4734,46 +4891,24 @@ out:
 }
 
 static int perf_event_mmap_match(struct perf_event *event,
-                                  struct perf_mmap_event *mmap_event,
-                                  int executable)
+                                void *data)
 {
-       if (event->state < PERF_EVENT_STATE_INACTIVE)
-               return 0;
-
-       if (!event_filter_match(event))
-               return 0;
-
-       if ((!executable && event->attr.mmap_data) ||
-           (executable && event->attr.mmap))
-               return 1;
-
-       return 0;
-}
-
-static void perf_event_mmap_ctx(struct perf_event_context *ctx,
-                                 struct perf_mmap_event *mmap_event,
-                                 int executable)
-{
-       struct perf_event *event;
+       struct perf_mmap_event *mmap_event = data;
+       struct vm_area_struct *vma = mmap_event->vma;
+       int executable = vma->vm_flags & VM_EXEC;
 
-       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-               if (perf_event_mmap_match(event, mmap_event, executable))
-                       perf_event_mmap_output(event, mmap_event);
-       }
+       return (!executable && event->attr.mmap_data) ||
+              (executable && event->attr.mmap);
 }
 
 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 {
-       struct perf_cpu_context *cpuctx;
-       struct perf_event_context *ctx;
        struct vm_area_struct *vma = mmap_event->vma;
        struct file *file = vma->vm_file;
        unsigned int size;
        char tmp[16];
        char *buf = NULL;
        const char *name;
-       struct pmu *pmu;
-       int ctxn;
 
        memset(tmp, 0, sizeof(tmp));
 
@@ -4829,27 +4964,10 @@ got_name:
 
        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
-       rcu_read_lock();
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->unique_pmu != pmu)
-                       goto next;
-               perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
-                                       vma->vm_flags & VM_EXEC);
-
-               ctxn = pmu->task_ctx_nr;
-               if (ctxn < 0)
-                       goto next;
-
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (ctx) {
-                       perf_event_mmap_ctx(ctx, mmap_event,
-                                       vma->vm_flags & VM_EXEC);
-               }
-next:
-               put_cpu_ptr(pmu->pmu_cpu_context);
-       }
-       rcu_read_unlock();
+       perf_event_aux(perf_event_mmap_match,
+                      perf_event_mmap_output,
+                      mmap_event,
+                      NULL);
 
        kfree(buf);
 }
@@ -5023,7 +5141,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
  * sign as trigger.
  */
 
-static u64 perf_swevent_set_period(struct perf_event *event)
+u64 perf_swevent_set_period(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
        u64 period = hwc->last_period;
@@ -5966,9 +6084,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
 }
 
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+                               struct device_attribute *attr,
+                               char *page)
+{
+       struct pmu *pmu = dev_get_drvdata(dev);
+
+       return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t count)
+{
+       struct pmu *pmu = dev_get_drvdata(dev);
+       int timer, cpu, ret;
+
+       ret = kstrtoint(buf, 0, &timer);
+       if (ret)
+               return ret;
+
+       if (timer < 1)
+               return -EINVAL;
+
+       /* same value, nothing to do */
+       if (timer == pmu->hrtimer_interval_ms)
+               return count;
+
+       pmu->hrtimer_interval_ms = timer;
+
+       /* update all cpuctx for this PMU */
+       for_each_possible_cpu(cpu) {
+               struct perf_cpu_context *cpuctx;
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+               if (hrtimer_active(&cpuctx->hrtimer))
+                       hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+       }
+
+       return count;
+}
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
 static struct device_attribute pmu_dev_attrs[] = {
-       __ATTR_RO(type),
-       __ATTR_NULL,
+       __ATTR_RO(type),
+       __ATTR_RW(perf_event_mux_interval_ms),
+       __ATTR_NULL,
 };
 
 static int pmu_bus_running;
@@ -6063,7 +6228,9 @@ skip_type:
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->ctx.type = cpu_context;
                cpuctx->ctx.pmu = pmu;
-               cpuctx->jiffies_interval = 1;
+
+               __perf_cpu_hrtimer_init(cpuctx, cpu);
+
                INIT_LIST_HEAD(&cpuctx->rotation_list);
                cpuctx->unique_pmu = pmu;
        }
@@ -6389,11 +6556,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
                if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
                        return -EINVAL;
 
-               /* kernel level capture: check permissions */
-               if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
-                   && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-                       return -EACCES;
-
                /* propagate priv level, when not set for branch */
                if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
 
@@ -6411,6 +6573,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
                         */
                        attr->branch_sample_type = mask;
                }
+               /* privileged levels capture (kernel, hv): check permissions */
+               if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+                   && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
        }
 
        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -6474,6 +6640,8 @@ set:
        if (atomic_read(&event->mmap_count))
                goto unlock;
 
+       old_rb = event->rb;
+
        if (output_event) {
                /* get the rb we want to redirect to */
                rb = ring_buffer_get(output_event);
@@ -6481,16 +6649,28 @@ set:
                        goto unlock;
        }
 
-       old_rb = event->rb;
-       rcu_assign_pointer(event->rb, rb);
        if (old_rb)
                ring_buffer_detach(event, old_rb);
+
+       if (rb)
+               ring_buffer_attach(event, rb);
+
+       rcu_assign_pointer(event->rb, rb);
+
+       if (old_rb) {
+               ring_buffer_put(old_rb);
+               /*
+                * We detached the old rb before setting and attaching the new
+                * one, so we could have missed a wakeup.
+                * Provide it now.
+                */
+               wake_up_all(&event->waitq);
+       }
+
        ret = 0;
 unlock:
        mutex_unlock(&event->mmap_mutex);
 
-       if (old_rb)
-               ring_buffer_put(old_rb);
 out:
        return ret;
 }
@@ -7449,7 +7629,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
        case CPU_DOWN_PREPARE:
                perf_event_exit_cpu(cpu);
                break;
-
        default:
                break;
        }