perf: Drop sample rate when sampling is too slow

author Dave Hansen <dave.hansen@linux.intel.com>

Fri, 21 Jun 2013 15:51:36 +0000 (08:51 -0700)

committer Ingo Molnar <mingo@kernel.org>

Sun, 23 Jun 2013 09:52:57 +0000 (11:52 +0200)
author Dave Hansen <dave.hansen@linux.intel.com>
Fri, 21 Jun 2013 15:51:36 +0000 (08:51 -0700)
committer Ingo Molnar <mingo@kernel.org>
Sun, 23 Jun 2013 09:52:57 +0000 (11:52 +0200)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt

index bcff3f9de5503d251b2fdf86a0cc585a34c6ba6f..ab7d16efa96bf78bbfdddc029eb40b82da1e03fe 100644 (file)
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -427,6 +427,32 @@ This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
  
  ==============================================================
  
+perf_cpu_time_max_percent:
+
+Hints to the kernel how much CPU time it should be allowed to
+use to handle perf sampling events.  If the perf subsystem
+is informed that its samples are exceeding this limit, it
+will drop its sampling frequency to attempt to reduce its CPU
+usage.
+
+Some perf sampling happens in NMIs.  If these samples
+unexpectedly take too long to execute, the NMIs can become
+stacked up next to each other so much that nothing else is
+allowed to execute.
+
+0: disable the mechanism.  Do not monitor or correct perf's
+   sampling rate no matter how CPU time it takes.
+
+1-100: attempt to throttle perf's sample rate to this
+   percentage of CPU.  Note: the kernel calculates an
+   "expected" length of each sample event.  100 here means
+   100% of that expected length.  Even if this is set to
+   100, you may still see sample throttling if this
+   length is exceeded.  Set to 0 if you truly do not care
+   how much CPU is consumed.
+
+==============================================================
+
  
  pid_max:
  
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c

index ab3395295224fb9a436bc8d170589dc064d87ac7..afc2413ba00c53c58e3714d0f31b325e05a973db 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1252,10 +1252,20 @@ void perf_events_lapic_init(void)
  static int __kprobes
  perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
  {
+       int ret;
+       u64 start_clock;
+       u64 finish_clock;
+
         if (!atomic_read(&active_events))
                 return NMI_DONE;
  
-       return x86_pmu.handle_irq(regs);
+       start_clock = local_clock();
+       ret = x86_pmu.handle_irq(regs);
+       finish_clock = local_clock();
+
+       perf_sample_event_took(finish_clock - start_clock);
+
+       return ret;
  }
  
  struct event_constraint emptyconstraint;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index 056f93a7990f856b2daf1860b7530e57053d98f3..50b3efd14d29286cb580ce40e73697508b1943f5 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -706,10 +706,17 @@ static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64
  extern int sysctl_perf_event_paranoid;
  extern int sysctl_perf_event_mlock;
  extern int sysctl_perf_event_sample_rate;
+extern int sysctl_perf_cpu_time_max_percent;
+
+extern void perf_sample_event_took(u64 sample_len_ns);
  
  extern int perf_proc_update_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp,
                 loff_t *ppos);
+extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+               void __user *buffer, size_t *lenp,
+               loff_t *ppos);
+
  
  static inline bool perf_paranoid_tracepoint_raw(void)
  {
diff --git a/kernel/events/core.c b/kernel/events/core.c

index 9c8920783317b641609da2670570b2052c7a3fb5..1db3af93370410ed8538e04b785f7cb2a3e4b33c 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,26 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
  /*
   * max perf event sample rate
   */
-#define DEFAULT_MAX_SAMPLE_RATE 100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
-       DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_MAX_SAMPLE_RATE                100000
+#define DEFAULT_SAMPLE_PERIOD_NS       (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT   25
+
+int sysctl_perf_event_sample_rate __read_mostly        = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly  = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static atomic_t perf_sample_allowed_ns __read_mostly =
+       ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+
+void update_perf_cpu_limits(void)
+{
+       u64 tmp = perf_sample_period_ns;
+
+       tmp *= sysctl_perf_cpu_time_max_percent;
+       tmp = do_div(tmp, 100);
+       atomic_set(&perf_sample_allowed_ns, tmp);
+}
  
  static int perf_rotate_context(struct perf_cpu_context *cpuctx);
  
@@ -182,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
                 return ret;
  
         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+       update_perf_cpu_limits();
  
         return 0;
  }
  
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+                               void __user *buffer, size_t *lenp,
+                               loff_t *ppos)
+{
+       int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+       if (ret || !write)
+               return ret;
+
+       update_perf_cpu_limits();
+
+       return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done.  This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+DEFINE_PER_CPU(u64, running_sample_length);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+       u64 avg_local_sample_len;
+       u64 local_samples_len = __get_cpu_var(running_sample_length);
+
+       if (atomic_read(&perf_sample_allowed_ns) == 0)
+               return;
+
+       /* decay the counter by 1 average sample */
+       local_samples_len = __get_cpu_var(running_sample_length);
+       local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+       local_samples_len += sample_len_ns;
+       __get_cpu_var(running_sample_length) = local_samples_len;
+
+       /*
+        * note: this will be biased artifically low until we have
+        * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
+        * from having to maintain a count.
+        */
+       avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+       if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+               return;
+
+       if (max_samples_per_tick <= 1)
+               return;
+
+       max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+       sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+       printk_ratelimited(KERN_WARNING
+                       "perf samples too long (%lld > %d), lowering "
+                       "kernel.perf_event_max_sample_rate to %d\n",
+                       avg_local_sample_len,
+                       atomic_read(&perf_sample_allowed_ns),
+                       sysctl_perf_event_sample_rate);
+
+       update_perf_cpu_limits();
+}
+
  static atomic64_t perf_event_id;
  
  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index b0a1f99907f376f9a4dcd602c9044557bc3d0ea6..4ce13c3cedb97a8bb4eb402d02bc10f79c35245e 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1043,6 +1043,15 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = perf_proc_update_handler,
         },
+       {
+               .procname       = "perf_cpu_time_max_percent",
+               .data           = &sysctl_perf_cpu_time_max_percent,
+               .maxlen         = sizeof(sysctl_perf_cpu_time_max_percent),
+               .mode           = 0644,
+               .proc_handler   = perf_cpu_time_max_percent_handler,
+               .extra1         = &zero,
+               .extra2         = &one_hundred,
+       },
  #endif
  #ifdef CONFIG_KMEMCHECK
         {
author	Dave Hansen <dave.hansen@linux.intel.com>
	Fri, 21 Jun 2013 15:51:36 +0000 (08:51 -0700)
committer	Ingo Molnar <mingo@kernel.org>
	Sun, 23 Jun 2013 09:52:57 +0000 (11:52 +0200)
Documentation/sysctl/kernel.txt		patch \| blob \| history
arch/x86/kernel/cpu/perf_event.c		patch \| blob \| history
include/linux/perf_event.h		patch \| blob \| history
kernel/events/core.c		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history