rcu: Break call_rcu() deadlock involving scheduler and perf

author Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Fri, 4 Oct 2013 21:33:34 +0000 (14:33 -0700)

committer Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Tue, 3 Dec 2013 18:10:18 +0000 (10:10 -0800)
author Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Fri, 4 Oct 2013 21:33:34 +0000 (14:33 -0700)
committer Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tue, 3 Dec 2013 18:10:18 +0000 (10:10 -0800)
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt

index f3778f8952da1b9473897a4a2f392f049b9c544c..b8c3c813ea571d0405c7bf8be3be80f8e0668707 100644 (file)
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -396,14 +396,14 @@ o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
  
  The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
  
-  0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903
-  1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113
-  2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889
-  3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469
-  4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042
-  5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422
-  6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699
-  7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147
+  0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 ndw=0
+  1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 ndw=0
+  2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 ndw=0
+  3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 ndw=0
+  4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 ndw=0
+  5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 ndw=0
+  6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 ndw=0
+  7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 ndw=0
  
  The fields are as follows:
  
@@ -432,6 +432,10 @@ o  "gpc" is the number of times that an old grace period had
  o      "gps" is the number of times that a new grace period had started,
         but this CPU was not yet aware of it.
  
+o      "ndw" is the number of times that __rcu_pending() found that a
+       wakeup of an rcuo callback-offload kthread, deferred in order to
+       avoid deadlock, was still waiting to be carried out.
+
  o      "nn" is the number of times that this CPU needed nothing.
  
  
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c

index abef9c358d47a45f2a0ba8edd5878c68ba8a385c..264f0284c0bd90a1655e81b286cc50b386fa026c 100644 (file)
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -369,6 +369,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
  static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
                                 bool user)
  {
+       struct rcu_state *rsp;
+       struct rcu_data *rdp;
+
         trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
         if (!user && !is_idle_task(current)) {
                 struct task_struct *idle __maybe_unused =
@@ -380,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
                           current->pid, current->comm,
                           idle->pid, idle->comm); /* must be idle task! */
         }
+       for_each_rcu_flavor(rsp) {
+               rdp = this_cpu_ptr(rsp->rda);
+               do_nocb_deferred_wakeup(rdp);
+       }
         rcu_prepare_for_idle(smp_processor_id());
         /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
         smp_mb__before_atomic_inc();  /* See above. */
@@ -1928,13 +1935,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
   * Adopt the RCU callbacks from the specified rcu_state structure's
   * orphanage.  The caller must hold the ->orphan_lock.
   */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
  {
         int i;
         struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
  
         /* No-CBs CPUs are handled specially. */
-       if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
+       if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
                 return;
  
         /* Do the accounting first. */
@@ -2013,7 +2020,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
  
         /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
         rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
-       rcu_adopt_orphan_cbs(rsp);
+       rcu_adopt_orphan_cbs(rsp, flags);
  
         /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
         mask = rdp->grpmask;    /* rnp->grplo is constant. */
@@ -2330,6 +2337,9 @@ __rcu_process_callbacks(struct rcu_state *rsp)
         /* If there are callbacks ready, invoke them. */
         if (cpu_has_callbacks_ready_to_invoke(rdp))
                 invoke_rcu_callbacks(rsp, rdp);
+
+       /* Do any needed deferred wakeups of rcuo kthreads. */
+       do_nocb_deferred_wakeup(rdp);
  }
  
  /*
@@ -2464,7 +2474,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
  
                 if (cpu != -1)
                         rdp = per_cpu_ptr(rsp->rda, cpu);
-               offline = !__call_rcu_nocb(rdp, head, lazy);
+               offline = !__call_rcu_nocb(rdp, head, lazy, flags);
                 WARN_ON_ONCE(offline);
                 /* _call_rcu() is illegal on offline CPU; leak the callback. */
                 local_irq_restore(flags);
@@ -2817,6 +2827,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
                 return 1;
         }
  
+       /* Does this CPU need a deferred NOCB wakeup? */
+       if (rcu_nocb_need_deferred_wakeup(rdp)) {
+               rdp->n_rp_nocb_defer_wakeup++;
+               return 1;
+       }
+
         /* nothing to do */
         rdp->n_rp_need_nothing++;
         return 0;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h

index 8e34d8674a4e6e438d4d2acaaf260ee507b83d5d..a87adfc2916b9e3a9881fb1234ae9f444f5faa89 100644 (file)
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -317,6 +317,7 @@ struct rcu_data {
         unsigned long n_rp_cpu_needs_gp;
         unsigned long n_rp_gp_completed;
         unsigned long n_rp_gp_started;
+       unsigned long n_rp_nocb_defer_wakeup;
         unsigned long n_rp_need_nothing;
  
         /* 6) _rcu_barrier() and OOM callbacks. */
@@ -335,6 +336,7 @@ struct rcu_data {
         int nocb_p_count_lazy;          /*  (approximate). */
         wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
         struct task_struct *nocb_kthread;
+       bool nocb_defer_wakeup;         /* Defer wakeup of nocb_kthread. */
  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
  
         /* 8) RCU CPU stall data. */
@@ -550,9 +552,12 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
  static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
  static void rcu_init_one_nocb(struct rcu_node *rnp);
  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
-                           bool lazy);
+                           bool lazy, unsigned long flags);
  static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
-                                     struct rcu_data *rdp);
+                                     struct rcu_data *rdp,
+                                     unsigned long flags);
+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
  static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
  static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
  static void rcu_kick_nohz_cpu(int cpu);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h

index b023e5407111ca7b678e096d02e25a0dd966521e..752ffaa0d681eee1725172e0c4e06306b7415619 100644 (file)
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2104,7 +2104,8 @@ bool rcu_is_nocb_cpu(int cpu)
  static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
                                     struct rcu_head *rhp,
                                     struct rcu_head **rhtp,
-                                   int rhcount, int rhcount_lazy)
+                                   int rhcount, int rhcount_lazy,
+                                   unsigned long flags)
  {
         int len;
         struct rcu_head **old_rhpp;
@@ -2125,9 +2126,16 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
         }
         len = atomic_long_read(&rdp->nocb_q_count);
         if (old_rhpp == &rdp->nocb_head) {
-               wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
+               if (!irqs_disabled_flags(flags)) {
+                       wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+                       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                           TPS("WakeEmpty"));
+               } else {
+                       rdp->nocb_defer_wakeup = true;
+                       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                           TPS("WakeEmptyIsDeferred"));
+               }
                 rdp->qlen_last_fqs_check = 0;
-               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
         } else if (len > rdp->qlen_last_fqs_check + qhimark) {
                 wake_up_process(t); /* ... or if many callbacks queued. */
                 rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2148,12 +2156,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
   * "rcuo" kthread can find it.
   */
  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
-                           bool lazy)
+                           bool lazy, unsigned long flags)
  {
  
         if (!rcu_is_nocb_cpu(rdp->cpu))
                 return 0;
-       __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
+       __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
         if (__is_kfree_rcu_offset((unsigned long)rhp->func))
                 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
                                          (unsigned long)rhp->func,
@@ -2171,7 +2179,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
   * not a no-CBs CPU.
   */
  static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
-                                                    struct rcu_data *rdp)
+                                                    struct rcu_data *rdp,
+                                                    unsigned long flags)
  {
         long ql = rsp->qlen;
         long qll = rsp->qlen_lazy;
@@ -2185,14 +2194,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
         /* First, enqueue the donelist, if any.  This preserves CB ordering. */
         if (rsp->orphan_donelist != NULL) {
                 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
-                                       rsp->orphan_donetail, ql, qll);
+                                       rsp->orphan_donetail, ql, qll, flags);
                 ql = qll = 0;
                 rsp->orphan_donelist = NULL;
                 rsp->orphan_donetail = &rsp->orphan_donelist;
         }
         if (rsp->orphan_nxtlist != NULL) {
                 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
-                                       rsp->orphan_nxttail, ql, qll);
+                                       rsp->orphan_nxttail, ql, qll, flags);
                 ql = qll = 0;
                 rsp->orphan_nxtlist = NULL;
                 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
@@ -2314,6 +2323,22 @@ static int rcu_nocb_kthread(void *arg)
         return 0;
  }
  
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+       return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+       if (!rcu_nocb_need_deferred_wakeup(rdp))
+               return;
+       ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
+       wake_up(&rdp->nocb_wq);
+       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
+}
+
  /* Initialize per-rcu_data variables for no-CBs CPUs. */
  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
  {
@@ -2369,13 +2394,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
  }
  
  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
-                           bool lazy)
+                           bool lazy, unsigned long flags)
  {
         return 0;
  }
  
  static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
-                                                    struct rcu_data *rdp)
+                                                    struct rcu_data *rdp,
+                                                    unsigned long flags)
  {
         return 0;
  }
@@ -2384,6 +2410,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
  {
  }
  
+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+       return false;
+}
+
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+}
+
  static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
  {
  }
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c

index 3596797b7e462b2069f8ef21f1b669d777667689..4def475336d412bcbfd8aa4e34a600e0f8b62d41 100644 (file)
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -364,9 +364,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
                    rdp->n_rp_report_qs,
                    rdp->n_rp_cb_ready,
                    rdp->n_rp_cpu_needs_gp);
-       seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n",
+       seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw=%ld\n",
                    rdp->n_rp_gp_completed,
                    rdp->n_rp_gp_started,
-                  rdp->n_rp_need_nothing);
+                  rdp->n_rp_need_nothing,
+                  rdp->n_rp_nocb_defer_wakeup);
  }
author	Paul E. McKenney <paulmck@linux.vnet.ibm.com>
	Fri, 4 Oct 2013 21:33:34 +0000 (14:33 -0700)
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>
	Tue, 3 Dec 2013 18:10:18 +0000 (10:10 -0800)
Documentation/RCU/trace.txt		patch \| blob \| history
kernel/rcu/tree.c		patch \| blob \| history
kernel/rcu/tree.h		patch \| blob \| history
kernel/rcu/tree_plugin.h		patch \| blob \| history
kernel/rcu/tree_trace.c		patch \| blob \| history