From 30fc176aeff55c8dafa29d559353d0a5d4608511 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 9 May 2014 14:30:53 -0700
Subject: [PATCH] rcu: Throttle NOCB kthread grace-period rate

On a system with high CPU utilization and high context-switch rates,
grace periods tend to complete quite quickly.  If all CPUs are no-CBs
CPUs, and all are producing at least one callback per grace period, all
the rcuo kthreads will need to be awakened on every grace period, which
on large systems can be an excessive number of wakeups.  This commit
therefore throttles the rcuo kthreads so that each needs a grace-period
wakeup no more than once per jiffies_till_next_fqs interval, that is,
the interval between a pair of force-quiescent-state scans.

Reported-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 25 +++++++++++++++++--------
 kernel/rcu/tree_plugin.h | 15 +++++++++++++++
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ebd99af2214e..987fd64f70dc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1578,6 +1578,22 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 		rcu_gp_kthread_wake(rsp);
 }
 
+/*
+ * Return the jiffies_till_next_fqs boot/sysfs parameter clamped to the
+ * range [1, HZ], writing the clamped value back if it was out of range.
+ */
+static unsigned long read_jiffies_till_next_fqs(void)
+{
+	unsigned long j;
+
+	j = jiffies_till_next_fqs;
+	if (j > HZ)
+		j = jiffies_till_next_fqs = HZ;
+	else if (j < 1)
+		j = jiffies_till_next_fqs = 1;
+	return j;
+}
+
 /*
  * Initialize a new grace period.  Return 0 if no grace period required.
  */
@@ -1840,14 +1856,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 						       ACCESS_ONCE(rsp->gpnum),
 						       TPS("fqswaitsig"));
 			}
-			j = jiffies_till_next_fqs;
-			if (j > HZ) {
-				j = HZ;
-				jiffies_till_next_fqs = HZ;
-			} else if (j < 1) {
-				j = 1;
-				jiffies_till_next_fqs = 1;
-			}
+			j = read_jiffies_till_next_fqs();
 		}
 
 		/* Handle grace-period end. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 5da9f9b3abc9..55bbf0fd4f7e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2220,6 +2220,8 @@ static int rcu_nocb_kthread(void *arg)
 {
 	int c, cl;
 	bool firsttime = 1;
+	unsigned long gp_next_start = jiffies;
+	unsigned long j;
 	struct rcu_head *list;
 	struct rcu_head *next;
 	struct rcu_head **tail;
@@ -2227,6 +2229,11 @@ static int rcu_nocb_kthread(void *arg)
 
 	/* Each pass through this loop invokes one batch of callbacks */
 	for (;;) {
+		/* Avoid excessive wakeups due to short grace periods. */
+		j = jiffies;
+		if (time_before(j, gp_next_start))
+			schedule_timeout_uninterruptible(gp_next_start - j);
+
 		/* If not polling, wait for next batch of callbacks. */
 		if (!rcu_nocb_poll) {
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -2250,6 +2257,14 @@ static int rcu_nocb_kthread(void *arg)
 		firsttime = 1;
 		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 				    TPS("WokeNonEmpty"));
+		/*
+		 * Time of next grace-period start.  The idea is that
+		 * we are willing to take a jiffies_till_next_fqs delay
+		 * if any CPUs are idle, so we should be willing to take
+		 * a similar delay when all are busy context-switching
+		 * their little brains out.
+		 */
+		gp_next_start = jiffies + read_jiffies_till_next_fqs();
 
 		/*
 		 * Extract queued callbacks, update counts, and wait
-- 
2.39.5