mm: /proc/sys/vm/stat_refresh to force vmstat update

author Hugh Dickins <hughd@google.com>

Fri, 20 May 2016 00:12:50 +0000 (17:12 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 20 May 2016 02:12:14 +0000 (19:12 -0700)
author Hugh Dickins <hughd@google.com>
Fri, 20 May 2016 00:12:50 +0000 (17:12 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 20 May 2016 02:12:14 +0000 (19:12 -0700)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt

index 34a5fece31216320181fccfbdaf3a85ad91043be..720355cbdf452dd22cc82c92ab7a43677acec04a 100644 (file)
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
  - panic_on_oom
  - percpu_pagelist_fraction
  - stat_interval
+- stat_refresh
  - swappiness
  - user_reserve_kbytes
  - vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
  
  ==============================================================
  
+stat_refresh
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+==============================================================
+
  swappiness
  
  This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h

index 73fae8c4a5fb50d94b72f12bed28f98d170f5787..02fce415b3d96ba6e3cf1b2fcb1ef5b7c196036d 100644 (file)
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -193,6 +193,10 @@ void quiet_vmstat(void);
  void cpu_vm_stats_fold(int cpu);
  void refresh_zone_stat_thresholds(void);
  
+struct ctl_table;
+int vmstat_refresh(struct ctl_table *, int write,
+                  void __user *buffer, size_t *lenp, loff_t *ppos);
+
  void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
  
  int calculate_pressure_threshold(struct zone *zone);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index c8b318663525d02b2098238341aca72c701966fe..2effd84d83e3f5f8d8c2b0496d77cbe1a61b590e 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec_jiffies,
         },
+       {
+               .procname       = "stat_refresh",
+               .data           = NULL,
+               .maxlen         = 0,
+               .mode           = 0600,
+               .proc_handler   = vmstat_refresh,
+       },
  #endif
  #ifdef CONFIG_MMU
         {
diff --git a/mm/vmstat.c b/mm/vmstat.c

index a7de9adacbd9ed27af2e1b08be29b3548dfdf053..c831be32a1a35843cc6f42a0c553e653521200e8 100644 (file)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
  int sysctl_stat_interval __read_mostly = HZ;
  static cpumask_var_t cpu_stat_off;
  
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+       refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+                  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       long val;
+       int err;
+       int i;
+
+       /*
+        * The regular update, every sysctl_stat_interval, may come later
+        * than expected: leaving a significant amount in per_cpu buckets.
+        * This is particularly misleading when checking a quantity of HUGE
+        * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
+        * which can equally be echo'ed to or cat'ted from (by root),
+        * can be used to update the stats just before reading them.
+        *
+        * Oh, and since global_page_state() etc. are so careful to hide
+        * transiently negative values, report an error here if any of
+        * the stats is negative, so we know to go looking for imbalance.
+        */
+       err = schedule_on_each_cpu(refresh_vm_stats);
+       if (err)
+               return err;
+       for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+               val = atomic_long_read(&vm_stat[i]);
+               if (val < 0) {
+                       switch (i) {
+                       case NR_ALLOC_BATCH:
+                       case NR_PAGES_SCANNED:
+                               /*
+                                * These are often seen to go negative in
+                                * recent kernels, but not to go permanently
+                                * negative.  Whilst it would be nicer not to
+                                * have exceptions, rooting them out would be
+                                * another task, of rather low priority.
+                                */
+                               break;
+                       default:
+                               pr_warn("%s: %s %ld\n",
+                                       __func__, vmstat_text[i], val);
+                               err = -EINVAL;
+                               break;
+                       }
+               }
+       }
+       if (err)
+               return err;
+       if (write)
+               *ppos += *lenp;
+       else
+               *lenp = 0;
+       return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
  static void vmstat_update(struct work_struct *w)
  {
         if (refresh_cpu_vm_stats(true)) {
author	Hugh Dickins <hughd@google.com>
	Fri, 20 May 2016 00:12:50 +0000 (17:12 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 20 May 2016 02:12:14 +0000 (19:12 -0700)
Documentation/sysctl/vm.txt		patch | blob | history
include/linux/vmstat.h		patch | blob | history
kernel/sysctl.c		patch | blob | history
mm/vmstat.c		patch | blob | history