mm: fix Committed_AS underflow on large NR_CPUS environment
author     KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
           Tue, 28 Apr 2009 20:48:11 +0000 (22:48 +0200)
committer  Greg Kroah-Hartman <gregkh@suse.de>
           Fri, 8 May 2009 22:45:11 +0000 (15:45 -0700)
commit 00a62ce91e554198ef28234c91c36f850f5a3bc9 upstream

The Committed_AS field can underflow in certain situations:

>         # while true; do cat /proc/meminfo  | grep _AS; sleep 1; done | uniq -c
>               1 Committed_AS: 18446744073709323392 kB
>              11 Committed_AS: 18446744073709455488 kB
>               6 Committed_AS:    35136 kB
>               5 Committed_AS: 18446744073709454400 kB
>               7 Committed_AS:    35904 kB
>               3 Committed_AS: 18446744073709453248 kB
>               2 Committed_AS:    34752 kB
>               9 Committed_AS: 18446744073709453248 kB
>               8 Committed_AS:    34752 kB
>               3 Committed_AS: 18446744073709320960 kB
>               7 Committed_AS: 18446744073709454080 kB
>               3 Committed_AS: 18446744073709320960 kB
>               5 Committed_AS: 18446744073709454080 kB
>               6 Committed_AS: 18446744073709320960 kB

It underflows because NR_CPUS can be greater than 1000, so the per-CPU
batching threshold in vm_acct_memory() (which is proportional to
NR_CPUS) lets the global counter drift far below zero, and
meminfo_proc_show() does not check for underflow before printing it.
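
For scale (back-of-the-envelope, assuming 4 kB pages; the reporter's
exact configuration is not recorded here):

  2^64 - 18446744073709323392 = 228224

so the counter had drifted to roughly -228224 kB (-57056 pages) before
being printed with an unsigned format.  The old SMP path in mm/swap.c
lets every CPU cache up to ACCT_THRESHOLD = max(16, NR_CPUS * 2) pages
before folding them into the global atomic; with NR_CPUS = 1024 that is
2048 pages (8 MB) of invisible slop per CPU in either direction, so a
few dozen CPUs are enough to push the global sum well below zero.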

A threshold proportional to NR_CPUS is the wrong calculation in any
case.  The likelihood of contention on the shared counter grows with
the number of online CPUs, not with the theoretical maximum (NR_CPUS);
the comparison below makes the gap concrete.
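
To put numbers on that (illustrative, not taken from the report): a
distro kernel built with NR_CPUS = 1024 but booted on an 8-core box
still sizes the old per-CPU threshold at max(16, 1024 * 2) = 2048
pages, even though at most 8 CPUs can ever touch the counter.  A
threshold derived from the online CPU count would be max(16, 8 * 2) =
16 pages on the same machine, 128 times less slop per CPU; the generic
percpu_counter scales its batch with the online CPU count for exactly
this reason.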

The kernel already has a generic percpu_counter facility, and using it
is the right fix.  It simplifies the code (the open-coded SMP batching
and CPU-hotplug callback in mm/swap.c go away entirely), and because
readers use percpu_counter_read_positive(), a transiently negative sum
can never be reported to userspace.
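
The sketch below is a self-contained user-space model of that
mechanism, not kernel code: the struct, the BATCH value, and the
function names are invented for illustration (the real API is
percpu_counter_init/_add/_read_positive, and the kernel's clamped read
may return a small positive value rather than 0 for a negative sum).
It shows both how batched folding lets the shared total go transiently
negative and how a clamped reader hides that artifact:

  /*
   * model.c - user-space sketch of a batched counter plus a clamped
   * read.  Build and run:  cc -o model model.c && ./model
   */
  #include <stdio.h>

  #define BATCH 32                 /* per-"CPU" slop before folding */

  struct model_counter {
          long total;              /* the shared global count */
          long delta[2];           /* one cached delta per "CPU" */
  };

  /* Fold a CPU's cached delta into the shared total only once it
   * exceeds the batch threshold, the same batching idea used by the
   * old vm_acct_memory() and by percpu_counter_add(). */
  static void model_add(struct model_counter *c, int cpu, long pages)
  {
          c->delta[cpu] += pages;
          if (c->delta[cpu] >= BATCH || c->delta[cpu] <= -BATCH) {
                  c->total += c->delta[cpu];
                  c->delta[cpu] = 0;
          }
  }

  /* A negative total is a batching artifact (positive deltas may
   * still be cached on other CPUs), so clamp it instead of letting
   * an unsigned print wrap it to ~2^64. */
  static long model_read_positive(struct model_counter *c)
  {
          return c->total < 0 ? 0 : c->total;
  }

  int main(void)
  {
          struct model_counter c = { 0, { 0, 0 } };

          model_add(&c, 0, 20);   /* cpu0: +20 cached, not folded */
          model_add(&c, 0, 20);   /* cpu0: delta hits 40, folds; total = 40 */
          model_add(&c, 0, 20);   /* cpu0: +20 cached again */
          model_add(&c, 1, -60);  /* cpu1 unaccounts all 60: folds; total = -20 */

          printf("raw total    : %ld\n", c.total);
          printf("printed %%lu  : %lu\n", (unsigned long)c.total);
          printf("clamped read : %ld\n", model_read_positive(&c));
          return 0;
  }

On an LP64 machine this prints a raw total of -20, the same value
rendered through %lu as 18446744073709551596 (the meminfo symptom
above, at smaller scale), and a clamped read of 0.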

Reported-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Eric B Munson <ebmunson@us.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
fs/proc/meminfo.c
include/linux/mman.h
mm/mmap.c
mm/nommu.c
mm/swap.c

index 43d23948384addca6930aa65acbf49804226827f..52981cdd85a8ba4b470c60d4cf7cdf429046ad57 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -35,7 +35,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #define K(x) ((x) << (PAGE_SHIFT - 10))
        si_meminfo(&i);
        si_swapinfo(&i);
-       committed = atomic_long_read(&vm_committed_space);
+       committed = percpu_counter_read_positive(&vm_committed_as);
        allowed = ((totalram_pages - hugetlb_total_pages())
                * sysctl_overcommit_ratio / 100) + total_swap_pages;
 
index 30d1073bac3ba63459123db829283fa9ec7eb8e5..9872d6ca58ae9a7b88dcc97387f86a434a633db5 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
 
 #ifdef __KERNEL__
 #include <linux/mm.h>
+#include <linux/percpu_counter.h>
 
 #include <asm/atomic.h>
 
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
-extern atomic_long_t vm_committed_space;
+extern struct percpu_counter vm_committed_as;
 
-#ifdef CONFIG_SMP
-extern void vm_acct_memory(long pages);
-#else
 static inline void vm_acct_memory(long pages)
 {
-       atomic_long_add(pages, &vm_committed_space);
+       percpu_counter_add(&vm_committed_as, pages);
 }
-#endif
 
 static inline void vm_unacct_memory(long pages)
 {
index f1aa6f92bc8ae9cd31a585e30b91cab9af7f55e8..efff81b133e64efecab48f7174473970c4535b0d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -84,7 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50;      /* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
 
 /*
  * Check that a process has enough memory to allocate a new virtual
@@ -178,11 +178,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
        if (mm)
                allowed -= mm->total_vm / 32;
 
-       /*
-        * cast `allowed' as a signed long because vm_committed_space
-        * sometimes has a negative value
-        */
-       if (atomic_long_read(&vm_committed_space) < (long)allowed)
+       if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
 error:
        vm_unacct_memory(pages);
@@ -2477,6 +2473,10 @@ void mm_drop_all_locks(struct mm_struct *mm)
  */
 void __init mmap_init(void)
 {
+       int ret;
+
+       ret = percpu_counter_init(&vm_committed_as, 0);
+       VM_BUG_ON(ret);
        vm_area_cachep = kmem_cache_create("vm_area_struct",
                        sizeof(struct vm_area_struct), 0,
                        SLAB_PANIC, NULL);
index 2fcf47d449b4e203d46bbbf76827aeedbd2e2c22..ee955bc2784566ba2e0a98077adec25c1f0b533e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,7 +62,7 @@ void *high_memory;
 struct page *mem_map;
 unsigned long max_mapnr;
 unsigned long num_physpages;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
@@ -463,6 +463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
  */
 void __init mmap_init(void)
 {
+       int ret;
+
+       ret = percpu_counter_init(&vm_committed_as, 0);
+       VM_BUG_ON(ret);
        vm_region_jar = kmem_cache_create("vm_region_jar",
                                          sizeof(struct vm_region), 0,
                                          SLAB_PANIC, NULL);
@@ -1849,12 +1853,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
        if (mm)
                allowed -= mm->total_vm / 32;
 
-       /*
-        * cast `allowed' as a signed long because vm_committed_space
-        * sometimes has a negative value
-        */
-       if (atomic_long_read(&vm_committed_space) < (long)allowed)
+       if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
+
 error:
        vm_unacct_memory(pages);
 
index 8adb9feb61e10b7385e01d3410d8cb0f0a4807e1..2460f7de91f778c88d162a803b0251ed23e97604 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -514,49 +514,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 
 EXPORT_SYMBOL(pagevec_lookup_tag);
 
-#ifdef CONFIG_SMP
-/*
- * We tolerate a little inaccuracy to avoid ping-ponging the counter between
- * CPUs
- */
-#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
-
-static DEFINE_PER_CPU(long, committed_space);
-
-void vm_acct_memory(long pages)
-{
-       long *local;
-
-       preempt_disable();
-       local = &__get_cpu_var(committed_space);
-       *local += pages;
-       if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
-               atomic_long_add(*local, &vm_committed_space);
-               *local = 0;
-       }
-       preempt_enable();
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* Drop the CPU's cached committed space back into the central pool. */
-static int cpu_swap_callback(struct notifier_block *nfb,
-                            unsigned long action,
-                            void *hcpu)
-{
-       long *committed;
-
-       committed = &per_cpu(committed_space, (long)hcpu);
-       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-               atomic_long_add(*committed, &vm_committed_space);
-               *committed = 0;
-               drain_cpu_pagevecs((long)hcpu);
-       }
-       return NOTIFY_OK;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-#endif /* CONFIG_SMP */
-
 /*
  * Perform any setup for the swap system
  */
@@ -577,7 +534,4 @@ void __init swap_setup(void)
         * Right now other parts of the system means that we
         * _really_ don't want to cluster much more
         */
-#ifdef CONFIG_HOTPLUG_CPU
-       hotcpu_notifier(cpu_swap_callback, 0);
-#endif
 }