Merge branch 'akpm' (patches from Andrew Morton)

[linux-beck.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 8f785b1534a31d84d4bd705ab65cb5b061466a28..0ea758b898fdae74dbe5bf414b8995c4c944e7b3 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
  
  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
  static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION   (8)
  
  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  DEFINE_PER_CPU(int, numa_node);
@@ -815,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
                 set_page_count(p, 0);
         } while (++p, --i);
  
-       set_page_refcounted(page);
         set_pageblock_migratetype(page, MIGRATE_CMA);
-       __free_pages(page, pageblock_order);
+
+       if (pageblock_order >= MAX_ORDER) {
+               i = pageblock_nr_pages;
+               p = page;
+               do {
+                       set_page_refcounted(p);
+                       __free_pages(p, MAX_ORDER - 1);
+                       p += MAX_ORDER_NR_PAGES;
+               } while (i -= MAX_ORDER_NR_PAGES);
+       } else {
+               set_page_refcounted(page);
+               __free_pages(page, pageblock_order);
+       }
+
         adjust_managed_page_count(page, pageblock_nr_pages);
  }
  #endif
@@ -3389,7 +3402,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
  /*
   * sysctl handler for numa_zonelist_order
   */
-int numa_zonelist_order_handler(ctl_table *table, int write,
+int numa_zonelist_order_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *length,
                 loff_t *ppos)
  {
@@ -4145,7 +4158,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
         memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
  #endif
  
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
  {
  #ifdef CONFIG_MMU
         int batch;
@@ -4261,8 +4274,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
         pageset_update(&p->pcp, high, batch);
  }
  
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
-               struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+                                      struct per_cpu_pageset *pcp)
  {
         if (percpu_pagelist_fraction)
                 pageset_set_high(pcp,
@@ -4387,9 +4400,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
  /*
   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- * Architectures may implement their own version but if add_active_range()
- * was used and there are no special requirements, this is a convenient
- * alternative
   */
  int __meminit __early_pfn_to_nid(unsigned long pfn)
  {
@@ -4444,10 +4454,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
   * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
   *
- * If an architecture guarantees that all ranges registered with
- * add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling memblock_free_early_nid()
- * manually.
+ * If an architecture guarantees that all ranges registered contain no holes
+ * and may be freed, this function may be used instead of calling
+ * memblock_free_early_nid() manually.
   */
  void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
  {
@@ -4469,9 +4478,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
   * sparse_memory_present_with_active_regions - Call memory_present for each active range
   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
   *
- * If an architecture guarantees that all ranges registered with
- * add_active_ranges() contain no holes and may be freed, this
- * function may be used instead of calling memory_present() manually.
+ * If an architecture guarantees that all ranges registered contain no holes and may
+ * be freed, this function may be used instead of calling memory_present() manually.
   */
  void __init sparse_memory_present_with_active_regions(int nid)
  {
@@ -4489,7 +4497,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
   * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
   *
   * It returns the start and end page frame of a node based on information
- * provided by an arch calling add_active_range(). If called for a node
+ * provided by memblock_set_node(). If called for a node
   * with no available memory, a warning is printed and the start and end
   * PFNs will be 0.
   */
@@ -5066,7 +5074,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
   * find_min_pfn_with_active_regions - Find the minimum PFN registered
   *
   * It returns the minimum PFN based on information provided via
- * add_active_range().
+ * memblock_set_node().
   */
  unsigned long __init find_min_pfn_with_active_regions(void)
  {
@@ -5287,7 +5295,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
   * @max_zone_pfn: an array of max PFNs for each zone
   *
   * This will call free_area_init_node() for each active node in the system.
- * Using the page ranges provided by add_active_range(), the size of each
+ * Using the page ranges provided by memblock_set_node(), the size of each
   * zone in each node and their holes is calculated. If the maximum PFN
   * between two adjacent zones match, it is assumed that the zone is empty.
   * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
@@ -5810,7 +5818,7 @@ module_init(init_per_zone_wmark_min)
   *     that we can call two helper functions whenever min_free_kbytes
   *     changes.
   */
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
         int rc;
@@ -5827,7 +5835,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
  }
  
  #ifdef CONFIG_NUMA
-int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
+int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
         struct zone *zone;
@@ -5843,7 +5851,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
         return 0;
  }
  
-int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
         struct zone *zone;
@@ -5869,7 +5877,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
   * minimum watermarks. The lowmem reserve ratio can only make sense
   * if in function of the boot time zone sizes.
   */
-int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
         proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -5882,27 +5890,42 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
   * cpu.  It is the fraction of total pages in each zone that a hot per cpu
   * pagelist can have before it gets flushed back to buddy allocator.
   */
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
         struct zone *zone;
-       unsigned int cpu;
+       int old_percpu_pagelist_fraction;
         int ret;
  
+       mutex_lock(&pcp_batch_high_lock);
+       old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-       if (!write || (ret < 0))
-               return ret;
+       if (!write || ret < 0)
+               goto out;
+
+       /* Sanity checking to avoid pcp imbalance */
+       if (percpu_pagelist_fraction &&
+           percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+               percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* No change? */
+       if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+               goto out;
  
-       mutex_lock(&pcp_batch_high_lock);
         for_each_populated_zone(zone) {
-               unsigned long  high;
-               high = zone->managed_pages / percpu_pagelist_fraction;
+               unsigned int cpu;
+
                 for_each_possible_cpu(cpu)
-                       pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
-                                        high);
+                       pageset_set_high_and_batch(zone,
+                                       per_cpu_ptr(zone->pageset, cpu));
         }
+out:
         mutex_unlock(&pcp_batch_high_lock);
-       return 0;
+       return ret;
  }
  
  int hashdist = HASHDIST_DEFAULT;