mm: migration: allow migration to operate asynchronously and avoid synchronous compaction in the faster path

[mv-sheeva.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 07a654486f75cfdfe13035f7bf501e2d2dbf0a4a..0fd486467b4b953f318510744dc2b87d71f7d021 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
   * only be modified with pm_mutex held, unless the suspend/hibernate code is
   * guaranteed not to run in parallel with that modification).
   */
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
  {
         WARN_ON(!mutex_is_locked(&pm_mutex));
-       gfp_allowed_mask = mask;
+       if (saved_gfp_mask) {
+               gfp_allowed_mask = saved_gfp_mask;
+               saved_gfp_mask = 0;
+       }
  }
  
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
  {
-       gfp_t ret = gfp_allowed_mask;
-
         WARN_ON(!mutex_is_locked(&pm_mutex));
-       gfp_allowed_mask &= ~mask;
-       return ret;
+       WARN_ON(saved_gfp_mask);
+       saved_gfp_mask = gfp_allowed_mask;
+       gfp_allowed_mask &= ~GFP_IOFS;
  }
  #endif /* CONFIG_PM_SLEEP */
  
@@ -1455,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  #endif /* CONFIG_FAIL_PAGE_ALLOC */
  
  /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
   * of the allocation.
   */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-                     int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags, long free_pages)
  {
         /* free_pages my go negative - that's OK */
         long min = mark;
-       long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
         int o;
  
+       free_pages -= (1 << order) - 1; /* same as the old "- (1 << order) + 1" */
         if (alloc_flags & ALLOC_HIGH)
                 min -= min / 2;
         if (alloc_flags & ALLOC_HARDER)
                 min -= min / 4;
  
         if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-               return 0;
+               return false;
         for (o = 0; o < order; o++) {
                 /* At the next order, this order's pages become unavailable */
                 free_pages -= z->free_area[o].nr_free << o;
@@ -1481,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                 min >>= 1;
  
                 if (free_pages <= min)
-                       return 0;
+                       return false;
         }
-       return 1;
+       return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                       zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+       if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+               free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                                               free_pages);
  }
  
  #ifdef CONFIG_NUMA
@@ -1788,15 +1812,19 @@ static struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
         nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress)
+       int migratetype, unsigned long *did_some_progress,
+       bool sync_migration)
  {
         struct page *page;
+       struct task_struct *tsk = current;
  
         if (!order || compaction_deferred(preferred_zone))
                 return NULL;
  
+       tsk->flags |= PF_MEMALLOC;
         *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-                                                               nodemask);
+                                               nodemask, sync_migration);
+       tsk->flags &= ~PF_MEMALLOC;
         if (*did_some_progress != COMPACT_SKIPPED) {
  
                 /* Page migration frees to the PCP lists but we want merging */
@@ -1832,7 +1860,8 @@ static inline struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
         nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress)
+       int migratetype, unsigned long *did_some_progress,
+       bool sync_migration)
  {
         return NULL;
  }
@@ -1974,6 +2003,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         unsigned long pages_reclaimed = 0;
         unsigned long did_some_progress;
         struct task_struct *p = current;
+       bool sync_migration = false;
  
         /*
          * In the slowpath, we sanity check order to avoid ever trying to
@@ -2036,14 +2066,19 @@ rebalance:
         if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
                 goto nopage;
  
-       /* Try direct compaction */
+       /*
+        * Try direct compaction. The first pass is asynchronous. Subsequent
+        * attempts after direct reclaim are synchronous
+        */
         page = __alloc_pages_direct_compact(gfp_mask, order,
                                         zonelist, high_zoneidx,
                                         nodemask,
                                         alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress);
+                                       migratetype, &did_some_progress,
+                                       sync_migration);
         if (page)
                 goto got_pg;
+       sync_migration = true;
  
         /* Try direct reclaim and then allocating */
         page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2097,6 +2132,20 @@ rebalance:
                 /* Wait for some write requests to complete then retry */
                 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                 goto rebalance;
+       } else {
+               /*
+                * High-order allocations do not necessarily loop after
+                * direct reclaim and reclaim/compaction depends on compaction
+                * being called after reclaim so call directly if necessary
+                */
+               page = __alloc_pages_direct_compact(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, preferred_zone,
+                                       migratetype, &did_some_progress,
+                                       sync_migration);
+               if (page)
+                       goto got_pg;
         }
  
  nopage:
@@ -2437,7 +2486,7 @@ void show_free_areas(void)
                         " all_unreclaimable? %s"
                         "\n",
                         zone->name,
-                       K(zone_nr_free_pages(zone)),
+                       K(zone_page_state(zone, NR_FREE_PAGES)),
                         K(min_wmark_pages(zone)),
                         K(low_wmark_pages(zone)),
                         K(high_wmark_pages(zone)),
@@ -3008,14 +3057,6 @@ static __init_refok int __build_all_zonelists(void *data)
                 build_zonelist_cache(pgdat);
         }
  
-#ifdef CONFIG_MEMORY_HOTPLUG
-       /* Setup real pagesets for the new zone */
-       if (data) {
-               struct zone *zone = data;
-               setup_zone_pageset(zone);
-       }
-#endif
-
         /*
          * Initialize the boot_pagesets that are going to be used
          * for bootstrapping processors. The real pagesets for
@@ -3064,7 +3105,11 @@ void build_all_zonelists(void *data)
         } else {
                 /* we have to stop all cpus to guarantee there is no user
                    of zonelist */
-               stop_machine(__build_all_zonelists, data, NULL);
+#ifdef CONFIG_MEMORY_HOTPLUG
+               if (data)
+                       setup_zone_pageset((struct zone *)data);
+#endif
+               stop_machine(__build_all_zonelists, NULL, NULL);
                 /* cpuset refresh routine should be here */
         }
         vm_total_pages = nr_free_pagecache_pages();
@@ -4013,7 +4058,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
                 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
  }
  #else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
                                 struct zone *zone, unsigned long zonesize) {}
  #endif /* CONFIG_SPARSEMEM */