[PATCH] mm: dequeue a huge page near to this node

[mv-sheeva.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 3c5cf664abd2eca14d2613a57023dcfcac6b6fde..1e49dc7cd61969c49cef5185ff1e424be11b6815 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -60,8 +60,11 @@ long nr_swap_pages;
   *     NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
   *     HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
   *     HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
   */
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
  
  EXPORT_SYMBOL(totalram_pages);
  
@@ -72,7 +75,7 @@ EXPORT_SYMBOL(totalram_pages);
  struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
  EXPORT_SYMBOL(zone_table);
  
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
  int min_free_kbytes = 1024;
  
  unsigned long __initdata nr_kernel_pages;
@@ -124,7 +127,7 @@ static void bad_page(const char *function, struct page *page)
         printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
                 function, current->comm, page);
         printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-               (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
+               (int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
                 page->mapping, page_mapcount(page), page_count(page));
         printk(KERN_EMERG "Backtrace:\n");
         dump_stack();
@@ -137,18 +140,13 @@ static void bad_page(const char *function, struct page *page)
                         1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
-                       1 << PG_writeback |
-                       1 << PG_reserved );
+                       1 << PG_writeback );
         set_page_count(page, 0);
         reset_page_mapcount(page);
         page->mapping = NULL;
         add_taint(TAINT_BAD_PAGE);
  }
  
-#ifndef CONFIG_HUGETLB_PAGE
-#define prep_compound_page(page, order) do { } while (0)
-#define destroy_compound_page(page, order) do { } while (0)
-#else
  /*
   * Higher-order pages are called "compound pages".  They are structured thusly:
   *
@@ -202,7 +200,6 @@ static void destroy_compound_page(struct page *page, unsigned long order)
                 ClearPageCompound(p);
         }
  }
-#endif         /* CONFIG_HUGETLB_PAGE */
  
  /*
   * function for dealing with page's order in buddy system.
@@ -337,7 +334,7 @@ static inline void __free_pages_bulk (struct page *page,
         zone->free_area[order].nr_free++;
  }
  
-static inline void free_pages_check(const char *function, struct page *page)
+static inline int free_pages_check(const char *function, struct page *page)
  {
         if (    page_mapcount(page) ||
                 page->mapping != NULL ||
@@ -355,6 +352,12 @@ static inline void free_pages_check(const char *function, struct page *page)
                 bad_page(function, page);
         if (PageDirty(page))
                 __ClearPageDirty(page);
+       /*
+        * For now, we report if PG_reserved was found set, but do not
+        * clear it, and do not free the page.  But we shall soon need
+        * to do more, for when the ZERO_PAGE count wraps negative.
+        */
+       return PageReserved(page);
  }
  
  /*
@@ -394,11 +397,10 @@ void __free_pages_ok(struct page *page, unsigned int order)
  {
         LIST_HEAD(list);
         int i;
+       int reserved = 0;
  
         arch_free_page(page, order);
  
-       mod_page_state(pgfree, 1 << order);
-
  #ifndef CONFIG_MMU
         if (order > 0)
                 for (i = 1 ; i < (1 << order) ; ++i)
@@ -406,8 +408,12 @@ void __free_pages_ok(struct page *page, unsigned int order)
  #endif
  
         for (i = 0 ; i < (1 << order) ; ++i)
-               free_pages_check(__FUNCTION__, page + i);
+               reserved += free_pages_check(__FUNCTION__, page + i);
+       if (reserved)
+               return;
+
         list_add(&page->lru, &list);
+       mod_page_state(pgfree, 1 << order);
         kernel_map_pages(page, 1<<order, 0);
         free_pages_bulk(page_zone(page), 1, &list, order);
  }
@@ -465,7 +471,7 @@ void set_page_refs(struct page *page, int order)
  /*
   * This page is about to be returned from the page allocator
   */
-static void prep_new_page(struct page *page, int order)
+static int prep_new_page(struct page *page, int order)
  {
         if (    page_mapcount(page) ||
                 page->mapping != NULL ||
@@ -483,12 +489,20 @@ static void prep_new_page(struct page *page, int order)
                         1 << PG_reserved )))
                 bad_page(__FUNCTION__, page);
  
+       /*
+        * For now, we report if PG_reserved was found set, but do not
+        * clear it, and do not allocate the page: as a safety net.
+        */
+       if (PageReserved(page))
+               return 1;
+
         page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                         1 << PG_referenced | 1 << PG_arch_1 |
                         1 << PG_checked | 1 << PG_mappedtodisk);
         set_page_private(page, 0);
         set_page_refs(page, order);
         kernel_map_pages(page, 1 << order, 1);
+       return 0;
  }
  
  /* 
@@ -671,11 +685,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
  
         arch_free_page(page, 0);
  
-       kernel_map_pages(page, 1, 0);
-       inc_page_state(pgfree);
         if (PageAnon(page))
                 page->mapping = NULL;
-       free_pages_check(__FUNCTION__, page);
+       if (free_pages_check(__FUNCTION__, page))
+               return;
+
+       inc_page_state(pgfree);
+       kernel_map_pages(page, 1, 0);
+
         pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
         local_irq_save(flags);
         list_add(&page->lru, &pcp->list);
@@ -714,12 +731,14 @@ static struct page *
  buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
  {
         unsigned long flags;
-       struct page *page = NULL;
+       struct page *page;
         int cold = !!(gfp_flags & __GFP_COLD);
  
+again:
         if (order == 0) {
                 struct per_cpu_pages *pcp;
  
+               page = NULL;
                 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
                 local_irq_save(flags);
                 if (pcp->count <= pcp->low)
@@ -741,7 +760,8 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
         if (page != NULL) {
                 BUG_ON(bad_range(zone, page));
                 mod_page_state_zone(zone, pgalloc, 1 << order);
-               prep_new_page(page, order);
+               if (prep_new_page(page, order))
+                       goto again;
  
                 if (gfp_flags & __GFP_ZERO)
                         prep_zero_page(page, order, gfp_flags);
@@ -753,9 +773,12 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
  }
  
  #define ALLOC_NO_WATERMARKS    0x01 /* don't check watermarks at all */
-#define ALLOC_HARDER           0x02 /* try to alloc harder */
-#define ALLOC_HIGH             0x04 /* __GFP_HIGH set */
-#define ALLOC_CPUSET           0x08 /* check for correct cpuset */
+#define ALLOC_WMARK_MIN                0x02 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW                0x04 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH       0x08 /* use pages_high watermark */
+#define ALLOC_HARDER           0x10 /* try to alloc harder */
+#define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET           0x40 /* check for correct cpuset */
  
  /*
   * Return 1 if free pages are above 'mark'. This takes into account the order
@@ -810,7 +833,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
                         continue;
  
                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
-                       if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+                       unsigned long mark;
+                       if (alloc_flags & ALLOC_WMARK_MIN)
+                               mark = (*z)->pages_min;
+                       else if (alloc_flags & ALLOC_WMARK_LOW)
+                               mark = (*z)->pages_low;
+                       else
+                               mark = (*z)->pages_high;
+                       if (!zone_watermark_ok(*z, order, mark,
                                     classzone_idx, alloc_flags))
                                 continue;
                 }
@@ -842,21 +872,22 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
  
         might_sleep_if(wait);
  
+restart:
         z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
  
         if (unlikely(*z == NULL)) {
                 /* Should this ever happen?? */
                 return NULL;
         }
-restart:
+
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-                               zonelist, ALLOC_CPUSET);
+                               zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
         if (page)
                 goto got_pg;
  
-       do
+       do {
                 wakeup_kswapd(*z, order);
-       while (*(++z));
+       } while (*(++z));
  
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -867,13 +898,12 @@ restart:
          * cannot run direct reclaim, or if the caller has realtime scheduling
          * policy.
          */
-       alloc_flags = 0;
+       alloc_flags = ALLOC_WMARK_MIN;
         if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
                 alloc_flags |= ALLOC_HARDER;
         if (gfp_mask & __GFP_HIGH)
                 alloc_flags |= ALLOC_HIGH;
-       if (wait)
-               alloc_flags |= ALLOC_CPUSET;
+       alloc_flags |= ALLOC_CPUSET;
  
         /*
          * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -895,7 +925,7 @@ restart:
  nofail_alloc:
                         /* go through the zonelist yet again, ignoring mins */
                         page = get_page_from_freelist(gfp_mask, order,
-                               zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+                               zonelist, ALLOC_NO_WATERMARKS);
                         if (page)
                                 goto got_pg;
                         if (gfp_mask & __GFP_NOFAIL) {
@@ -938,7 +968,7 @@ rebalance:
                  * under heavy pressure.
                  */
                 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-                                               zonelist, ALLOC_CPUSET);
+                               zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
                 if (page)
                         goto got_pg;
  
@@ -1421,6 +1451,10 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
                 zone = pgdat->node_zones + ZONE_NORMAL;
                 if (zone->present_pages)
                         zonelist->zones[j++] = zone;
+       case ZONE_DMA32:
+               zone = pgdat->node_zones + ZONE_DMA32;
+               if (zone->present_pages)
+                       zonelist->zones[j++] = zone;
         case ZONE_DMA:
                 zone = pgdat->node_zones + ZONE_DMA;
                 if (zone->present_pages)
@@ -1435,6 +1469,8 @@ static inline int highest_zone(int zone_bits)
         int res = ZONE_NORMAL;
         if (zone_bits & (__force int)__GFP_HIGHMEM)
                 res = ZONE_HIGHMEM;
+       if (zone_bits & (__force int)__GFP_DMA32)
+               res = ZONE_DMA32;
         if (zone_bits & (__force int)__GFP_DMA)
                 res = ZONE_DMA;
         return res;
@@ -1735,16 +1771,16 @@ static int __devinit zone_batchsize(struct zone *zone)
                 batch = 1;
  
         /*
-        * We will be trying to allcoate bigger chunks of contiguous
-        * memory of the order of fls(batch).  This should result in
-        * better cache coloring.
+        * Clamp the batch to a 2^n - 1 value. Having a power
+        * of 2 value was found to be more likely to have
+        * suboptimal cache aliasing properties in some cases.
          *
-        * A sanity check also to ensure that batch is still in limits.
+        * For example if 2 tasks are alternately allocating
+        * batches of pages, one task can end up with a lot
+        * of pages of one half of the possible page colors
+        * and the other with pages of the other colors.
          */
-       batch = (1 << fls(batch + batch/2));
-
-       if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
-               batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
+       batch = (1 << (fls(batch + batch/2)-1)) - 1;
  
         return batch;
  }
@@ -1846,11 +1882,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
                         if (process_zones(cpu))
                                 ret = NOTIFY_BAD;
                         break;
-#ifdef CONFIG_HOTPLUG_CPU
+               case CPU_UP_CANCELED:
                 case CPU_DEAD:
                         free_zone_pagesets(cpu);
                         break;
-#endif
                 default:
                         break;
         }
@@ -1860,7 +1895,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
  static struct notifier_block pageset_notifier =
         { &pageset_cpuup_callback, NULL, 0 };
  
-void __init setup_per_cpu_pageset()
+void __init setup_per_cpu_pageset(void)
  {
         int err;
  
@@ -1955,7 +1990,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                 if (zholes_size)
                         realsize -= zholes_size[j];
  
-               if (j == ZONE_DMA || j == ZONE_NORMAL)
+               if (j < ZONE_HIGHMEM)
                         nr_kernel_pages += realsize;
                 nr_all_pages += realsize;