SLUB slab validation: Move tracking information alloc outside of lock

[karo-tx-linux.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 7430df68cb64aaf2856c7158920442c77fe1cf27..2225b7c9df8582cb8713281192c81d9b6d49e20c 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -66,17 +66,8 @@ struct scan_control {
         int swappiness;
  
         int all_unreclaimable;
-};
  
-/*
- * The list of shrinker callbacks used by to apply pressure to
- * ageable caches.
- */
-struct shrinker {
-       shrinker_t              shrinker;
-       struct list_head        list;
-       int                     seeks;  /* seeks to recreate an obj */
-       long                    nr;     /* objs pending delete */
+       int order;
  };
  
  #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -121,34 +112,25 @@ static DECLARE_RWSEM(shrinker_rwsem);
  /*
   * Add a shrinker callback to be called from the vm
   */
-struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
+void register_shrinker(struct shrinker *shrinker)
  {
-        struct shrinker *shrinker;
-
-        shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
-        if (shrinker) {
-               shrinker->shrinker = theshrinker;
-               shrinker->seeks = seeks;
-               shrinker->nr = 0;
-               down_write(&shrinker_rwsem);
-               list_add_tail(&shrinker->list, &shrinker_list);
-               up_write(&shrinker_rwsem);
-       }
-       return shrinker;
+       shrinker->nr = 0;
+       down_write(&shrinker_rwsem);
+       list_add_tail(&shrinker->list, &shrinker_list);
+       up_write(&shrinker_rwsem);
  }
-EXPORT_SYMBOL(set_shrinker);
+EXPORT_SYMBOL(register_shrinker);
  
  /*
   * Remove one
   */
-void remove_shrinker(struct shrinker *shrinker)
+void unregister_shrinker(struct shrinker *shrinker)
  {
         down_write(&shrinker_rwsem);
         list_del(&shrinker->list);
         up_write(&shrinker_rwsem);
-       kfree(shrinker);
  }
-EXPORT_SYMBOL(remove_shrinker);
+EXPORT_SYMBOL(unregister_shrinker);
  
  #define SHRINK_BATCH 128
  /*
@@ -185,7 +167,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
         list_for_each_entry(shrinker, &shrinker_list, list) {
                 unsigned long long delta;
                 unsigned long total_scan;
-               unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
+               unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
  
                 delta = (4 * scanned) / shrinker->seeks;
                 delta *= max_pass;
@@ -213,8 +195,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
                         int shrink_ret;
                         int nr_before;
  
-                       nr_before = (*shrinker->shrinker)(0, gfp_mask);
-                       shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
+                       nr_before = (*shrinker->shrink)(0, gfp_mask);
+                       shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
                         if (shrink_ret == -1)
                                 break;
                         if (shrink_ret < nr_before)
@@ -284,12 +266,8 @@ static void handle_write_error(struct address_space *mapping,
                                 struct page *page, int error)
  {
         lock_page(page);
-       if (page_mapping(page) == mapping) {
-               if (error == -ENOSPC)
-                       set_bit(AS_ENOSPC, &mapping->flags);
-               else
-                       set_bit(AS_EIO, &mapping->flags);
-       }
+       if (page_mapping(page) == mapping)
+               mapping_set_error(mapping, error);
         unlock_page(page);
  }
  
@@ -485,7 +463,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  
                 referenced = page_referenced(page, 1);
                 /* In active use or really unfreeable?  Activate it. */
-               if (referenced && page_mapping_inuse(page))
+               if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+                                       referenced && page_mapping_inuse(page))
                         goto activate_locked;
  
  #ifdef CONFIG_SWAP
@@ -518,7 +497,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 }
  
                 if (PageDirty(page)) {
-                       if (referenced)
+                       if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
                                 goto keep_locked;
                         if (!may_enter_fs)
                                 goto keep_locked;
@@ -602,6 +581,51 @@ keep:
         return nr_reclaimed;
  }
  
+/* LRU Isolation modes. */
+#define ISOLATE_INACTIVE 0     /* Isolate inactive pages. */
+#define ISOLATE_ACTIVE 1       /* Isolate active pages. */
+#define ISOLATE_BOTH 2         /* Isolate both active and inactive pages. */
+
+/*
+ * Attempt to remove the specified page from its LRU.  Only take this page
+ * if its PageActive status matches the requested isolation mode.
+ *
+ * page:       page to consider
+ * mode:       one of the LRU isolation modes defined above
+ *
+ * returns 0 on success, -EINVAL if the page is not on an LRU or is in the
+ * wrong active state, -EBUSY if the page is being freed elsewhere.
+ */
+static int __isolate_lru_page(struct page *page, int mode)
+{
+       int ret = -EINVAL;
+
+       /* Only take pages on the LRU. */
+       if (!PageLRU(page))
+               return ret;
+
+       /*
+        * When checking the active state, we need to be sure we are
+        * dealing with comparable boolean values.  Take the logical not
+        * of each.
+        */
+       if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+               return ret;
+
+       ret = -EBUSY;
+       if (likely(get_page_unless_zero(page))) {
+               /*
+                * Be careful not to clear PageLRU until after we're
+                * sure the page is not being freed elsewhere -- the
+                * page release code relies on it.
+                */
+               ClearPageLRU(page);
+               ret = 0;
+       }
+
+       return ret;
+}
+
  /*
   * zone->lru_lock is heavily contended.  Some of the functions that
   * shrink the lists perform better by taking out a batch of pages
@@ -616,44 +640,114 @@ keep:
   * @src:       The LRU list to pull pages off.
   * @dst:       The temp list to put pages on to.
   * @scanned:   The number of pages that were scanned.
+ * @order:     The caller's attempted allocation order
+ * @mode:      One of the LRU isolation modes
   *
   * returns how many pages were moved onto *@dst.
   */
  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 struct list_head *src, struct list_head *dst,
-               unsigned long *scanned)
+               unsigned long *scanned, int order, int mode)
  {
         unsigned long nr_taken = 0;
-       struct page *page;
         unsigned long scan;
  
         for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
-               struct list_head *target;
+               struct page *page;
+               unsigned long pfn;
+               unsigned long end_pfn;
+               unsigned long page_pfn;
+               int zone_id;
+
                 page = lru_to_page(src);
                 prefetchw_prev_lru_page(page, src, flags);
  
                 VM_BUG_ON(!PageLRU(page));
  
-               list_del(&page->lru);
-               target = src;
-               if (likely(get_page_unless_zero(page))) {
-                       /*
-                        * Be careful not to clear PageLRU until after we're
-                        * sure the page is not being freed elsewhere -- the
-                        * page release code relies on it.
-                        */
-                       ClearPageLRU(page);
-                       target = dst;
+               switch (__isolate_lru_page(page, mode)) {
+               case 0:
+                       list_move(&page->lru, dst);
                         nr_taken++;
-               } /* else it is being freed elsewhere */
+                       break;
+
+               case -EBUSY:
+                       /* else it is being freed elsewhere */
+                       list_move(&page->lru, src);
+                       continue;
  
-               list_add(&page->lru, target);
+               default:
+                       BUG();
+               }
+
+               if (!order)
+                       continue;
+
+               /*
+                * Attempt to take all pages in the order aligned region
+                * surrounding the tag page.  Only take those pages of
+                * the same active state as that tag page.  We may safely
+                * round the target page pfn down to the requested order
+                * as the mem_map is guaranteed valid out to MAX_ORDER,
+                * where that page is in a different zone we will detect
+                * it from its zone id and abort this block scan.
+                */
+               zone_id = page_zone_id(page);
+               page_pfn = page_to_pfn(page);
+               pfn = page_pfn & ~((1 << order) - 1);
+               end_pfn = pfn + (1 << order);
+               for (; pfn < end_pfn; pfn++) {
+                       struct page *cursor_page;
+
+                       /* The target page is in the block, ignore it. */
+                       if (unlikely(pfn == page_pfn))
+                               continue;
+
+                       /* Avoid holes within the zone. */
+                       if (unlikely(!pfn_valid_within(pfn)))
+                               break;
+
+                       cursor_page = pfn_to_page(pfn);
+                       /* Check that we have not crossed a zone boundary. */
+                       if (unlikely(page_zone_id(cursor_page) != zone_id))
+                               continue;
+                       switch (__isolate_lru_page(cursor_page, mode)) {
+                       case 0:
+                               list_move(&cursor_page->lru, dst);
+                               nr_taken++;
+                               scan++;
+                               break;
+
+                       case -EBUSY:
+                               /* else it is being freed elsewhere */
+                               list_move(&cursor_page->lru, src);
+                       default:
+                               break;
+                       }
+               }
         }
  
         *scanned = scan;
         return nr_taken;
  }
  
+/*
+ * clear_active_flags() is a helper for shrink_inactive_list(), clearing
+ * the active bit on each page in the list; returns how many were active.
+ */
+static unsigned long clear_active_flags(struct list_head *page_list)
+{
+       int nr_active = 0;
+       struct page *page;
+
+       list_for_each_entry(page, page_list, lru)
+               if (PageActive(page)) {
+                       ClearPageActive(page);
+                       nr_active++;
+               }
+
+       return nr_active;
+}
+
  /*
   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
   * of reclaimed pages
@@ -675,11 +769,18 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                 unsigned long nr_taken;
                 unsigned long nr_scan;
                 unsigned long nr_freed;
+               unsigned long nr_active;
  
                 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-                                            &zone->inactive_list,
-                                            &page_list, &nr_scan);
-               zone->nr_inactive -= nr_taken;
+                            &zone->inactive_list,
+                            &page_list, &nr_scan, sc->order,
+                            (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
+                                            ISOLATE_BOTH : ISOLATE_INACTIVE);
+               nr_active = clear_active_flags(&page_list);
+
+               __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
+               __mod_zone_page_state(zone, NR_INACTIVE,
+                                               -(nr_taken - nr_active));
                 zone->pages_scanned += nr_scan;
                 spin_unlock_irq(&zone->lru_lock);
  
@@ -740,7 +841,8 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
  
  static inline int zone_is_near_oom(struct zone *zone)
  {
-       return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+       return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
+                               + zone_page_state(zone, NR_INACTIVE))*3;
  }
  
  /*
@@ -823,9 +925,9 @@ force_reclaim_mapped:
         lru_add_drain();
         spin_lock_irq(&zone->lru_lock);
         pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-                                   &l_hold, &pgscanned);
+                           &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
         zone->pages_scanned += pgscanned;
-       zone->nr_active -= pgmoved;
+       __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
         spin_unlock_irq(&zone->lru_lock);
  
         while (!list_empty(&l_hold)) {
@@ -857,7 +959,7 @@ force_reclaim_mapped:
                 list_move(&page->lru, &zone->inactive_list);
                 pgmoved++;
                 if (!pagevec_add(&pvec, page)) {
-                       zone->nr_inactive += pgmoved;
+                       __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
                         spin_unlock_irq(&zone->lru_lock);
                         pgdeactivate += pgmoved;
                         pgmoved = 0;
@@ -867,7 +969,7 @@ force_reclaim_mapped:
                         spin_lock_irq(&zone->lru_lock);
                 }
         }
-       zone->nr_inactive += pgmoved;
+       __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
         pgdeactivate += pgmoved;
         if (buffer_heads_over_limit) {
                 spin_unlock_irq(&zone->lru_lock);
@@ -885,14 +987,14 @@ force_reclaim_mapped:
                 list_move(&page->lru, &zone->active_list);
                 pgmoved++;
                 if (!pagevec_add(&pvec, page)) {
-                       zone->nr_active += pgmoved;
+                       __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
                         pgmoved = 0;
                         spin_unlock_irq(&zone->lru_lock);
                         __pagevec_release(&pvec);
                         spin_lock_irq(&zone->lru_lock);
                 }
         }
-       zone->nr_active += pgmoved;
+       __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
  
         __count_zone_vm_events(PGREFILL, zone, pgscanned);
         __count_vm_events(PGDEACTIVATE, pgdeactivate);
@@ -918,14 +1020,16 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
          * Add one to `nr_to_scan' just to make sure that the kernel will
          * slowly sift through the active list.
          */
-       zone->nr_scan_active += (zone->nr_active >> priority) + 1;
+       zone->nr_scan_active +=
+               (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
         nr_active = zone->nr_scan_active;
         if (nr_active >= sc->swap_cluster_max)
                 zone->nr_scan_active = 0;
         else
                 nr_active = 0;
  
-       zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
+       zone->nr_scan_inactive +=
+               (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
         nr_inactive = zone->nr_scan_inactive;
         if (nr_inactive >= sc->swap_cluster_max)
                 zone->nr_scan_inactive = 0;
@@ -949,7 +1053,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
                 }
         }
  
-       throttle_vm_writeout();
+       throttle_vm_writeout(sc->gfp_mask);
  
         atomic_dec(&zone->reclaim_in_progress);
         return nr_reclaimed;
@@ -1012,7 +1116,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
   * holds filesystem locks which prevent writeout this might not work, and the
   * allocation attempt will fail.
   */
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
  {
         int priority;
         int ret = 0;
@@ -1027,6 +1131,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                 .swap_cluster_max = SWAP_CLUSTER_MAX,
                 .may_swap = 1,
                 .swappiness = vm_swappiness,
+               .order = order,
         };
  
         count_vm_event(ALLOCSTALL);
@@ -1037,7 +1142,8 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                         continue;
  
-               lru_pages += zone->nr_active + zone->nr_inactive;
+               lru_pages += zone_page_state(zone, NR_ACTIVE)
+                               + zone_page_state(zone, NR_INACTIVE);
         }
  
         for (priority = DEF_PRIORITY; priority >= 0; priority--) {
@@ -1131,6 +1237,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                 .may_swap = 1,
                 .swap_cluster_max = SWAP_CLUSTER_MAX,
                 .swappiness = vm_swappiness,
+               .order = order,
         };
         /*
          * temp_priority is used to remember the scanning priority at which
@@ -1182,7 +1289,8 @@ loop_again:
                 for (i = 0; i <= end_zone; i++) {
                         struct zone *zone = pgdat->node_zones + i;
  
-                       lru_pages += zone->nr_active + zone->nr_inactive;
+                       lru_pages += zone_page_state(zone, NR_ACTIVE)
+                                       + zone_page_state(zone, NR_INACTIVE);
                 }
  
                 /*
@@ -1219,8 +1327,9 @@ loop_again:
                         if (zone->all_unreclaimable)
                                 continue;
                         if (nr_slab == 0 && zone->pages_scanned >=
-                                   (zone->nr_active + zone->nr_inactive) * 6)
-                               zone->all_unreclaimable = 1;
+                               (zone_page_state(zone, NR_ACTIVE)
+                               + zone_page_state(zone, NR_INACTIVE)) * 6)
+                                       zone->all_unreclaimable = 1;
                         /*
                          * If we've done a decent amount of scanning and
                          * the reclaim ratio is low, start doing writepage
@@ -1317,8 +1426,6 @@ static int kswapd(void *p)
         for ( ; ; ) {
                 unsigned long new_order;
  
-               try_to_freeze();
-
                 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
                 new_order = pgdat->kswapd_max_order;
                 pgdat->kswapd_max_order = 0;
@@ -1329,12 +1436,19 @@ static int kswapd(void *p)
                          */
                         order = new_order;
                 } else {
-                       schedule();
+                       if (!freezing(current))
+                               schedule();
+
                         order = pgdat->kswapd_max_order;
                 }
                 finish_wait(&pgdat->kswapd_wait, &wait);
  
-               balance_pgdat(pgdat, order);
+               if (!try_to_freeze()) {
+                       /* We can speed up thawing tasks if we don't call
+                        * balance_pgdat after returning from the refrigerator
+                        */
+                       balance_pgdat(pgdat, order);
+               }
         }
         return 0;
  }
@@ -1385,18 +1499,22 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
  
                 /* For pass = 0 we don't shrink the active list */
                 if (pass > 0) {
-                       zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+                       zone->nr_scan_active +=
+                               (zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
                         if (zone->nr_scan_active >= nr_pages || pass > 3) {
                                 zone->nr_scan_active = 0;
-                               nr_to_scan = min(nr_pages, zone->nr_active);
+                               nr_to_scan = min(nr_pages,
+                                       zone_page_state(zone, NR_ACTIVE));
                                 shrink_active_list(nr_to_scan, zone, sc, prio);
                         }
                 }
  
-               zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+               zone->nr_scan_inactive +=
+                       (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
                 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
                         zone->nr_scan_inactive = 0;
-                       nr_to_scan = min(nr_pages, zone->nr_inactive);
+                       nr_to_scan = min(nr_pages,
+                               zone_page_state(zone, NR_INACTIVE));
                         ret += shrink_inactive_list(nr_to_scan, zone, sc);
                         if (ret >= nr_pages)
                                 return ret;
@@ -1408,12 +1526,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
  
  static unsigned long count_lru_pages(void)
  {
-       struct zone *zone;
-       unsigned long ret = 0;
-
-       for_each_zone(zone)
-               ret += zone->nr_active + zone->nr_inactive;
-       return ret;
+       return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
  }
  
  /*
@@ -1522,7 +1635,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
         pg_data_t *pgdat;
         cpumask_t mask;
  
-       if (action == CPU_ONLINE) {
+       if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
                 for_each_online_pgdat(pgdat) {
                         mask = node_to_cpumask(pgdat->node_id);
                         if (any_online_cpu(mask) != NR_CPUS)