]> git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - mm/vmscan.c
per-zone and reclaim enhancements for memory controller: calculate active/inactive...
[karo-tx-linux.git] / mm / vmscan.c
index bbd194630c5b0c16dd550d21b81fb2b0865c3538..be4dfe87be03eef25206710409d9ae4e44143975 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -68,6 +69,22 @@ struct scan_control {
        int all_unreclaimable;
 
        int order;
+
+       /*
+        * Pages that have (or should have) IO pending.  If we run into
+        * a lot of these, we're better off waiting a little for IO to
+        * finish rather than scanning more pages in the VM.
+        */
+       int nr_io_pages;
+
+       /* Which cgroup do we reclaim from */
+       struct mem_cgroup *mem_cgroup;
+
+       /* Pluggable isolate pages callback */
+       unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
+                       unsigned long *scanned, int order, int mode,
+                       struct zone *z, struct mem_cgroup *mem_cont,
+                       int active);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -109,6 +126,12 @@ long vm_total_pages;       /* The total number of pages which the VM controls */
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
+#ifdef CONFIG_CGROUP_MEM_CONT
+#define scan_global_lru(sc)    (!(sc)->mem_cgroup)
+#else
+#define scan_global_lru(sc)    (1)
+#endif
+
 /*
  * Add a shrinker callback to be called from the vm
  */
@@ -141,7 +164,7 @@ EXPORT_SYMBOL(unregister_shrinker);
  * percentages of the lru and ageable caches.  This should balance the seeks
  * generated by these structures.
  *
- * If the vm encounted mapped pages on the LRU it increase the pressure on
+ * If the vm encountered mapped pages on the LRU it increases the pressure on
  * slab to avoid swapping.
  *
  * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
@@ -489,11 +512,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         */
                        if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
                                wait_on_page_writeback(page);
-                       else
+                       else {
+                               sc->nr_io_pages++;
                                goto keep_locked;
+                       }
                }
 
-               referenced = page_referenced(page, 1);
+               referenced = page_referenced(page, 1, sc->mem_cgroup);
                /* In active use or really unfreeable?  Activate it. */
                if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
                                        referenced && page_mapping_inuse(page))
@@ -529,8 +554,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (PageDirty(page)) {
                        if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
                                goto keep_locked;
-                       if (!may_enter_fs)
+                       if (!may_enter_fs) {
+                               sc->nr_io_pages++;
                                goto keep_locked;
+                       }
                        if (!sc->may_writepage)
                                goto keep_locked;
 
@@ -541,8 +568,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        case PAGE_ACTIVATE:
                                goto activate_locked;
                        case PAGE_SUCCESS:
-                               if (PageWriteback(page) || PageDirty(page))
+                               if (PageWriteback(page) || PageDirty(page)) {
+                                       sc->nr_io_pages++;
                                        goto keep;
+                               }
                                /*
                                 * A synchronous write - probably a ramdisk.  Go
                                 * ahead and try to reclaim the page.
@@ -626,7 +655,7 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-static int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode)
 {
        int ret = -EINVAL;
 
@@ -760,6 +789,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        return nr_taken;
 }
 
+static unsigned long isolate_pages_global(unsigned long nr,
+                                       struct list_head *dst,
+                                       unsigned long *scanned, int order,
+                                       int mode, struct zone *z,
+                                       struct mem_cgroup *mem_cont,
+                                       int active)
+{
+       if (active)
+               return isolate_lru_pages(nr, &z->active_list, dst,
+                                               scanned, order, mode);
+       else
+               return isolate_lru_pages(nr, &z->inactive_list, dst,
+                                               scanned, order, mode);
+}
+
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
@@ -801,11 +845,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                unsigned long nr_freed;
                unsigned long nr_active;
 
-               nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-                            &zone->inactive_list,
+               nr_taken = sc->isolate_pages(sc->swap_cluster_max,
                             &page_list, &nr_scan, sc->order,
                             (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
-                                            ISOLATE_BOTH : ISOLATE_INACTIVE);
+                                            ISOLATE_BOTH : ISOLATE_INACTIVE,
+                               zone, sc->mem_cgroup, 0);
                nr_active = clear_active_flags(&page_list);
                __count_vm_events(PGDEACTIVATE, nr_active);
 
@@ -1018,8 +1062,9 @@ force_reclaim_mapped:
 
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
-       pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-                           &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
+       pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+                                       ISOLATE_ACTIVE, zone,
+                                       sc->mem_cgroup, 1);
        zone->pages_scanned += pgscanned;
        __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
        spin_unlock_irq(&zone->lru_lock);
@@ -1031,7 +1076,7 @@ force_reclaim_mapped:
                if (page_mapped(page)) {
                        if (!reclaim_mapped ||
                            (total_swap_pages == 0 && PageAnon(page)) ||
-                           page_referenced(page, 0)) {
+                           page_referenced(page, 0, sc->mem_cgroup)) {
                                list_add(&page->lru, &l_active);
                                continue;
                        }
@@ -1051,6 +1096,7 @@ force_reclaim_mapped:
                ClearPageActive(page);
 
                list_move(&page->lru, &zone->inactive_list);
+               mem_cgroup_move_lists(page_get_page_cgroup(page), false);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1079,6 +1125,7 @@ force_reclaim_mapped:
                SetPageLRU(page);
                VM_BUG_ON(!PageActive(page));
                list_move(&page->lru, &zone->active_list);
+               mem_cgroup_move_lists(page_get_page_cgroup(page), true);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1108,8 +1155,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
        unsigned long nr_to_scan;
        unsigned long nr_reclaimed = 0;
 
-       atomic_inc(&zone->reclaim_in_progress);
-
        /*
         * Add one to `nr_to_scan' just to make sure that the kernel will
         * slowly sift through the active list.
@@ -1148,8 +1193,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
        }
 
        throttle_vm_writeout(sc->gfp_mask);
-
-       atomic_dec(&zone->reclaim_in_progress);
        return nr_reclaimed;
 }
 
@@ -1187,7 +1230,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 
                note_zone_scanning_priority(zone, priority);
 
-               if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+               if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
                        continue;       /* Let kswapd poll it */
 
                sc->all_unreclaimable = 0;
@@ -1210,7 +1253,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
+                                         struct scan_control *sc)
 {
        int priority;
        int ret = 0;
@@ -1219,14 +1263,6 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long lru_pages = 0;
        int i;
-       struct scan_control sc = {
-               .gfp_mask = gfp_mask,
-               .may_writepage = !laptop_mode,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
-               .may_swap = 1,
-               .swappiness = vm_swappiness,
-               .order = order,
-       };
 
        count_vm_event(ALLOCSTALL);
 
@@ -1241,17 +1277,24 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
        }
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-               sc.nr_scanned = 0;
+               sc->nr_scanned = 0;
+               sc->nr_io_pages = 0;
                if (!priority)
                        disable_swap_token();
-               nr_reclaimed += shrink_zones(priority, zones, &sc);
-               shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
-               if (reclaim_state) {
-                       nr_reclaimed += reclaim_state->reclaimed_slab;
-                       reclaim_state->reclaimed_slab = 0;
+               nr_reclaimed += shrink_zones(priority, zones, sc);
+               /*
+                * Don't shrink slabs when reclaiming memory from
+                * over limit cgroups
+                */
+               if (scan_global_lru(sc)) {
+                       shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
+                       if (reclaim_state) {
+                               nr_reclaimed += reclaim_state->reclaimed_slab;
+                               reclaim_state->reclaimed_slab = 0;
+                       }
                }
-               total_scanned += sc.nr_scanned;
-               if (nr_reclaimed >= sc.swap_cluster_max) {
+               total_scanned += sc->nr_scanned;
+               if (nr_reclaimed >= sc->swap_cluster_max) {
                        ret = 1;
                        goto out;
                }
@@ -1263,18 +1306,19 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
                 * that's undesirable in laptop mode, where we *want* lumpy
                 * writeout.  So in laptop mode, write out the whole world.
                 */
-               if (total_scanned > sc.swap_cluster_max +
-                                       sc.swap_cluster_max / 2) {
+               if (total_scanned > sc->swap_cluster_max +
+                                       sc->swap_cluster_max / 2) {
                        wakeup_pdflush(laptop_mode ? 0 : total_scanned);
-                       sc.may_writepage = 1;
+                       sc->may_writepage = 1;
                }
 
                /* Take a nap, wait for some writeback to complete */
-               if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+               if (sc->nr_scanned && priority < DEF_PRIORITY - 2 &&
+                               sc->nr_io_pages > sc->swap_cluster_max)
                        congestion_wait(WRITE, HZ/10);
        }
        /* top priority shrink_caches still had more to do? don't OOM, then */
-       if (!sc.all_unreclaimable)
+       if (!sc->all_unreclaimable && scan_global_lru(sc))
                ret = 1;
 out:
        /*
@@ -1286,7 +1330,7 @@ out:
         */
        if (priority < 0)
                priority = 0;
-       for (i = 0; zones[i] != 0; i++) {
+       for (i = 0; zones[i] != NULL; i++) {
                struct zone *zone = zones[i];
 
                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1297,6 +1341,47 @@ out:
        return ret;
 }
 
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+{
+       struct scan_control sc = {
+               .gfp_mask = gfp_mask,
+               .may_writepage = !laptop_mode,
+               .swap_cluster_max = SWAP_CLUSTER_MAX,
+               .may_swap = 1,
+               .swappiness = vm_swappiness,
+               .order = order,
+               .mem_cgroup = NULL,
+               .isolate_pages = isolate_pages_global,
+       };
+
+       return do_try_to_free_pages(zones, gfp_mask, &sc);
+}
+
+#ifdef CONFIG_CGROUP_MEM_CONT
+
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
+                                               gfp_t gfp_mask)
+{
+       struct scan_control sc = {
+               .gfp_mask = gfp_mask,
+               .may_writepage = !laptop_mode,
+               .may_swap = 1,
+               .swap_cluster_max = SWAP_CLUSTER_MAX,
+               .swappiness = vm_swappiness,
+               .order = 0,
+               .mem_cgroup = mem_cont,
+               .isolate_pages = mem_cgroup_isolate_pages,
+       };
+       struct zone **zones;
+       int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
+
+       zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones;
+       if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
+               return 1;
+       return 0;
+}
+#endif
+
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1332,6 +1417,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                .swap_cluster_max = SWAP_CLUSTER_MAX,
                .swappiness = vm_swappiness,
                .order = order,
+               .mem_cgroup = NULL,
+               .isolate_pages = isolate_pages_global,
        };
        /*
         * temp_priority is used to remember the scanning priority at which
@@ -1356,6 +1443,7 @@ loop_again:
                if (!priority)
                        disable_swap_token();
 
+               sc.nr_io_pages = 0;
                all_zones_ok = 1;
 
                /*
@@ -1368,7 +1456,8 @@ loop_again:
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+                       if (zone_is_all_unreclaimable(zone) &&
+                           priority != DEF_PRIORITY)
                                continue;
 
                        if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1403,7 +1492,8 @@ loop_again:
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+                       if (zone_is_all_unreclaimable(zone) &&
+                                       priority != DEF_PRIORITY)
                                continue;
 
                        if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1424,12 +1514,13 @@ loop_again:
                                                lru_pages);
                        nr_reclaimed += reclaim_state->reclaimed_slab;
                        total_scanned += sc.nr_scanned;
-                       if (zone->all_unreclaimable)
+                       if (zone_is_all_unreclaimable(zone))
                                continue;
                        if (nr_slab == 0 && zone->pages_scanned >=
                                (zone_page_state(zone, NR_ACTIVE)
                                + zone_page_state(zone, NR_INACTIVE)) * 6)
-                                       zone->all_unreclaimable = 1;
+                                       zone_set_flag(zone,
+                                                     ZONE_ALL_UNRECLAIMABLE);
                        /*
                         * If we've done a decent amount of scanning and
                         * the reclaim ratio is low, start doing writepage
@@ -1445,7 +1536,8 @@ loop_again:
                 * OK, kswapd is getting into trouble.  Take a nap, then take
                 * another pass across the zones.
                 */
-               if (total_scanned && priority < DEF_PRIORITY - 2)
+               if (total_scanned && priority < DEF_PRIORITY - 2 &&
+                                       sc.nr_io_pages > sc.swap_cluster_max)
                        congestion_wait(WRITE, HZ/10);
 
                /*
@@ -1595,7 +1687,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
                if (!populated_zone(zone))
                        continue;
 
-               if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+               if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
                        continue;
 
                /* For pass = 0 we don't shrink the active list */
@@ -1650,6 +1742,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
                .swap_cluster_max = nr_pages,
                .may_writepage = 1,
                .swappiness = vm_swappiness,
+               .isolate_pages = isolate_pages_global,
        };
 
        current->reclaim_state = &reclaim_state;
@@ -1835,6 +1928,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                                        SWAP_CLUSTER_MAX),
                .gfp_mask = gfp_mask,
                .swappiness = vm_swappiness,
+               .isolate_pages = isolate_pages_global,
        };
        unsigned long slab_reclaimable;
 
@@ -1897,6 +1991,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
        int node_id;
+       int ret;
 
        /*
         * Zone reclaim reclaims unmapped file backed pages and
@@ -1914,15 +2009,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                        <= zone->min_slab_pages)
                return 0;
 
+       if (zone_is_all_unreclaimable(zone))
+               return 0;
+
        /*
-        * Avoid concurrent zone reclaims, do not reclaim in a zone that does
-        * not have reclaimable pages and if we should not delay the allocation
-        * then do not scan.
+        * Do not scan if the allocation should not be delayed.
         */
-       if (!(gfp_mask & __GFP_WAIT) ||
-               zone->all_unreclaimable ||
-               atomic_read(&zone->reclaim_in_progress) > 0 ||
-               (current->flags & PF_MEMALLOC))
+       if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
                        return 0;
 
        /*
@@ -1934,6 +2027,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        node_id = zone_to_nid(zone);
        if (node_state(node_id, N_CPU) && node_id != numa_node_id())
                return 0;
-       return __zone_reclaim(zone, gfp_mask, order);
+
+       if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
+               return 0;
+       ret = __zone_reclaim(zone, gfp_mask, order);
+       zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+
+       return ret;
 }
 #endif