git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - mm/vmscan.c
mm: don't avoid high-priority reclaim on memcg limit reclaim
[karo-tx-linux.git] / mm / vmscan.c
index bc8031ef994d57a1d1622468f8df6d745853562b..9117ae8d49eed25939fedcddfcc11c278bca6bd4 100644 (file)
@@ -2123,30 +2123,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
        unsigned long anon_prio, file_prio;
        enum scan_balance scan_balance;
        unsigned long anon, file;
-       bool force_scan = false;
        unsigned long ap, fp;
        enum lru_list lru;
-       bool some_scanned;
-       int pass;
-
-       /*
-        * If the zone or memcg is small, nr[l] can be 0.  This
-        * results in no scanning on this priority and a potential
-        * priority drop.  Global direct reclaim can go to the next
-        * zone and tends to have no problems. Global kswapd is for
-        * zone balancing and it needs to scan a minimum amount. When
-        * reclaiming for a memcg, a priority drop can cause high
-        * latencies, so it's better to scan a minimum amount there as
-        * well.
-        */
-       if (current_is_kswapd()) {
-               if (!pgdat_reclaimable(pgdat))
-                       force_scan = true;
-               if (!mem_cgroup_online(memcg))
-                       force_scan = true;
-       }
-       if (!global_reclaim(sc))
-               force_scan = true;
 
        /* If we have no swap space, do not bother scanning anon pages. */
        if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
@@ -2277,55 +2255,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
        fraction[1] = fp;
        denominator = ap + fp + 1;
 out:
-       some_scanned = false;
-       /* Only use force_scan on second pass. */
-       for (pass = 0; !some_scanned && pass < 2; pass++) {
-               *lru_pages = 0;
-               for_each_evictable_lru(lru) {
-                       int file = is_file_lru(lru);
-                       unsigned long size;
-                       unsigned long scan;
-
-                       size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
-                       scan = size >> sc->priority;
-
-                       if (!scan && pass && force_scan)
-                               scan = min(size, SWAP_CLUSTER_MAX);
-
-                       switch (scan_balance) {
-                       case SCAN_EQUAL:
-                               /* Scan lists relative to size */
-                               break;
-                       case SCAN_FRACT:
-                               /*
-                                * Scan types proportional to swappiness and
-                                * their relative recent reclaim efficiency.
-                                */
-                               scan = div64_u64(scan * fraction[file],
-                                                       denominator);
-                               break;
-                       case SCAN_FILE:
-                       case SCAN_ANON:
-                               /* Scan one type exclusively */
-                               if ((scan_balance == SCAN_FILE) != file) {
-                                       size = 0;
-                                       scan = 0;
-                               }
-                               break;
-                       default:
-                               /* Look ma, no brain */
-                               BUG();
-                       }
+       *lru_pages = 0;
+       for_each_evictable_lru(lru) {
+               int file = is_file_lru(lru);
+               unsigned long size;
+               unsigned long scan;
 
-                       *lru_pages += size;
-                       nr[lru] = scan;
+               size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+               scan = size >> sc->priority;
+               /*
+                * If the cgroup's already been deleted, make sure to
+                * scrape out the remaining cache.
+                */
+               if (!scan && !mem_cgroup_online(memcg))
+                       scan = min(size, SWAP_CLUSTER_MAX);
 
+               switch (scan_balance) {
+               case SCAN_EQUAL:
+                       /* Scan lists relative to size */
+                       break;
+               case SCAN_FRACT:
                        /*
-                        * Skip the second pass and don't force_scan,
-                        * if we found something to scan.
+                        * Scan types proportional to swappiness and
+                        * their relative recent reclaim efficiency.
                         */
-                       some_scanned |= !!scan;
+                       scan = div64_u64(scan * fraction[file],
+                                        denominator);
+                       break;
+               case SCAN_FILE:
+               case SCAN_ANON:
+                       /* Scan one type exclusively */
+                       if ((scan_balance == SCAN_FILE) != file) {
+                               size = 0;
+                               scan = 0;
+                       }
+                       break;
+               default:
+                       /* Look ma, no brain */
+                       BUG();
                }
+
+               *lru_pages += size;
+               nr[lru] = scan;
        }
 }
 
@@ -2620,6 +2591,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
        } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc));
 
+       /*
+        * Kswapd gives up on balancing particular nodes after too
+        * many failures to reclaim anything from them and goes to
+        * sleep. On reclaim progress, reset the failure counter. A
+        * successful direct reclaim run will revive a dormant kswapd.
+        */
+       if (reclaimable)
+               pgdat->kswapd_failures = 0;
+
        return reclaimable;
 }
 
@@ -2694,10 +2674,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                                 GFP_KERNEL | __GFP_HARDWALL))
                                continue;
 
-                       if (sc->priority != DEF_PRIORITY &&
-                           !pgdat_reclaimable(zone->zone_pgdat))
-                               continue;       /* Let kswapd poll it */
-
                        /*
                         * If we already have plenty of memory free for
                         * compaction in this zone, don't free any more.
@@ -2817,7 +2793,7 @@ retry:
        return 0;
 }
 
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
 {
        struct zone *zone;
        unsigned long pfmemalloc_reserve = 0;
@@ -2825,10 +2801,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
        int i;
        bool wmark_ok;
 
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
-               if (!managed_zone(zone) ||
-                   pgdat_reclaimable_pages(pgdat) == 0)
+               if (!managed_zone(zone))
+                       continue;
+
+               if (!zone_reclaimable_pages(zone))
                        continue;
 
                pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2905,7 +2886,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 
                /* Throttle based on the first usable node */
                pgdat = zone->zone_pgdat;
-               if (pfmemalloc_watermark_ok(pgdat))
+               if (allow_direct_reclaim(pgdat))
                        goto out;
                break;
        }
@@ -2927,14 +2908,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         */
        if (!(gfp_mask & __GFP_FS)) {
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-                       pfmemalloc_watermark_ok(pgdat), HZ);
+                       allow_direct_reclaim(pgdat), HZ);
 
                goto check_pending;
        }
 
        /* Throttle until kswapd wakes the process */
        wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               pfmemalloc_watermark_ok(pgdat));
+               allow_direct_reclaim(pgdat));
 
 check_pending:
        if (fatal_signal_pending(current))
@@ -3114,7 +3095,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 
        /*
         * The throttled processes are normally woken up in balance_pgdat() as
-        * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+        * soon as allow_direct_reclaim() is true. But there is a potential
         * race between when kswapd checks the watermarks and a process gets
         * throttled. There is also a potential race if processes get
         * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3128,6 +3109,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
        if (waitqueue_active(&pgdat->pfmemalloc_wait))
                wake_up_all(&pgdat->pfmemalloc_wait);
 
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
        for (i = 0; i <= classzone_idx; i++) {
                struct zone *zone = pgdat->node_zones + i;
 
@@ -3214,9 +3199,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
        count_vm_event(PAGEOUTRUN);
 
        do {
+               unsigned long nr_reclaimed = sc.nr_reclaimed;
                bool raise_priority = true;
 
-               sc.nr_reclaimed = 0;
                sc.reclaim_idx = classzone_idx;
 
                /*
@@ -3271,7 +3256,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * If we're getting trouble reclaiming, start doing writepage
                 * even in laptop mode.
                 */
-               if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
+               if (sc.priority < DEF_PRIORITY - 2)
                        sc.may_writepage = 1;
 
                /* Call soft limit reclaim before calling shrink_node. */
@@ -3295,7 +3280,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * able to safely make forward progress. Wake them
                 */
                if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-                               pfmemalloc_watermark_ok(pgdat))
+                               allow_direct_reclaim(pgdat))
                        wake_up_all(&pgdat->pfmemalloc_wait);
 
                /* Check if kswapd should be suspending */
@@ -3306,10 +3291,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * Raise priority if scanning rate is too low or there was no
                 * progress in reclaiming pages
                 */
-               if (raise_priority || !sc.nr_reclaimed)
+               nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+               if (raise_priority || !nr_reclaimed)
                        sc.priority--;
        } while (sc.priority >= 1);
 
+       if (!sc.nr_reclaimed)
+               pgdat->kswapd_failures++;
+
 out:
        /*
         * Return the order kswapd stopped reclaiming at as
@@ -3509,6 +3498,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
 
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return;
+
        /* Only wake kswapd if all zones are unbalanced */
        for (z = 0; z <= classzone_idx; z++) {
                zone = pgdat->node_zones + z;
@@ -3779,9 +3772,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
            sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
                return NODE_RECLAIM_FULL;
 
-       if (!pgdat_reclaimable(pgdat))
-               return NODE_RECLAIM_FULL;
-
        /*
         * Do not scan if the allocation should not be delayed.
         */