mm: remove unnecessary reclaimability check from NUMA balancing target

[karo-tx-linux.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 7bb23ff229b6677aa3afde74ac48641dfbad7ada..014d0d181be0b8ed4136d9dfcb72d23a0052f820 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -14,6 +14,7 @@
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  
  #include <linux/mm.h>
+#include <linux/sched/mm.h>
  #include <linux/module.h>
  #include <linux/gfp.h>
  #include <linux/kernel_stat.h>
@@ -87,6 +88,7 @@ struct scan_control {
         /* The highest zone to isolate pages for reclaim from */
         enum zone_type reclaim_idx;
  
+       /* Writepage batching in laptop mode; RECLAIM_WRITE */
         unsigned int may_writepage:1;
  
         /* Can mapped pages be reclaimed? */
@@ -1055,6 +1057,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                  *    throttling so we could easily OOM just because too many
                  *    pages are in writeback and there is nothing else to
                  *    reclaim. Wait for the writeback to complete.
+                *
+                * In cases 1) and 2) we activate the pages to get them out of
+                * the way while we continue scanning for clean pages on the
+                * inactive list and refilling from the active list. The
+                * observation here is that waiting for disk writes is more
+                * expensive than potentially causing reloads down the line.
+                * Since they're marked for immediate reclaim, they won't put
+                * memory pressure on the cache working set any longer than it
+                * takes to write them to disk.
                  */
                 if (PageWriteback(page)) {
                         /* Case 1 above */
@@ -1062,7 +1073,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                             PageReclaim(page) &&
                             test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
                                 nr_immediate++;
-                               goto keep_locked;
+                               goto activate_locked;
  
                         /* Case 2 above */
                         } else if (sane_reclaim(sc) ||
@@ -1080,7 +1091,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                  */
                                 SetPageReclaim(page);
                                 nr_writeback++;
-                               goto keep_locked;
+                               goto activate_locked;
  
                         /* Case 3 above */
                         } else {
@@ -1152,13 +1163,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  
                 if (PageDirty(page)) {
                         /*
-                        * Only kswapd can writeback filesystem pages to
-                        * avoid risk of stack overflow but only writeback
-                        * if many dirty pages have been encountered.
+                        * Only kswapd can writeback filesystem pages
+                        * to avoid risk of stack overflow. But avoid
+                        * injecting inefficient single-page IO into
+                        * flusher writeback as much as possible: only
+                        * write pages when we've encountered many
+                        * dirty pages, and when we've already scanned
+                        * the rest of the LRU for clean pages and see
+                        * the same dirty pages again (PageReclaim).
                          */
                         if (page_is_file_cache(page) &&
-                                       (!current_is_kswapd() ||
-                                        !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
+                           (!current_is_kswapd() || !PageReclaim(page) ||
+                            !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
                                 /*
                                  * Immediately reclaim when written back.
                                  * Similar in principle to deactivate_page()
@@ -1168,7 +1184,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
                                 SetPageReclaim(page);
  
-                               goto keep_locked;
+                               goto activate_locked;
                         }
  
                         if (references == PAGEREF_RECLAIM_CLEAN)
@@ -1373,13 +1389,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
          * wants to isolate pages it will be able to operate on without
          * blocking - clean pages for the most part.
          *
-        * ISOLATE_CLEAN means that only clean pages should be isolated. This
-        * is used by reclaim when it is cannot write to backing storage
-        *
          * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
          * that it is possible to migrate without blocking
          */
-       if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+       if (mode & ISOLATE_ASYNC_MIGRATE) {
                 /* All the caller can do on PageWriteback is block */
                 if (PageWriteback(page))
                         return ret;
@@ -1387,10 +1400,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
                 if (PageDirty(page)) {
                         struct address_space *mapping;
  
-                       /* ISOLATE_CLEAN means only clean pages */
-                       if (mode & ISOLATE_CLEAN)
-                               return ret;
-
                         /*
                          * Only pages without mappings or that have a
                          * ->migratepage callback are possible to migrate
@@ -1731,8 +1740,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
  
         if (!sc->may_unmap)
                 isolate_mode |= ISOLATE_UNMAPPED;
-       if (!sc->may_writepage)
-               isolate_mode |= ISOLATE_CLEAN;
  
         spin_lock_irq(&pgdat->lru_lock);
  
@@ -1806,12 +1813,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
  
                 /*
                  * If dirty pages are scanned that are not queued for IO, it
-                * implies that flushers are not keeping up. In this case, flag
-                * the pgdat PGDAT_DIRTY and kswapd will start writing pages from
-                * reclaim context.
+                * implies that flushers are not doing their job. This can
+                * happen when memory pressure pushes dirty pages to the end of
+                * the LRU before the dirty limits are breached and the dirty
+                * data has expired. It can also happen when the proportion of
+                * dirty pages grows not through writes but through memory
+                * pressure reclaiming all the clean cache. And in some cases,
+                * the flushers simply cannot keep up with the allocation
+                * rate. Nudge the flusher threads in case they are asleep, but
+                * also allow kswapd to start writing pages during reclaim.
                  */
-               if (stat.nr_unqueued_dirty == nr_taken)
+               if (stat.nr_unqueued_dirty == nr_taken) {
+                       wakeup_flusher_threads(0, WB_REASON_VMSCAN);
                         set_bit(PGDAT_DIRTY, &pgdat->flags);
+               }
  
                 /*
                  * If kswapd scans pages marked for immediate
@@ -1929,8 +1944,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
  
         if (!sc->may_unmap)
                 isolate_mode |= ISOLATE_UNMAPPED;
-       if (!sc->may_writepage)
-               isolate_mode |= ISOLATE_CLEAN;
  
         spin_lock_irq(&pgdat->lru_lock);
  
@@ -2607,6 +2620,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
         } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                          sc->nr_scanned - nr_scanned, sc));
  
+       /*
+        * Kswapd gives up on balancing particular nodes after too
+        * many failures to reclaim anything from them and goes to
+        * sleep. On reclaim progress, reset the failure counter. A
+        * successful direct reclaim run will revive a dormant kswapd.
+        */
+       if (reclaimable)
+               pgdat->kswapd_failures = 0;
+
         return reclaimable;
  }
  
@@ -2681,10 +2703,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                                  GFP_KERNEL | __GFP_HARDWALL))
                                 continue;
  
-                       if (sc->priority != DEF_PRIORITY &&
-                           !pgdat_reclaimable(zone->zone_pgdat))
-                               continue;       /* Let kswapd poll it */
-
                         /*
                          * If we already have plenty of memory free for
                          * compaction in this zone, don't free any more.
@@ -2759,8 +2777,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                           struct scan_control *sc)
  {
         int initial_priority = sc->priority;
-       unsigned long total_scanned = 0;
-       unsigned long writeback_threshold;
  retry:
         delayacct_freepages_start();
  
@@ -2773,7 +2789,6 @@ retry:
                 sc->nr_scanned = 0;
                 shrink_zones(zonelist, sc);
  
-               total_scanned += sc->nr_scanned;
                 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
                         break;
  
@@ -2786,20 +2801,6 @@ retry:
                  */
                 if (sc->priority < DEF_PRIORITY - 2)
                         sc->may_writepage = 1;
-
-               /*
-                * Try to write back as many pages as we just scanned.  This
-                * tends to cause slow streaming writers to write data to the
-                * disk smoothly, at the dirtying rate, which is nice.   But
-                * that's undesirable in laptop mode, where we *want* lumpy
-                * writeout.  So in laptop mode, write out the whole world.
-                */
-               writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
-               if (total_scanned > writeback_threshold) {
-                       wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
-                                               WB_REASON_TRY_TO_FREE_PAGES);
-                       sc->may_writepage = 1;
-               }
         } while (--sc->priority >= 0);
  
         delayacct_freepages_end();
@@ -2821,7 +2822,7 @@ retry:
         return 0;
  }
  
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
  {
         struct zone *zone;
         unsigned long pfmemalloc_reserve = 0;
@@ -2829,10 +2830,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
         int i;
         bool wmark_ok;
  
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
         for (i = 0; i <= ZONE_NORMAL; i++) {
                 zone = &pgdat->node_zones[i];
-               if (!managed_zone(zone) ||
-                   pgdat_reclaimable_pages(pgdat) == 0)
+               if (!managed_zone(zone))
+                       continue;
+
+               if (!zone_reclaimable_pages(zone))
                         continue;
  
                 pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2909,7 +2915,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
  
                 /* Throttle based on the first usable node */
                 pgdat = zone->zone_pgdat;
-               if (pfmemalloc_watermark_ok(pgdat))
+               if (allow_direct_reclaim(pgdat))
                         goto out;
                 break;
         }
@@ -2931,14 +2937,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
          */
         if (!(gfp_mask & __GFP_FS)) {
                 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-                       pfmemalloc_watermark_ok(pgdat), HZ);
+                       allow_direct_reclaim(pgdat), HZ);
  
                 goto check_pending;
         }
  
         /* Throttle until kswapd wakes the process */
         wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               pfmemalloc_watermark_ok(pgdat));
+               allow_direct_reclaim(pgdat));
  
  check_pending:
         if (fatal_signal_pending(current))
@@ -3101,6 +3107,7 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
          */
         clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
         clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
+       clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
  
         return true;
  }
@@ -3117,7 +3124,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  
         /*
          * The throttled processes are normally woken up in balance_pgdat() as
-        * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+        * soon as allow_direct_reclaim() is true. But there is a potential
          * race between when kswapd checks the watermarks and a process gets
          * throttled. There is also a potential race if processes get
          * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3131,6 +3138,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         if (waitqueue_active(&pgdat->pfmemalloc_wait))
                 wake_up_all(&pgdat->pfmemalloc_wait);
  
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
         for (i = 0; i <= classzone_idx; i++) {
                 struct zone *zone = pgdat->node_zones + i;
  
@@ -3217,9 +3228,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
         count_vm_event(PAGEOUTRUN);
  
         do {
+               unsigned long nr_reclaimed = sc.nr_reclaimed;
                 bool raise_priority = true;
  
-               sc.nr_reclaimed = 0;
                 sc.reclaim_idx = classzone_idx;
  
                 /*
@@ -3274,7 +3285,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * If we're getting trouble reclaiming, start doing writepage
                  * even in laptop mode.
                  */
-               if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
+               if (sc.priority < DEF_PRIORITY - 2)
                         sc.may_writepage = 1;
  
                 /* Call soft limit reclaim before calling shrink_node. */
@@ -3298,7 +3309,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * able to safely make forward progress. Wake them
                  */
                 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-                               pfmemalloc_watermark_ok(pgdat))
+                               allow_direct_reclaim(pgdat))
                         wake_up_all(&pgdat->pfmemalloc_wait);
  
                 /* Check if kswapd should be suspending */
@@ -3309,10 +3320,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * Raise priority if scanning rate is too low or there was no
                  * progress in reclaiming pages
                  */
-               if (raise_priority || !sc.nr_reclaimed)
+               nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+               if (raise_priority || !nr_reclaimed)
                         sc.priority--;
         } while (sc.priority >= 1);
  
+       if (!sc.nr_reclaimed)
+               pgdat->kswapd_failures++;
+
  out:
         /*
          * Return the order kswapd stopped reclaiming at as
@@ -3512,6 +3527,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
  
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return;
+
         /* Only wake kswapd if all zones are unbalanced */
         for (z = 0; z <= classzone_idx; z++) {
                 zone = pgdat->node_zones + z;
@@ -3782,9 +3801,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
             sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
                 return NODE_RECLAIM_FULL;
  
-       if (!pgdat_reclaimable(pgdat))
-               return NODE_RECLAIM_FULL;
-
         /*
          * Do not scan if the allocation should not be delayed.
          */