X-Git-Url: https://git.karo-electronics.de/?a=blobdiff_plain;f=mm%2Fvmscan.c;h=014d0d181be0b8ed4136d9dfcb72d23a0052f820;hb=15038d0de9eca33f66bc1fed4243914906e04de4;hp=7bb23ff229b6677aa3afde74ac48641dfbad7ada;hpb=df9cdc1727ed9debfce59c5f600d794a63fcbfeb;p=karo-tx-linux.git diff --git a/mm/vmscan.c b/mm/vmscan.c index 7bb23ff229b6..014d0d181be0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -87,6 +88,7 @@ struct scan_control { /* The highest zone to isolate pages for reclaim from */ enum zone_type reclaim_idx; + /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; /* Can mapped pages be reclaimed? */ @@ -1055,6 +1057,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. + * + * In cases 1) and 2) we activate the pages to get them out of + * the way while we continue scanning for clean pages on the + * inactive list and refilling from the active list. The + * observation here is that waiting for disk writes is more + * expensive than potentially causing reloads down the line. + * Since they're marked for immediate reclaim, they won't put + * memory pressure on the cache working set any longer than it + * takes to write them to disk. */ if (PageWriteback(page)) { /* Case 1 above */ @@ -1062,7 +1073,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; - goto keep_locked; + goto activate_locked; /* Case 2 above */ } else if (sane_reclaim(sc) || @@ -1080,7 +1091,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; - goto keep_locked; + goto activate_locked; /* Case 3 above */ } else { @@ -1152,13 +1163,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { /* - * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but only writeback - * if many dirty pages have been encountered. + * Only kswapd can writeback filesystem pages + * to avoid risk of stack overflow. But avoid + * injecting inefficient single-page IO into + * flusher writeback as much as possible: only + * write pages when we've encountered many + * dirty pages, and when we've already scanned + * the rest of the LRU for clean pages and see + * the same dirty pages again (PageReclaim). */ if (page_is_file_cache(page) && - (!current_is_kswapd() || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + (!current_is_kswapd() || !PageReclaim(page) || + !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -1168,7 +1184,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); - goto keep_locked; + goto activate_locked; } if (references == PAGEREF_RECLAIM_CLEAN) @@ -1373,13 +1389,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * wants to isolate pages it will be able to operate on without * blocking - clean pages for the most part. * - * ISOLATE_CLEAN means that only clean pages should be isolated. 
This - * is used by reclaim when it is cannot write to backing storage - * * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages * that it is possible to migrate without blocking */ - if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + if (mode & ISOLATE_ASYNC_MIGRATE) { /* All the caller can do on PageWriteback is block */ if (PageWriteback(page)) return ret; @@ -1387,10 +1400,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageDirty(page)) { struct address_space *mapping; - /* ISOLATE_CLEAN means only clean pages */ - if (mode & ISOLATE_CLEAN) - return ret; - /* * Only pages without mappings or that have a * ->migratepage callback are possible to migrate @@ -1731,8 +1740,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); @@ -1806,12 +1813,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, /* * If dirty pages are scanned that are not queued for IO, it - * implies that flushers are not keeping up. In this case, flag - * the pgdat PGDAT_DIRTY and kswapd will start writing pages from - * reclaim context. + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty pages to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty pages grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep, but + * also allow kswapd to start writing pages during reclaim. */ - if (stat.nr_unqueued_dirty == nr_taken) + if (stat.nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(0, WB_REASON_VMSCAN); set_bit(PGDAT_DIRTY, &pgdat->flags); + } /* * If kswapd scans pages marked marked for immediate @@ -1929,8 +1944,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); @@ -2607,6 +2620,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2681,10 +2703,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. 
@@ -2759,8 +2777,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; - unsigned long total_scanned = 0; - unsigned long writeback_threshold; retry: delayacct_freepages_start(); @@ -2773,7 +2789,6 @@ retry: sc->nr_scanned = 0; shrink_zones(zonelist, sc); - total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) break; @@ -2786,20 +2801,6 @@ retry: */ if (sc->priority < DEF_PRIORITY - 2) sc->may_writepage = 1; - - /* - * Try to write back as many pages as we just scanned. This - * tends to cause slow streaming writers to write data to the - * disk smoothly, at the dirtying rate, which is nice. But - * that's undesirable in laptop mode, where we *want* lumpy - * writeout. So in laptop mode, write out the whole world. - */ - writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; - if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, - WB_REASON_TRY_TO_FREE_PAGES); - sc->may_writepage = 1; - } } while (--sc->priority >= 0); delayacct_freepages_end(); @@ -2821,7 +2822,7 @@ retry: return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2829,10 +2830,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!managed_zone(zone) || - pgdat_reclaimable_pages(pgdat) == 0) + if (!managed_zone(zone)) + continue; + + if (!zone_reclaimable_pages(zone)) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2909,7 +2915,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2931,14 +2937,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -3101,6 +3107,7 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) */ clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); return true; } @@ -3117,7 +3124,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. 
There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3131,6 +3138,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; @@ -3217,9 +3228,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = classzone_idx; /* @@ -3274,7 +3285,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) + if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ @@ -3298,7 +3309,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ @@ -3309,10 +3320,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: /* * Return the order kswapd stopped reclaiming at as @@ -3512,6 +3527,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!waitqueue_active(&pgdat->kswapd_wait)) return; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; + /* Only wake kswapd if all zones are unbalanced */ for (z = 0; z <= classzone_idx; z++) { zone = pgdat->node_zones + z; @@ -3782,9 +3801,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. */
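
Note on the dirty/writeback handling above: the hunks in shrink_page_list() and shrink_inactive_list() change reclaim to rotate dirty and writeback pages to the active list (goto activate_locked) instead of leaving them on the inactive list, to stop issuing single-page writeback unless kswapd has already seen the page once (PageReclaim) on a node flagged PGDAT_DIRTY, and to wake the flusher threads when an entire isolated batch turns out to be dirty pages with no IO queued. Below is a minimal, hypothetical userspace model of those two decisions; the struct and helper names are invented for illustration and only the conditions are taken from the diff.

/*
 * Hypothetical model of two reclaim decisions changed above -- this is NOT
 * kernel code.  The types and helper names are invented; only the logic of
 * the conditions mirrors the diff.
 */
#include <stdbool.h>

struct page_model {
	bool file_cache;	/* page_is_file_cache() */
	bool dirty;		/* PageDirty() */
	bool seen_before;	/* PageReclaim(): already cycled past reclaim once */
};

struct pgdat_model {
	bool dirty_flagged;	/* PGDAT_DIRTY set on the node */
};

/*
 * After the patch, a dirty file page is only written back directly from
 * reclaim when kswapd encounters it for the second time (PageReclaim) on a
 * node already flagged PGDAT_DIRTY; in every other case the page is rotated
 * to the active list (goto activate_locked) and left to the flushers.
 */
bool reclaim_may_write_page(const struct page_model *page,
			    const struct pgdat_model *node,
			    bool current_is_kswapd)
{
	if (!page->file_cache || !page->dirty)
		return true;	/* the restriction only covers dirty file cache */
	return current_is_kswapd && page->seen_before && node->dirty_flagged;
}

/*
 * After the patch, shrink_inactive_list() wakes the flusher threads
 * explicitly (wakeup_flusher_threads(0, WB_REASON_VMSCAN)) when every page
 * it isolated was dirty but not yet queued for IO, and sets PGDAT_DIRTY so
 * kswapd may start writing pages on later passes.
 */
bool should_wake_flushers(unsigned long nr_unqueued_dirty, unsigned long nr_taken)
{
	return nr_taken && nr_unqueued_dirty == nr_taken;
}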
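
The kswapd_failures changes above (balance_pgdat(), shrink_node(), wakeup_kswapd(), prepare_kswapd_sleep() and allow_direct_reclaim()) add a per-node give-up counter: a full priority sweep that reclaims nothing bumps it, any reclaim progress resets it, and once it reaches MAX_RECLAIM_RETRIES the node is treated as hopeless -- kswapd is no longer woken for it and direct reclaimers are no longer throttled on it. The following is a small, self-contained model of that counter; the harness is hypothetical and MAX_RECLAIM_RETRIES is assumed to be 16 as defined in mm/internal.h.

/*
 * Hypothetical userspace model of the kswapd_failures counter -- NOT kernel
 * code.  Only the counter semantics come from the diff; the value of
 * MAX_RECLAIM_RETRIES (16) is assumed from mm/internal.h.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16

struct node_model {
	int kswapd_failures;	/* consecutive balance runs that reclaimed nothing */
};

/* Models one balance_pgdat()/shrink_node() cycle. */
void account_balance_run(struct node_model *node, unsigned long nr_reclaimed)
{
	if (nr_reclaimed)
		node->kswapd_failures = 0;	/* any progress revives the node */
	else
		node->kswapd_failures++;	/* another fruitless run */
}

/* Models the new early returns in wakeup_kswapd()/prepare_kswapd_sleep(). */
bool node_is_hopeless(const struct node_model *node)
{
	return node->kswapd_failures >= MAX_RECLAIM_RETRIES;
}

int main(void)
{
	struct node_model node = { 0 };
	int i;

	/* Enough fruitless runs and kswapd leaves the node to direct reclaim. */
	for (i = 0; i < MAX_RECLAIM_RETRIES; i++)
		account_balance_run(&node, 0);
	printf("hopeless after %d failures: %s\n",
	       node.kswapd_failures, node_is_hopeless(&node) ? "yes" : "no");

	/* A later successful direct reclaim run resets the counter. */
	account_balance_run(&node, 32);
	printf("after progress: %s\n", node_is_hopeless(&node) ? "yes" : "no");
	return 0;
}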