/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting.
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
	count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif
36 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
38 #define CREATE_TRACE_POINTS
39 #include <trace/events/compaction.h>
41 static unsigned long release_freepages(struct list_head *freelist)
43 struct page *page, *next;
44 unsigned long count = 0;
46 list_for_each_entry_safe(page, next, freelist, lru) {
55 static void map_pages(struct list_head *list)
59 list_for_each_entry(page, list, lru) {
60 arch_alloc_page(page, 0);
61 kernel_map_pages(page, 1, 1);
65 static inline bool migrate_async_suitable(int migratetype)
67 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration or free compaction scanner. The scanners then need to
 * use only pfn_valid_within() check for arches that allow holes within
 * pageblocks.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zone's range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
 */
static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
				unsigned long end_pfn, struct zone *zone)
{
	struct page *start_page;
	struct page *end_page;

	/* end_pfn is one past the range we are checking */
	end_pfn--;
	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
		return NULL;

	start_page = pfn_to_page(start_pfn);
	if (page_zone(start_page) != zone)
		return NULL;

	end_page = pfn_to_page(end_pfn);

	/* This gives a shorter code than deriving page_zone(end_page) */
	if (page_zone_id(start_page) != page_zone_id(end_page))
		return NULL;

	return start_page;
}
#ifdef CONFIG_COMPACTION
/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}
/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
131 unsigned long start_pfn = zone->zone_start_pfn;
132 unsigned long end_pfn = zone_end_pfn(zone);
135 zone->compact_cached_migrate_pfn[0] = start_pfn;
136 zone->compact_cached_migrate_pfn[1] = start_pfn;
137 zone->compact_cached_free_pfn = end_pfn;
138 zone->compact_blockskip_flush = false;
140 /* Walk the zone and mark every pageblock as suitable for isolation */
141 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
149 page = pfn_to_page(pfn);
150 if (zone != page_zone(page))
153 clear_pageblock_skip(page);
157 void reset_isolation_suitable(pg_data_t *pgdat)
161 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
162 struct zone *zone = &pgdat->node_zones[zoneid];
163 if (!populated_zone(zone))
166 /* Only flush if a full compaction finished recently */
167 if (zone->compact_blockskip_flush)
168 __reset_isolation_suitable(zone);
/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
176 static void update_pageblock_skip(struct compact_control *cc,
177 struct page *page, unsigned long nr_isolated,
178 bool migrate_scanner)
180 struct zone *zone = cc->zone;
	if (cc->ignore_skip_hint || !page || nr_isolated)
		return;

	set_pageblock_skip(page);
194 pfn = page_to_pfn(page);
196 /* Update where async and sync compaction should restart */
197 if (migrate_scanner) {
198 if (cc->finished_update_migrate)
200 if (pfn > zone->compact_cached_migrate_pfn[0])
201 zone->compact_cached_migrate_pfn[0] = pfn;
202 if (cc->mode != MIGRATE_ASYNC &&
203 pfn > zone->compact_cached_migrate_pfn[1])
			zone->compact_cached_migrate_pfn[1] = pfn;
	} else {
		if (cc->finished_update_free)
			return;
		if (pfn < zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = pfn;
	}
}
213 static inline bool isolation_suitable(struct compact_control *cc,
219 static void update_pageblock_skip(struct compact_control *cc,
220 struct page *page, unsigned long nr_isolated,
221 bool migrate_scanner)
224 #endif /* CONFIG_COMPACTION */
static inline bool should_release_lock(spinlock_t *lock)
{
	return need_resched() || spin_is_contended(lock);
}
/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. Check if the process needs to be scheduled or
 * if the lock is contended. For async compaction, back out if contention
 * is severe. For sync compaction, schedule.
 *
 * Returns true if the lock is held.
 * Returns false if the lock is released and compaction should abort.
 */
static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
				      bool locked, struct compact_control *cc)
{
	if (should_release_lock(lock)) {
		if (locked) {
			spin_unlock_irqrestore(lock, *flags);
			locked = false;
		}

		/* async aborts if taking too long or contended */
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return false;
		}

		cond_resched();
	}

	if (!locked)
		spin_lock_irqsave(lock, *flags);
	return true;
}
264 * Aside from avoiding lock contention, compaction also periodically checks
265 * need_resched() and either schedules in sync compaction or aborts async
266 * compaction. This is similar to what compact_checklock_irqsave() does, but
267 * is used where no lock is concerned.
269 * Returns false when no scheduling was needed, or sync compaction scheduled.
270 * Returns true when async compaction should abort.
static inline bool compact_should_abort(struct compact_control *cc)
{
	/* async compaction aborts if contended */
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}

		cond_resched();
	}

	return false;
}
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
	/* If the page is a large free page, then disallow migration */
	if (PageBuddy(page) && page_order(page) >= pageblock_order)
		return false;

	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
	if (migrate_async_suitable(get_pageblock_migratetype(page)))
		return true;

	/* Otherwise skip the block */
	return false;
}
/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long blockpfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				bool strict)
{
	int nr_scanned = 0, total_isolated = 0;
	struct page *cursor, *valid_page = NULL;
	unsigned long flags;
	bool locked = false;
318 cursor = pfn_to_page(blockpfn);
320 /* Isolate free pages. */
321 for (; blockpfn < end_pfn; blockpfn++, cursor++) {
323 struct page *page = cursor;
326 if (!pfn_valid_within(blockpfn))
331 if (!PageBuddy(page))
		/*
		 * The zone lock must be held to isolate freepages.
		 * Unfortunately this is a very coarse lock and can be
		 * heavily contended if there are parallel allocations
		 * or parallel compactions. For async compaction do not
		 * spin on the lock and we acquire the lock as late as
		 * possible.
		 */
		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
								locked, cc);
		if (!locked)
			break;
347 /* Recheck this is a buddy page under lock */
348 if (!PageBuddy(page))
351 /* Found a free page, break it into order-0 pages */
352 isolated = split_free_page(page);
353 total_isolated += isolated;
354 for (i = 0; i < isolated; i++) {
355 list_add(&page->lru, freelist);
359 /* If a page was split, advance to the end of it */
361 blockpfn += isolated - 1;
362 cursor += isolated - 1;
374 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
	/*
	 * If strict isolation is requested by CMA then check that all the
	 * pages requested were isolated. If there were any failures, 0 is
	 * returned and CMA will fail.
	 */
	if (strict && blockpfn < end_pfn)
		total_isolated = 0;
385 spin_unlock_irqrestore(&cc->zone->lock, flags);
387 /* Update the pageblock-skip if the whole pageblock was scanned */
388 if (blockpfn == end_pfn)
389 update_pageblock_skip(cc, valid_page, total_isolated, false);
391 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
393 count_compact_events(COMPACTISOLATED, total_isolated);
394 return total_isolated;
/**
 * isolate_freepages_range() - isolate free pages.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, cause function to
 * undo its actions and return zero.
 *
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of a free page).
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long isolated, pfn, block_end_pfn;
	LIST_HEAD(freelist);

	pfn = start_pfn;
	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
420 for (; pfn < end_pfn; pfn += isolated,
421 block_end_pfn += pageblock_nr_pages) {
423 block_end_pfn = min(block_end_pfn, end_pfn);
		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
			break;

		isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
						   &freelist, true);
		/*
		 * In strict mode, isolate_freepages_block() returns 0 if
		 * there are any holes in the block (ie. invalid PFNs or
		 * non-free pages).
		 */
		if (!isolated)
			break;

		/*
		 * If we managed to isolate pages, it is always (1 << n) *
		 * pageblock_nr_pages for some non-negative n. (Max order
		 * page may span two pageblocks).
		 */
	}
446 /* split_free_page does not map the pages */
447 map_pages(&freelist);
	if (pfn < end_pfn) {
		/* Loop terminated early, cleanup. */
		release_freepages(&freelist);
		return 0;
	}

	/* We don't use freelists for anything. */
	return pfn;
}
459 /* Update the number of anon and file isolated pages in the zone */
460 static void acct_isolated(struct zone *zone, struct compact_control *cc)
463 unsigned int count[2] = { 0, };
465 if (list_empty(&cc->migratepages))
468 list_for_each_entry(page, &cc->migratepages, lru)
469 count[!!page_is_file_cache(page)]++;
471 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
472 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
475 /* Similar to reclaim, but different enough that they don't share logic */
476 static bool too_many_isolated(struct zone *zone)
478 unsigned long active, inactive, isolated;
480 inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
481 zone_page_state(zone, NR_INACTIVE_ANON);
482 active = zone_page_state(zone, NR_ACTIVE_FILE) +
483 zone_page_state(zone, NR_ACTIVE_ANON);
484 isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
485 zone_page_state(zone, NR_ISOLATED_ANON);
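	/* Consider it too many once isolated pages outnumber half of the active + inactive pages */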
487 return isolated > (inactive + active) / 2;
/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be both less, equal to or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t isolate_mode)
{
	struct zone *zone = cc->zone;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	struct list_head *migratelist = &cc->migratepages;
	struct lruvec *lruvec;
	unsigned long flags;
	bool locked = false;
	struct page *page = NULL, *valid_page = NULL;

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		/* async migration should just abort */
		if (cc->mode == MIGRATE_ASYNC)
			return 0;

		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	if (compact_should_abort(cc))
		return 0;
539 /* Time to isolate some pages for migration */
540 for (; low_pfn < end_pfn; low_pfn++) {
541 /* give a chance to irqs before checking need_resched() */
542 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
543 if (should_release_lock(&zone->lru_lock)) {
544 spin_unlock_irqrestore(&zone->lru_lock, flags);
549 if (!pfn_valid_within(low_pfn))
553 page = pfn_to_page(low_pfn);
		/*
		 * Skip if free. page_order cannot be used without zone->lock
		 * as nothing prevents parallel allocations or buddy merging.
		 */
		if (PageBuddy(page))
			continue;
		/*
		 * Check may be lockless but that's ok as we recheck later.
		 * It's possible to migrate LRU pages and balloon pages
		 * Skip any other type of page
		 */
		if (!PageLRU(page)) {
			if (unlikely(balloon_page_movable(page))) {
				if (locked && balloon_page_isolate(page)) {
					/* Successfully isolated */
					goto isolate_success;
				}
			}
			continue;
		}
581 * PageLRU is set. lru_lock normally excludes isolation
582 * splitting and collapsing (collapsing has already happened
583 * if PageLRU is set) but the lock is not necessarily taken
584 * here and it is wasteful to take it just to check transhuge.
585 * Check TransHuge without lock and skip the whole pageblock if
586 * it's either a transhuge or hugetlbfs page, as calling
587 * compound_order() without preventing THP from splitting the
588 * page underneath us may return surprising results.
590 if (PageTransHuge(page)) {
592 low_pfn = ALIGN(low_pfn + 1,
593 pageblock_nr_pages) - 1;
595 low_pfn += (1 << compound_order(page)) - 1;
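			/* Together with the loop's low_pfn++, the adjustments above step past the whole compound page */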
		/*
		 * Migration will fail if an anonymous page is pinned in memory,
		 * so avoid taking lru_lock and isolating it unnecessarily in an
		 * admittedly racy check.
		 */
		if (!page_mapping(page) &&
		    page_count(page) > page_mapcount(page))
			continue;
		/* Check if it is ok to still hold the lock */
		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
							locked, cc);
		if (!locked || fatal_signal_pending(current))
			break;
		/* Recheck PageLRU and PageTransHuge under lock */
		if (!PageLRU(page))
			continue;
		if (PageTransHuge(page)) {
			low_pfn += (1 << compound_order(page)) - 1;
			continue;
		}
		lruvec = mem_cgroup_page_lruvec(page, zone);

		/* Try isolate the page */
		if (__isolate_lru_page(page, isolate_mode) != 0)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);
631 /* Successfully isolated */
632 del_page_from_lru_list(page, lruvec, page_lru(page));
isolate_success:
		/* Pages were isolated; update_pageblock_skip() must no longer move the cached migrate pfn */
		cc->finished_update_migrate = true;
		list_add(&page->lru, migratelist);
		cc->nr_migratepages++;
		nr_isolated++;
		/* Avoid isolating too much */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
			++low_pfn;
			break;
		}
	}

	if (locked)
		spin_unlock_irqrestore(&zone->lru_lock, flags);
651 * Update the pageblock-skip information and cached scanner pfn,
652 * if the whole pageblock was scanned without isolating any page.
654 if (low_pfn == end_pfn)
655 update_pageblock_skip(cc, valid_page, nr_isolated, true);
657 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
	count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
	if (nr_isolated)
		count_compact_events(COMPACTISOLATED, nr_isolated);

	return low_pfn;
}
667 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
668 * @cc: Compaction control structure.
669 * @start_pfn: The first PFN to start isolating.
670 * @end_pfn: The one-past-last PFN.
672 * Returns zero if isolation fails fatally due to e.g. pending signal.
673 * Otherwise, function returns one-past-the-last PFN of isolated page
674 * (which may be greater than end_pfn if end fell in a middle of a THP page).
unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
							unsigned long end_pfn)
{
	unsigned long pfn, block_end_pfn;

	/* Scan block by block. First and last block may be incomplete */
	pfn = start_pfn;
	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

	for (; pfn < end_pfn; pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, end_pfn);
		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
			continue;

		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
							ISOLATE_UNEVICTABLE);
		/*
		 * In case of fatal failure, release everything that might
		 * have been isolated in the previous iteration, and signal
		 * the failure back to caller.
		 */
		if (!pfn) {
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			break;
		}
	}
	acct_isolated(cc->zone, cc);

	return pfn;
}
713 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
714 #ifdef CONFIG_COMPACTION
716 * Based on information in the current compact_control, find blocks
717 * suitable for isolating free pages from and then isolate them.
719 static void isolate_freepages(struct compact_control *cc)
721 struct zone *zone = cc->zone;
723 unsigned long block_start_pfn; /* start of current pageblock */
724 unsigned long block_end_pfn; /* end of current pageblock */
725 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
726 int nr_freepages = cc->nr_freepages;
727 struct list_head *freelist = &cc->freepages;
	/*
	 * Initialise the free scanner. The starting point is where we last
	 * successfully isolated from, zone-cached value, or the end of the
	 * zone when isolating for the first time. We need this aligned to
	 * the pageblock boundary, because we do
	 * block_start_pfn -= pageblock_nr_pages in the for loop.
	 * For ending point, take care when isolating in last pageblock of a
	 * zone which ends in the middle of a pageblock.
	 * The low boundary is the end of the pageblock the migration scanner
	 * is using.
	 */
740 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
741 block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
743 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
746 * Isolate free pages until enough are available to migrate the
747 * pages on cc->migratepages. We stop searching if the migrate
748 * and free page scanners meet or enough free pages are isolated.
750 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
751 block_end_pfn = block_start_pfn,
752 block_start_pfn -= pageblock_nr_pages) {
753 unsigned long isolated;
756 * This can iterate a massively long zone without finding any
757 * suitable migration targets, so periodically check if we need
758 * to schedule, or even abort async compaction.
760 if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
761 && compact_should_abort(cc))
764 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
		/* Check the block is suitable for migration */
		if (!suitable_migration_target(page))
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;
777 /* Found a block suitable for isolating free pages from */
778 cc->free_pfn = block_start_pfn;
779 isolated = isolate_freepages_block(cc, block_start_pfn,
780 block_end_pfn, freelist, false);
781 nr_freepages += isolated;
784 * Set a flag that we successfully isolated in this pageblock.
785 * In the next loop iteration, zone->compact_cached_free_pfn
786 * will not be updated and thus it will effectively contain the
787 * highest pageblock we isolated pages from.
790 cc->finished_update_free = true;
793 * isolate_freepages_block() might have aborted due to async
794 * compaction being contended
800 /* split_free_page does not map the pages */
804 * If we crossed the migrate scanner, we want to keep it that way
805 * so that compact_finished() may detect this
807 if (block_start_pfn < low_pfn)
808 cc->free_pfn = cc->migrate_pfn;
810 cc->nr_freepages = nr_freepages;
814 * This is a migrate-callback that "allocates" freepages by taking pages
815 * from the isolated freelists in the block we are migrating to.
817 static struct page *compaction_alloc(struct page *migratepage,
821 struct compact_control *cc = (struct compact_control *)data;
822 struct page *freepage;
	/*
	 * Isolate free pages if necessary, and if we are not aborting due to
	 * contention.
	 */
	if (list_empty(&cc->freepages)) {
		if (!cc->contended)
			isolate_freepages(cc);

		if (list_empty(&cc->freepages))
			return NULL;
	}
	freepage = list_entry(cc->freepages.next, struct page, lru);
	list_del(&freepage->lru);
	cc->nr_freepages--;

	return freepage;
}
844 * This is a migrate-callback that "frees" freepages back to the isolated
845 * freelist. All pages on the freelist are from the same zone, so there is no
846 * special handling needed for NUMA.
static void compaction_free(struct page *page, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;

	list_add(&page->lru, &cc->freepages);
	cc->nr_freepages++;
}
/* possible outcome of isolate_migratepages */
typedef enum {
	ISOLATE_ABORT,		/* Abort compaction now */
	ISOLATE_NONE,		/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
} isolate_migrate_t;
/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
868 static isolate_migrate_t isolate_migratepages(struct zone *zone,
869 struct compact_control *cc)
871 unsigned long low_pfn, end_pfn;
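	/*
	 * Async compaction passes ISOLATE_ASYNC_MIGRATE below so that
	 * __isolate_lru_page() rejects pages that would block migration,
	 * such as pages currently under writeback.
	 */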
873 const isolate_mode_t isolate_mode =
874 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
877 * Start at where we last stopped, or beginning of the zone as
878 * initialized by compact_zone()
880 low_pfn = cc->migrate_pfn;
882 /* Only scan within a pageblock boundary */
883 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
886 * Iterate over whole pageblocks until we find the first suitable.
887 * Do not cross the free scanner.
889 for (; end_pfn <= cc->free_pfn;
890 low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
893 * This can potentially iterate a massively long zone with
894 * many pageblocks unsuitable, so periodically check if we
895 * need to schedule, or even abort async compaction.
897 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
898 && compact_should_abort(cc))
		page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
		if (!page)
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;
910 * For async compaction, also only scan in MOVABLE blocks.
911 * Async compaction is optimistic to see if the minimum amount
912 * of work satisfies the allocation.
914 if (cc->mode == MIGRATE_ASYNC &&
915 !migrate_async_suitable(get_pageblock_migratetype(page)))
918 /* Perform the isolation */
919 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
922 if (!low_pfn || cc->contended)
923 return ISOLATE_ABORT;
		/*
		 * Either we isolated something and proceed with migration. Or
		 * we failed and compact_zone should decide if we should
		 * continue or not.
		 */
		break;
	}
933 acct_isolated(zone, cc);
934 /* Record where migration scanner will be restarted */
935 cc->migrate_pfn = low_pfn;
937 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
static int compact_finished(struct zone *zone,
			    struct compact_control *cc)
{
	unsigned int order;
	unsigned long watermark;

	if (cc->contended || fatal_signal_pending(current))
		return COMPACT_PARTIAL;
949 /* Compaction run completes if the migrate and free scanner meet */
950 if (cc->free_pfn <= cc->migrate_pfn) {
951 /* Let the next compaction start anew. */
952 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
953 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
954 zone->compact_cached_free_pfn = zone_end_pfn(zone);
957 * Mark that the PG_migrate_skip information should be cleared
958 * by kswapd when it goes to sleep. kswapd does not set the
959 * flag itself as the decision to be clear should be directly
960 * based on an allocation request.
962 if (!current_is_kswapd())
963 zone->compact_blockskip_flush = true;
965 return COMPACT_COMPLETE;
	/*
	 * order == -1 is expected when compacting via
	 * /proc/sys/vm/compact_memory
	 */
	if (cc->order == -1)
		return COMPACT_CONTINUE;
975 /* Compaction run is not finished if the watermark is not met */
976 watermark = low_wmark_pages(zone);
977 watermark += (1 << cc->order);
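	/* Require enough free pages to satisfy both the watermark and the requested allocation before checking the free lists */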
979 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
980 return COMPACT_CONTINUE;
982 /* Direct compactor: Is a suitable page free? */
983 for (order = cc->order; order < MAX_ORDER; order++) {
984 struct free_area *area = &zone->free_area[order];
986 /* Job done if page is free of the right migratetype */
987 if (!list_empty(&area->free_list[cc->migratetype]))
988 return COMPACT_PARTIAL;
990 /* Job done if allocation would set block type */
991 if (cc->order >= pageblock_order && area->nr_free)
992 return COMPACT_PARTIAL;
995 return COMPACT_CONTINUE;
999 * compaction_suitable: Is this suitable to run compaction on this zone now?
1001 * COMPACT_SKIPPED - If there are too few free pages for compaction
1002 * COMPACT_PARTIAL - If the allocation would succeed without compaction
1003 * COMPACT_CONTINUE - If compaction should run now
unsigned long compaction_suitable(struct zone *zone, int order)
{
	int fragindex;
	unsigned long watermark;

	/*
	 * order == -1 is expected when compacting via
	 * /proc/sys/vm/compact_memory
	 */
	if (order == -1)
		return COMPACT_CONTINUE;
1018 * Watermarks for order-0 must be met for compaction. Note the 2UL.
1019 * This is because during migration, copies of pages need to be
1020 * allocated and for a short time, the footprint is higher
1022 watermark = low_wmark_pages(zone) + (2UL << order);
1023 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1024 return COMPACT_SKIPPED;
	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1000 implies allocations might succeed depending on
	 * watermarks
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation.
	 */
1037 fragindex = fragmentation_index(zone, order);
1038 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1039 return COMPACT_SKIPPED;
1041 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
1043 return COMPACT_PARTIAL;
1045 return COMPACT_CONTINUE;
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
	int ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	const bool sync = cc->mode != MIGRATE_ASYNC;
	ret = compaction_suitable(zone, cc->order);
	switch (ret) {
	case COMPACT_PARTIAL:
	case COMPACT_SKIPPED:
		/* Compaction is likely to fail */
		return ret;
	case COMPACT_CONTINUE:
		/* Fall through to compaction */
		;
	}
1067 * Clear pageblock skip if there were failures recently and compaction
1068 * is about to be retried after being deferred. kswapd does not do
1069 * this reset as it'll reset the cached information when going to sleep.
1071 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
1072 __reset_isolation_suitable(zone);
	/*
	 * Setup to move all movable pages to the end of the zone. Used cached
	 * information on where the scanners should start but check that it
	 * is initialised by ensuring the values are within zone boundaries.
	 */
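	/* compact_cached_migrate_pfn[0] caches the async scanner's restart point, [1] the sync scanner's */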
1079 cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
1080 cc->free_pfn = zone->compact_cached_free_pfn;
1081 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
1082 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
1083 zone->compact_cached_free_pfn = cc->free_pfn;
1085 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
1086 cc->migrate_pfn = start_pfn;
1087 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1088 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1091 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
1093 migrate_prep_local();
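	/* Main loop: isolate pages to migrate, then migrate them, until compact_finished() reports we are done */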
	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
		int err;

		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:
			ret = COMPACT_PARTIAL;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			continue;
		case ISOLATE_SUCCESS:
			;
		}
1110 err = migrate_pages(&cc->migratepages, compaction_alloc,
1111 compaction_free, (unsigned long)cc, cc->mode,
1114 trace_mm_compaction_migratepages(cc->nr_migratepages, err,
		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
1122 * migrate_pages() may return -ENOMEM when scanners meet
1123 * and we want compact_finished() to detect it
1125 if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
1126 ret = COMPACT_PARTIAL;
out:
	/* Release free pages and check accounting */
	cc->nr_freepages -= release_freepages(&cc->freepages);
	VM_BUG_ON(cc->nr_freepages != 0);

	trace_mm_compaction_end(ret);

	return ret;
}
static unsigned long compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
{
	unsigned long ret;
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.order = order,
		.migratetype = allocflags_to_migratetype(gfp_mask),
		.zone = zone,
		.mode = mode,
	};
1154 INIT_LIST_HEAD(&cc.freepages);
1155 INIT_LIST_HEAD(&cc.migratepages);
1157 ret = compact_zone(zone, &cc);
1159 VM_BUG_ON(!list_empty(&cc.freepages));
1160 VM_BUG_ON(!list_empty(&cc.migratepages));
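	/* cc.contended was set if async compaction had to back off due to lock contention or need_resched() */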
1162 *contended = cc.contended;
1166 int sysctl_extfrag_threshold = 500;
1169 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
1170 * @zonelist: The zonelist used for the current allocation
1171 * @order: The order of the current allocation
1172 * @gfp_mask: The GFP mask of the current allocation
1173 * @nodemask: The allowed nodes to allocate from
1174 * @mode: The migration mode for async, sync light, or sync migration
1175 * @contended: Return value that is true if compaction was aborted due to lock contention
1176 * @candidate_zone: Return the zone where we think allocation should succeed
1178 * This is the main entry point for direct page compaction.
unsigned long try_to_compact_pages(struct zonelist *zonelist,
			int order, gfp_t gfp_mask, nodemask_t *nodemask,
			enum migrate_mode mode, bool *contended,
			struct zone **candidate_zone)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	int may_enter_fs = gfp_mask & __GFP_FS;
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	int rc = COMPACT_DEFERRED;
	int alloc_flags = 0;
1193 /* Check if the GFP flags allow compaction */
1194 if (!order || !may_enter_fs || !may_perform_io)
1195 return COMPACT_SKIPPED;
#ifdef CONFIG_CMA
	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;
#endif
	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
								nodemask) {
		int status;

		if (compaction_deferred(zone, order))
			continue;

		status = compact_zone_order(zone, order, gfp_mask, mode,
								contended);
		rc = max(status, rc);
1213 /* If a normal allocation would succeed, stop compacting */
1214 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
1216 *candidate_zone = zone;
1218 * We think the allocation will succeed in this zone,
1219 * but it is not certain, hence the false. The caller
1220 * will repeat this with true if allocation indeed
1221 * succeeds in this zone.
1223 compaction_defer_reset(zone, order, false);
1225 } else if (mode != MIGRATE_ASYNC) {
1227 * We think that allocation won't succeed in this zone
1228 * so we defer compaction there. If it ends up
1229 * succeeding after all, it will be reset.
1231 defer_compaction(zone, order);
1239 /* Compact all zones within a node */
1240 static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1245 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
1247 zone = &pgdat->node_zones[zoneid];
1248 if (!populated_zone(zone))
1251 cc->nr_freepages = 0;
1252 cc->nr_migratepages = 0;
1254 INIT_LIST_HEAD(&cc->freepages);
1255 INIT_LIST_HEAD(&cc->migratepages);
1257 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
1258 compact_zone(zone, cc);
1260 if (cc->order > 0) {
1261 if (zone_watermark_ok(zone, cc->order,
1262 low_wmark_pages(zone), 0, 0))
1263 compaction_defer_reset(zone, cc->order, false);
1266 VM_BUG_ON(!list_empty(&cc->freepages));
1267 VM_BUG_ON(!list_empty(&cc->migratepages));
void compact_pgdat(pg_data_t *pgdat, int order)
{
	struct compact_control cc = {
		.order = order,
		.mode = MIGRATE_ASYNC,
	};

	if (!order)
		return;

	__compact_pgdat(pgdat, &cc);
}
static void compact_node(int nid)
{
	struct compact_control cc = {
		.order = -1,
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
	};

	__compact_pgdat(NODE_DATA(nid), &cc);
}
/* Compact all nodes in the system */
static void compact_nodes(void)
{
	int nid;

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	for_each_online_node(nid)
		compact_node(nid);
}
1307 /* The written value is actually unused, all memory is compacted */
1308 int sysctl_compact_memory;
1310 /* This is the entry point for compacting all nodes via /proc/sys/vm */
1311 int sysctl_compaction_handler(struct ctl_table *table, int write,
1312 void __user *buffer, size_t *length, loff_t *ppos)
1320 int sysctl_extfrag_handler(struct ctl_table *table, int write,
1321 void __user *buffer, size_t *length, loff_t *ppos)
1323 proc_dointvec_minmax(table, write, buffer, length, ppos);
1328 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
1329 static ssize_t sysfs_compact_node(struct device *dev,
1330 struct device_attribute *attr,
1331 const char *buf, size_t count)
1335 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
1336 /* Flush pending updates to the LRU lists */
1337 lru_add_drain_all();
1344 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
1346 int compaction_register_node(struct node *node)
1348 return device_create_file(&node->dev, &dev_attr_compact);
1351 void compaction_unregister_node(struct node *node)
1353 return device_remove_file(&node->dev, &dev_attr_compact);
1355 #endif /* CONFIG_SYSFS && CONFIG_NUMA */
1357 #endif /* CONFIG_COMPACTION */