#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/page-debug-flags.h>
#include <linux/hugetlb.h>
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we set ->_mapcount -2.
- * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount
+ * PAGE_BUDDY_MAPCOUNT_VALUE.
+ * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
+ * serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with _mapcount -2. Page's
- * order is recorded in page_private(page) field.
+ * free pages of length of (1 << order) and marked with _mapcount
+ * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
+ * field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
int to_free = count;
spin_lock(&zone->lock);
- zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
while (to_free) {
int migratetype)
{
spin_lock(&zone->lock);
- zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
__free_one_page(page, zone, order, migratetype);
int order, t;
struct list_head *curr;
- if (!zone->spanned_pages)
+ if (zone_is_empty(zone))
return;
spin_lock_irqsave(&zone->lock, flags);
get_pageblock_migratetype(page));
}
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
}
+static bool zone_local(struct zone *local_zone, struct zone *zone)
+{
+ return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
+}
+
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
{
}
+static bool zone_local(struct zone *local_zone, struct zone *zone)
+{
+ return true;
+}
+
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
+ unsigned long mark;
+
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
+ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+ if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
+ goto try_this_zone;
+ /*
+ * Distribute pages in proportion to the individual
+ * zone size to ensure fair page aging. The zone a
+ * page was allocated in should have no effect on the
+ * time the page has in memory before being reclaimed.
+ *
+ * When zone_reclaim_mode is enabled, try to stay in
+ * local zones in the fastpath. If that fails, the
+ * slowpath is entered, which will do another pass
+ * starting with the local zones, but ultimately fall
+ * back to remote zones that do not partake in the
+ * fairness round-robin cycle of this zonelist.
+ */
+ if (alloc_flags & ALLOC_WMARK_LOW) {
+ if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+ continue;
+ if (zone_reclaim_mode &&
+ !zone_local(preferred_zone, zone))
+ continue;
+ }
/*
* When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty
(gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
goto this_zone_full;
- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
- if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
- unsigned long mark;
+ mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ if (!zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags)) {
int ret;
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- if (zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto try_this_zone;
-
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
return page;
}
-static inline
-void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- enum zone_type classzone_idx)
+static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
{
struct zoneref *z;
struct zone *zone;
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- wakeup_kswapd(zone, order, classzone_idx);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+ /*
+ * Only reset the batches of zones that were actually
+ * considered in the fast path, we don't want to
+ * thrash fairness information for zones that are not
+ * actually part of this zonelist's round-robin cycle.
+ */
+ if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
+ continue;
+ mod_zone_page_state(zone, NR_ALLOC_BATCH,
+ high_wmark_pages(zone) -
+ low_wmark_pages(zone) -
+ zone_page_state(zone, NR_ALLOC_BATCH));
+ }
}
static inline int
goto nopage;
restart:
- if (!(gfp_mask & __GFP_NO_KSWAPD))
- wake_all_kswapd(order, zonelist, high_zoneidx,
- zone_idx(preferred_zone));
+ prepare_slowpath(gfp_mask, order, zonelist,
+ high_zoneidx, preferred_zone);
/*
* OK, we're below the kswapd watermark and have kicked background
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
- (zone->all_unreclaimable ? "yes" : "no")
+ (!zone_reclaimable(zone) ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
unsigned long start_pfn, end_pfn;
- int i, nid;
+ int nid;
/*
* NOTE: The following SMP-unsafe globals are only used early in boot
* when the kernel is running single-threaded.
if (last_start_pfn <= pfn && pfn < last_end_pfn)
return last_nid;
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
- if (start_pfn <= pfn && pfn < end_pfn) {
- last_start_pfn = start_pfn;
- last_end_pfn = end_pfn;
- last_nid = nid;
- return nid;
- }
- /* This is a memory hole */
- return -1;
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+ if (nid != -1) {
+ last_start_pfn = start_pfn;
+ last_end_pfn = end_pfn;
+ last_nid = nid;
+ }
+
+ return nid;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
-
zone_pcp_init(zone);
+
+ /* For bootup, initialized properly in watermark setup */
+ mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
+
lruvec_init(&zone->lruvec);
if (!size)
continue;
* This is only okay since the processor is dead and cannot
* race with what we are doing.
*/
- refresh_cpu_vm_stats(cpu);
+ cpu_vm_stats_fold(cpu);
}
return NOTIFY_OK;
}
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH,
+ high_wmark_pages(zone) -
+ low_wmark_pages(zone) -
+ zone_page_state(zone, NR_ALLOC_BATCH));
+
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
continue;
page = pfn_to_page(check);
+
+ /*
+ * Hugepages are not in LRU lists, but they're movable.
+ * We need not scan over tail pages bacause we don't
+ * handle each tail page individually in migration.
+ */
+ if (PageHuge(page)) {
+ iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
+ continue;
+ }
+
/*
* We can't use page_count without pin a page
* because another CPU can free compound page.
list_del(&page->lru);
rmv_page_order(page);
zone->free_area[order].nr_free--;
-#ifdef CONFIG_HIGHMEM
- if (PageHighMem(page))
- totalhigh_pages -= 1 << order;
-#endif
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);