From c064fb1ff538144420caf614c26e44688bbdd7c2 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Sat, 21 Jul 2012 10:54:16 +1000 Subject: [PATCH] mm/hotplug: correctly setup fallback zonelists when creating new pgdat When hotadd_new_pgdat() is called to create a new pgdat for a new node, a fallback zonelist should be created for the new node. There's code to try to achieve that in hotadd_new_pgdat() as below: /* * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. */ mutex_lock(&zonelists_mutex); build_all_zonelists(NULL); mutex_unlock(&zonelists_mutex); But it doesn't work as expected. When hotadd_new_pgdat() is called, the new node is still in offline state because node_set_online(nid) hasn't been called yet. And build_all_zonelists() only builds zonelists for online nodes as: for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); build_zonelist_cache(pgdat); } So although we hope to create a zonelist for the new pgdat, none is actually built for it. Add a new parameter "pgdat" to build_all_zonelists() so that zonelists are built for the new pgdat too. 
Signed-off-by: Jiang Liu Signed-off-by: Xishi Qiu Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Rusty Russell Cc: Yinghai Lu Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- init/main.c | 2 +- kernel/cpu.c | 2 +- mm/memory_hotplug.c | 4 ++-- mm/page_alloc.c | 17 ++++++++++++----- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f64afa5929fe..98f079bcf399 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -721,7 +721,7 @@ typedef struct pglist_data { #include extern struct mutex zonelists_mutex; -void build_all_zonelists(void *data); +void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); diff --git a/init/main.c b/init/main.c index 95316a1b4a76..e60679de61c3 100644 --- a/init/main.c +++ b/init/main.c @@ -506,7 +506,7 @@ asmlinkage void __init start_kernel(void) setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); page_alloc_init(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); diff --git a/kernel/cpu.c b/kernel/cpu.c index a4eb5227a19e..14d32588cccd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (pgdat->node_zonelists->_zonerefs->zone == NULL) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 427bb291dd0f..b8731040b9f9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -513,7 +513,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) 
zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; if (need_zonelists_rebuild) - build_all_zonelists(zone); + build_all_zonelists(NULL, zone); else zone_pcp_update(zone); @@ -562,7 +562,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) * to access not-initialized zonelist, build here. */ mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); return pgdat; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ddb68089b820..f5b6b9130c87 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3031,7 +3031,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } } @@ -3414,10 +3414,17 @@ static __init_refok int __build_all_zonelists(void *data) { int nid; int cpu; + pg_data_t *self = data; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); #endif + + if (self && !node_online(self->node_id)) { + build_zonelists(self); + build_zonelist_cache(self); + } + for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -3462,7 +3469,7 @@ static __init_refok int __build_all_zonelists(void *data) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. 
*/ -void __ref build_all_zonelists(void *data) +void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { set_zonelist_order(); @@ -3474,10 +3481,10 @@ void __ref build_all_zonelists(void *data) /* we have to stop all cpus to guarantee there is no user of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG - if (data) - setup_zone_pageset((struct zone *)data); + if (zone) + setup_zone_pageset(zone); #endif - stop_machine(__build_all_zonelists, NULL, NULL); + stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- 2.39.5