unsigned long totalram_pages __read_mostly;
unsigned long totalhigh_pages __read_mostly;
long nr_swap_pages;
+int percpu_pagelist_fraction;
static void fastcall free_hot_cold_page(struct page *page, int cold);
+static void __free_pages_ok(struct page *page, unsigned int order);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
static void bad_page(struct page *page)
{
printk(KERN_EMERG "Bad page state in process '%s'\n"
- "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
- "Trying to fix it up, but a reboot is needed\n"
- "Backtrace:\n",
+ KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
+ KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
+ KERN_EMERG "Backtrace:\n",
current->comm, page, (int)(2*sizeof(unsigned long)),
(unsigned long)page->flags, page->mapping,
page_mapcount(page), page_count(page));
* All pages have PG_compound set. All pages have their ->private pointing at
* the head page (even the head page has this).
*
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present. This usage means that zero-order pages
- * may not be compound.
+ * The first tail page's ->lru.next holds the address of the compound page's
+ * put_page() function. Its ->lru.prev holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
*/
+
+static void free_compound_page(struct page *page)
+{
+ __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+}
+
static void prep_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
- page[1].mapping = NULL;
- page[1].index = order;
+ page[1].lru.next = (void *)free_compound_page; /* set dtor */
+ page[1].lru.prev = (void *)order;
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
int i;
int nr_pages = 1 << order;
- if (unlikely(page[1].index != order))
+ if (unlikely((unsigned long)page[1].lru.prev != order))
bad_page(page);
for (i = 0; i < nr_pages; i++) {
* -- wli
*/
-static inline void __free_pages_bulk (struct page *page,
+static inline void __free_one_page(struct page *page,
struct zone *zone, unsigned int order)
{
unsigned long page_idx;
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
-static int
-free_pages_bulk(struct zone *zone, int count,
- struct list_head *list, unsigned int order)
+static void free_pages_bulk(struct zone *zone, int count,
+ struct list_head *list, int order)
{
- struct page *page = NULL;
- int ret = 0;
-
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- while (!list_empty(list) && count--) {
+ while (count--) {
+ struct page *page;
+
+ BUG_ON(list_empty(list));
page = list_entry(list->prev, struct page, lru);
- /* have to delete it as __free_pages_bulk list manipulates */
+ /* have to delete it as __free_one_page list manipulates */
list_del(&page->lru);
- __free_pages_bulk(page, zone, order);
- ret++;
+ __free_one_page(page, zone, order);
}
spin_unlock(&zone->lock);
- return ret;
}
-void __free_pages_ok(struct page *page, unsigned int order)
+static void free_one_page(struct zone *zone, struct page *page, int order)
{
- unsigned long flags;
LIST_HEAD(list);
+ list_add(&page->lru, &list);
+ free_pages_bulk(zone, 1, &list, order);
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+ unsigned long flags;
int i;
int reserved = 0;
arch_free_page(page, order);
+ if (!PageHighMem(page))
+ mutex_debug_check_no_locks_freed(page_address(page),
+ PAGE_SIZE<<order);
#ifndef CONFIG_MMU
- if (order > 0)
- for (i = 1 ; i < (1 << order) ; ++i)
- __put_page(page + i);
+ for (i = 1 ; i < (1 << order) ; ++i)
+ __put_page(page + i);
#endif
for (i = 0 ; i < (1 << order) ; ++i)
if (reserved)
return;
- list_add(&page->lru, &list);
- kernel_map_pages(page, 1<<order, 0);
+ kernel_map_pages(page, 1 << order, 0);
local_irq_save(flags);
__mod_page_state(pgfree, 1 << order);
- free_pages_bulk(page_zone(page), 1, &list, order);
+ free_one_page(page_zone(page), page, order);
local_irq_restore(flags);
}
if (zone->zone_pgdat->node_id == numa_node_id())
continue;
- pset = zone->pageset[smp_processor_id()];
+ pset = zone_pcp(zone, smp_processor_id());
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
pcp = &pset->pcp[i];
- if (pcp->count)
- pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
+ free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ pcp->count = 0;
}
}
local_irq_restore(flags);
pcp = &pset->pcp[i];
local_irq_save(flags);
- pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
+ free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ pcp->count = 0;
local_irq_restore(flags);
}
}
__inc_page_state(pgfree);
list_add(&page->lru, &pcp->list);
pcp->count++;
- if (pcp->count >= pcp->high)
- pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ if (pcp->count >= pcp->high) {
+ free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ pcp->count -= pcp->batch;
+ }
local_irq_restore(flags);
put_cpu();
}
again:
cpu = get_cpu();
- if (order == 0) {
+ if (likely(order == 0)) {
struct per_cpu_pages *pcp;
pcp = &zone_pcp(zone, cpu)->pcp[cold];
mark = (*z)->pages_high;
if (!zone_watermark_ok(*z, order, mark,
classzone_idx, alloc_flags))
- continue;
+ if (!zone_reclaim_mode ||
+ !zone_reclaim(*z, gfp_mask, order))
+ continue;
}
page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
- * policy.
+ * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
+ * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags = ALLOC_WMARK_MIN;
if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
cond_resched();
/* We now go into synchronous reclaim */
+ cpuset_memory_pressure_bump();
p->flags |= PF_MEMALLOC;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
if (page)
goto got_pg;
- out_of_memory(gfp_mask, order);
+ out_of_memory(zonelist, gfp_mask, order);
goto restart;
}
{
int cpu = 0;
- memset(ret, 0, sizeof(*ret));
+ memset(ret, 0, nr * sizeof(unsigned long));
+ cpus_and(*cpumask, *cpumask, cpu_online_map);
cpu = first_cpu(*cpumask);
while (cpu < NR_CPUS) {
unsigned long *in, *out, off;
+ if (!cpu_isset(cpu, *cpumask))
+ continue;
+
in = (unsigned long *)&per_cpu(page_states, cpu);
cpu = next_cpu(cpu, *cpumask);
- if (cpu < NR_CPUS)
+ if (likely(cpu < NR_CPUS))
prefetch(&per_cpu(page_states, cpu));
out = (unsigned long *)ret;
unsigned long ret = 0;
int cpu;
- for_each_cpu(cpu) {
+ for_each_online_cpu(cpu) {
unsigned long in;
in = (unsigned long)&per_cpu(page_states, cpu) + offset;
*/
static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
{
- int i, n, val;
+ int n, val;
int min_val = INT_MAX;
int best_node = -1;
- for_each_online_node(i) {
- cpumask_t tmp;
+ /* Use the local node if we haven't already */
+ if (!node_isset(node, *used_node_mask)) {
+ node_set(node, *used_node_mask);
+ return node;
+ }
- /* Start from local node */
- n = (node+i) % num_online_nodes();
+ for_each_online_node(n) {
+ cpumask_t tmp;
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
- /* Use the local node if we haven't already */
- if (!node_isset(node, *used_node_mask)) {
- best_node = node;
- break;
- }
-
/* Use the distance array to find the distance */
val = node_distance(node, n);
+ /* Penalize nodes under us ("prefer the next node") */
+ val += (n < node);
+
/* Give preference to headless and unused nodes */
tmp = node_to_cpumask(n);
if (!cpus_empty(tmp))
prev_node = local_node;
nodes_clear(used_mask);
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+ int distance = node_distance(local_node, node);
+
+ /*
+ * If another node is sufficiently far away then it is better
+ * to reclaim pages in a zone before going off node.
+ */
+ if (distance > RECLAIM_DISTANCE)
+ zone_reclaim_mode = 1;
+
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
- if (node_distance(local_node, node) !=
- node_distance(local_node, prev_node))
+
+ if (distance != node_distance(local_node, prev_node))
node_load[node] += load;
prev_node = node;
load--;
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
-void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
- for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!early_pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
-static int __devinit zone_batchsize(struct zone *zone)
+static int __cpuinit zone_batchsize(struct zone *zone)
{
int batch;
INIT_LIST_HEAD(&pcp->list);
}
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+ unsigned long high)
+{
+ struct per_cpu_pages *pcp;
+
+ pcp = &p->pcp[0]; /* hot list */
+ pcp->high = high;
+ pcp->batch = max(1UL, high/4);
+ if ((high/4) > (PAGE_SHIFT * 8))
+ pcp->batch = PAGE_SHIFT * 8;
+}
+
+
#ifdef CONFIG_NUMA
/*
* Boot pageset table. One per cpu which is going to be used for all
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static struct per_cpu_pageset
- boot_pageset[NR_CPUS];
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
/*
* Dynamically allocate memory for the
* per cpu pageset array in struct zone.
*/
-static int __devinit process_zones(int cpu)
+static int __cpuinit process_zones(int cpu)
{
struct zone *zone, *dzone;
for_each_zone(zone) {
- zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+ zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
GFP_KERNEL, cpu_to_node(cpu));
- if (!zone->pageset[cpu])
+ if (!zone_pcp(zone, cpu))
goto bad;
- setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+ setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(zone_pcp(zone, cpu),
+ (zone->present_pages / percpu_pagelist_fraction));
}
return 0;
for_each_zone(dzone) {
if (dzone == zone)
break;
- kfree(dzone->pageset[cpu]);
- dzone->pageset[cpu] = NULL;
+ kfree(zone_pcp(dzone, cpu));
+ zone_pcp(dzone, cpu) = NULL;
}
return -ENOMEM;
}
static inline void free_zone_pagesets(int cpu)
{
-#ifdef CONFIG_NUMA
struct zone *zone;
for_each_zone(zone) {
zone_pcp(zone, cpu) = NULL;
kfree(pset);
}
-#endif
}
-static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
#endif
-static __devinit
+static __meminit
void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
init_waitqueue_head(zone->wait_table + i);
}
-static __devinit void zone_pcp_init(struct zone *zone)
+static __meminit void zone_pcp_init(struct zone *zone)
{
int cpu;
unsigned long batch = zone_batchsize(zone);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
#ifdef CONFIG_NUMA
/* Early boot. Slab allocator not functional yet */
- zone->pageset[cpu] = &boot_pageset[cpu];
+ zone_pcp(zone, cpu) = &boot_pageset[cpu];
setup_pageset(&boot_pageset[cpu],0);
#else
setup_pageset(zone_pcp(zone,cpu), batch);
zone->name, zone->present_pages, batch);
}
-static __devinit void init_currently_empty_zone(struct zone *zone,
+static __meminit void init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn, unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
seq_printf(m,
")"
"\n pagesets");
- for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+ for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
int j;
return 0;
}
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct zone *zone;
+ unsigned int cpu;
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ if (!write || (ret == -EINVAL))
+ return ret;
+ for_each_zone(zone) {
+ for_each_online_cpu(cpu) {
+ unsigned long high;
+ high = zone->present_pages / percpu_pagelist_fraction;
+ setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+ }
+ }
+ return 0;
+}
+
__initdata int hashdist = HASHDIST_DEFAULT;
#ifdef CONFIG_NUMA