2 * Simple NUMA memory policy for the Linux kernel.
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
8 * NUMA policy allows the user to give hints in which node(s) memory should be allocated.
11 * Support four policies per VMA and per process:
13 * The VMA policy has priority over the process policy for a page fault.
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter is used.
22 * bind Only allocate memory on a specific set of nodes,
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non-default process policy.
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
57 fix mmap readahead to honour policy and enable policy for any page cache object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires first item above.
62 handle mremap for shared memory (currently ignored for the policy)
64 make bind policy root only? It can trigger OOM much faster and the
65 kernel is not always graceful about that.
66 could replace all the switch()es with a mempolicy_ops structure.
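/*
 * For reference, user space selects the policies described above through
 * the set_mempolicy() and mbind() system calls. A minimal illustrative
 * sketch, assuming the libnuma <numaif.h> wrappers rather than raw
 * syscall() (nothing below is part of this file):
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *	unsigned long node1 = 1UL << 1;
 *	void *p;
 *
 *	(interleave future allocations of this process over nodes 0 and 1)
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01));
 *
 *	(restrict one anonymous mapping to node 1, no fallback)
 *	p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &node1, 8 * sizeof(node1), 0);
 */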
69 #include <linux/mempolicy.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
86 #include <linux/swap.h>
88 #include <asm/tlbflush.h>
89 #include <asm/uaccess.h>
91 /* Internal MPOL_MF_xxx flags */
92 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for contiguous vmas */
94 static kmem_cache_t *policy_cache;
95 static kmem_cache_t *sn_cache;
97 #define PDprintk(fmt...)
99 /* Highest zone. A specific allocation for a zone below that is not policied. */
101 int policy_zone = ZONE_DMA;
103 struct mempolicy default_policy = {
104 .refcnt = ATOMIC_INIT(1), /* never free it */
105 .policy = MPOL_DEFAULT,
108 /* Do sanity checking on a policy */
109 static int mpol_check_policy(int mode, nodemask_t *nodes)
111 int empty = nodes_empty(*nodes);
119 case MPOL_INTERLEAVE:
120 /* Preferred will only use the first bit, but allow more for now. */
126 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
128 /* Generate a custom zonelist for the BIND policy. */
129 static struct zonelist *bind_zonelist(nodemask_t *nodes)
134 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
135 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
139 for_each_node_mask(nd, *nodes)
140 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
141 zl->zones[num] = NULL;
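/*
 * Worked example: with policy_zone set to ZONE_NORMAL (the highest zone
 * on a box without highmem) and a bind mask of nodes {0,2}, the custom
 * zonelist built above ends up as
 *
 *	zl->zones[0] = &NODE_DATA(0)->node_zones[ZONE_NORMAL];
 *	zl->zones[1] = &NODE_DATA(2)->node_zones[ZONE_NORMAL];
 *	zl->zones[2] = NULL;
 *
 * so the allocator falls back from node 0 to node 2 and then fails,
 * never spilling outside the bound nodes.
 */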
145 /* Create a new policy */
146 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
148 struct mempolicy *policy;
150 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
151 if (mode == MPOL_DEFAULT)
153 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
155 return ERR_PTR(-ENOMEM);
156 atomic_set(&policy->refcnt, 1);
158 case MPOL_INTERLEAVE:
159 policy->v.nodes = *nodes;
160 if (nodes_weight(*nodes) == 0) {
161 kmem_cache_free(policy_cache, policy);
162 return ERR_PTR(-EINVAL);
166 policy->v.preferred_node = first_node(*nodes);
167 if (policy->v.preferred_node >= MAX_NUMNODES)
168 policy->v.preferred_node = -1;
171 policy->v.zonelist = bind_zonelist(nodes);
172 if (policy->v.zonelist == NULL) {
173 kmem_cache_free(policy_cache, policy);
174 return ERR_PTR(-ENOMEM);
178 policy->policy = mode;
182 /* Check if we are the only process mapping the page in question */
183 static inline int single_mm_mapping(struct mm_struct *mm,
184 struct address_space *mapping)
186 struct vm_area_struct *vma;
187 struct prio_tree_iter iter;
190 spin_lock(&mapping->i_mmap_lock);
191 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
192 if (mm != vma->vm_mm) {
196 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
197 if (mm != vma->vm_mm) {
202 spin_unlock(&mapping->i_mmap_lock);
207 * Add a page to be migrated to the pagelist
209 static void migrate_page_add(struct vm_area_struct *vma,
210 struct page *page, struct list_head *pagelist, unsigned long flags)
213 * Avoid migrating a page that is shared by others and not writable.
215 if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
216 mapping_writably_mapped(page->mapping) ||
217 single_mm_mapping(vma->vm_mm, page->mapping)) {
218 int rc = isolate_lru_page(page);
221 list_add(&page->lru, pagelist);
223 * If the isolate attempt was not successful then we just
224 * encountered an unswappable page. Something must be wrong.
230 /* Ensure all existing pages follow the policy. */
231 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
232 unsigned long addr, unsigned long end,
233 const nodemask_t *nodes, unsigned long flags,
234 struct list_head *pagelist)
240 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
245 if (!pte_present(*pte))
247 page = vm_normal_page(vma, addr, *pte);
250 nid = page_to_nid(page);
251 if (!node_isset(nid, *nodes)) {
253 migrate_page_add(vma, page, pagelist, flags);
257 } while (pte++, addr += PAGE_SIZE, addr != end);
258 pte_unmap_unlock(orig_pte, ptl);
262 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
263 unsigned long addr, unsigned long end,
264 const nodemask_t *nodes, unsigned long flags,
265 struct list_head *pagelist)
270 pmd = pmd_offset(pud, addr);
272 next = pmd_addr_end(addr, end);
273 if (pmd_none_or_clear_bad(pmd))
275 if (check_pte_range(vma, pmd, addr, next, nodes,
278 } while (pmd++, addr = next, addr != end);
282 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
283 unsigned long addr, unsigned long end,
284 const nodemask_t *nodes, unsigned long flags,
285 struct list_head *pagelist)
290 pud = pud_offset(pgd, addr);
292 next = pud_addr_end(addr, end);
293 if (pud_none_or_clear_bad(pud))
295 if (check_pmd_range(vma, pud, addr, next, nodes,
298 } while (pud++, addr = next, addr != end);
302 static inline int check_pgd_range(struct vm_area_struct *vma,
303 unsigned long addr, unsigned long end,
304 const nodemask_t *nodes, unsigned long flags,
305 struct list_head *pagelist)
310 pgd = pgd_offset(vma->vm_mm, addr);
312 next = pgd_addr_end(addr, end);
313 if (pgd_none_or_clear_bad(pgd))
315 if (check_pud_range(vma, pgd, addr, next, nodes,
318 } while (pgd++, addr = next, addr != end);
322 /* Check if a vma is migratable */
323 static inline int vma_migratable(struct vm_area_struct *vma)
325 if (vma->vm_flags & (
326 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
332 * Check if all pages in a range are on a set of nodes.
333 * If pagelist != NULL then isolate pages from the LRU and
334 * put them on the pagelist.
336 static struct vm_area_struct *
337 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
338 const nodemask_t *nodes, unsigned long flags,
339 struct list_head *pagelist)
342 struct vm_area_struct *first, *vma, *prev;
344 first = find_vma(mm, start);
346 return ERR_PTR(-EFAULT);
348 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
349 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
350 if (!vma->vm_next && vma->vm_end < end)
351 return ERR_PTR(-EFAULT);
352 if (prev && prev->vm_end < vma->vm_start)
353 return ERR_PTR(-EFAULT);
355 if (!is_vm_hugetlb_page(vma) &&
356 ((flags & MPOL_MF_STRICT) ||
357 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
358 vma_migratable(vma)))) {
359 unsigned long endvma = vma->vm_end;
363 if (vma->vm_start > start)
364 start = vma->vm_start;
365 err = check_pgd_range(vma, start, endvma, nodes,
368 first = ERR_PTR(err);
377 /* Apply policy to a single VMA */
378 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
381 struct mempolicy *old = vma->vm_policy;
383 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
384 vma->vm_start, vma->vm_end, vma->vm_pgoff,
385 vma->vm_ops, vma->vm_file,
386 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
388 if (vma->vm_ops && vma->vm_ops->set_policy)
389 err = vma->vm_ops->set_policy(vma, new);
392 vma->vm_policy = new;
398 /* Step 2: apply policy to a range and do splits. */
399 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
400 unsigned long end, struct mempolicy *new)
402 struct vm_area_struct *next;
406 for (; vma && vma->vm_start < end; vma = next) {
408 if (vma->vm_start < start)
409 err = split_vma(vma->vm_mm, vma, start, 1);
410 if (!err && vma->vm_end > end)
411 err = split_vma(vma->vm_mm, vma, end, 0);
413 err = policy_vma(vma, new);
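/*
 * Worked example of the splitting above: mbind() on [0x3000, 0x5000)
 * against a single VMA covering [0x1000, 0x9000) first splits off
 * [0x1000, 0x3000), then [0x5000, 0x9000), and applies the new policy
 * only to the middle VMA [0x3000, 0x5000); the outer pieces keep their
 * old policy.
 */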
420 static int contextualize_policy(int mode, nodemask_t *nodes)
425 /* Update current mems_allowed */
426 cpuset_update_current_mems_allowed();
427 /* Ignore nodes not set in current->mems_allowed */
428 cpuset_restrict_to_mems_allowed(nodes->bits);
429 return mpol_check_policy(mode, nodes);
432 static int swap_pages(struct list_head *pagelist)
438 n = migrate_pages(pagelist, NULL, &moved, &failed);
439 putback_lru_pages(&failed);
440 putback_lru_pages(&moved);
445 long do_mbind(unsigned long start, unsigned long len,
446 unsigned long mode, nodemask_t *nmask, unsigned long flags)
448 struct vm_area_struct *vma;
449 struct mm_struct *mm = current->mm;
450 struct mempolicy *new;
455 if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
458 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
461 if (start & ~PAGE_MASK)
464 if (mode == MPOL_DEFAULT)
465 flags &= ~MPOL_MF_STRICT;
467 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
475 if (mpol_check_policy(mode, nmask))
478 new = mpol_new(mode, nmask);
483 * If we are using the default policy then operation
484 * on discontinuous address spaces is okay after all
487 flags |= MPOL_MF_DISCONTIG_OK;
489 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
490 mode, nodes_addr(*nmask)[0]);
492 down_write(&mm->mmap_sem);
493 vma = check_range(mm, start, end, nmask, flags,
494 (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
499 err = mbind_range(vma, start, end, new);
500 if (!list_empty(&pagelist))
501 nr_failed = swap_pages(&pagelist);
503 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
506 if (!list_empty(&pagelist))
507 putback_lru_pages(&pagelist);
509 up_write(&mm->mmap_sem);
514 /* Set the process memory policy */
515 long do_set_mempolicy(int mode, nodemask_t *nodes)
517 struct mempolicy *new;
519 if (contextualize_policy(mode, nodes))
521 new = mpol_new(mode, nodes);
524 mpol_free(current->mempolicy);
525 current->mempolicy = new;
526 if (new && new->policy == MPOL_INTERLEAVE)
527 current->il_next = first_node(new->v.nodes);
531 /* Fill a node mask for a policy */
532 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
539 for (i = 0; p->v.zonelist->zones[i]; i++)
540 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
545 case MPOL_INTERLEAVE:
549 /* or use current node instead of online map? */
550 if (p->v.preferred_node < 0)
551 *nodes = node_online_map;
553 node_set(p->v.preferred_node, *nodes);
560 static int lookup_node(struct mm_struct *mm, unsigned long addr)
565 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
567 err = page_to_nid(p);
573 /* Retrieve NUMA policy */
574 long do_get_mempolicy(int *policy, nodemask_t *nmask,
575 unsigned long addr, unsigned long flags)
578 struct mm_struct *mm = current->mm;
579 struct vm_area_struct *vma = NULL;
580 struct mempolicy *pol = current->mempolicy;
582 cpuset_update_current_mems_allowed();
583 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
585 if (flags & MPOL_F_ADDR) {
586 down_read(&mm->mmap_sem);
587 vma = find_vma_intersection(mm, addr, addr+1);
589 up_read(&mm->mmap_sem);
592 if (vma->vm_ops && vma->vm_ops->get_policy)
593 pol = vma->vm_ops->get_policy(vma, addr);
595 pol = vma->vm_policy;
600 pol = &default_policy;
602 if (flags & MPOL_F_NODE) {
603 if (flags & MPOL_F_ADDR) {
604 err = lookup_node(mm, addr);
608 } else if (pol == current->mempolicy &&
609 pol->policy == MPOL_INTERLEAVE) {
610 *policy = current->il_next;
616 *policy = pol->policy;
619 up_read(&current->mm->mmap_sem);
625 get_zonemask(pol, nmask);
629 up_read(&current->mm->mmap_sem);
634 * For now migrate_pages simply swaps out the pages from nodes that are in
635 * the source set but not in the target set. In the future, we would
636 * want a function that moves pages between the two nodesets in such
637 * a way as to preserve the physical layout as much as possible.
639 * Returns the number of pages that could not be moved.
641 int do_migrate_pages(struct mm_struct *mm,
642 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
648 nodes_andnot(nodes, *from_nodes, *to_nodes);
649 nodes_complement(nodes, nodes);
651 down_read(&mm->mmap_sem);
652 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
653 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
655 if (!list_empty(&pagelist)) {
656 count = swap_pages(&pagelist);
657 putback_lru_pages(&pagelist);
660 up_read(&mm->mmap_sem);
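/*
 * Worked example: from_nodes = {0,1}, to_nodes = {2,3}. The two nodemask
 * operations above compute nodes = ~(from_nodes & ~to_nodes) = ~{0,1},
 * so check_range() isolates exactly the pages sitting on nodes 0 and 1
 * (the ones not in the complemented mask) and swap_pages() pushes them
 * out; pages already on nodes 2 or 3 are left untouched.
 */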
665 * User space interface with variable sized bitmaps for nodelists.
668 /* Copy a node mask from user space. */
669 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
670 unsigned long maxnode)
673 unsigned long nlongs;
674 unsigned long endmask;
678 if (maxnode == 0 || !nmask)
681 nlongs = BITS_TO_LONGS(maxnode);
682 if ((maxnode % BITS_PER_LONG) == 0)
685 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
687 /* When the user specifies more nodes than supported just check
688 that the unsupported part is all zero. */
689 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
690 if (nlongs > PAGE_SIZE/sizeof(long))
692 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
694 if (get_user(t, nmask + k))
696 if (k == nlongs - 1) {
702 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
706 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
708 nodes_addr(*nodes)[nlongs-1] &= endmask;
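/*
 * Worked example: on a 64-bit kernel with MAX_NUMNODES == 64, a caller
 * passing maxnode == 5 gets nlongs == 1 and endmask == 0x1f, so only
 * node bits 0-4 of the copied word survive the masking above. A caller
 * passing maxnode == 1024 has nlongs > BITS_TO_LONGS(MAX_NUMNODES); the
 * loop further up merely verifies that the excess words are all zero and
 * then truncates nlongs back to BITS_TO_LONGS(MAX_NUMNODES).
 */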
712 /* Copy a kernel node mask to user space */
713 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
716 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
717 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
720 if (copy > PAGE_SIZE)
722 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
726 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
729 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
731 unsigned long __user *nmask, unsigned long maxnode,
737 err = get_nodes(&nodes, nmask, maxnode);
740 return do_mbind(start, len, mode, &nodes, flags);
743 /* Set the process memory policy */
744 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
745 unsigned long maxnode)
750 if (mode < 0 || mode > MPOL_MAX)
752 err = get_nodes(&nodes, nmask, maxnode);
755 return do_set_mempolicy(mode, &nodes);
758 /* Macro needed until Paul implements this function in kernel/cpuset.c */
759 #define cpuset_mems_allowed(task) node_online_map
761 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
762 const unsigned long __user *old_nodes,
763 const unsigned long __user *new_nodes)
765 struct mm_struct *mm;
766 struct task_struct *task;
769 nodemask_t task_nodes;
772 err = get_nodes(&old, old_nodes, maxnode);
776 err = get_nodes(&new, new_nodes, maxnode);
780 /* Find the mm_struct */
781 read_lock(&tasklist_lock);
782 task = pid ? find_task_by_pid(pid) : current;
784 read_unlock(&tasklist_lock);
787 mm = get_task_mm(task);
788 read_unlock(&tasklist_lock);
794 * Check if this process has the right to modify the specified
795 * process. The right exists if the process has administrative
796 * capabilities, superuser privileges or the same
797 * userid as the target process.
799 if ((current->euid != task->suid) && (current->euid != task->uid) &&
800 (current->uid != task->suid) && (current->uid != task->uid) &&
801 !capable(CAP_SYS_ADMIN)) {
806 task_nodes = cpuset_mems_allowed(task);
807 /* Is the user allowed to access the target nodes? */
808 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
813 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
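/*
 * From user space the same operation is reached through the
 * migrate_pages() system call. A minimal sketch, assuming the libnuma
 * <numaif.h> wrapper (target_pid is a placeholder; nothing here is part
 * of this file):
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0;	(move everything off node 0)
 *	unsigned long to = 1UL << 2;	(... onto node 2)
 *	long left = migrate_pages(target_pid, 8 * sizeof(unsigned long),
 *				  &from, &to);
 *	(left reports the number of pages that could not be moved)
 */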
820 /* Retrieve NUMA policy */
821 asmlinkage long sys_get_mempolicy(int __user *policy,
822 unsigned long __user *nmask,
823 unsigned long maxnode,
824 unsigned long addr, unsigned long flags)
829 if (nmask != NULL && maxnode < MAX_NUMNODES)
832 err = do_get_mempolicy(&pval, &nodes, addr, flags);
837 if (policy && put_user(pval, policy))
841 err = copy_nodes_to_user(nmask, maxnode, &nodes);
848 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
849 compat_ulong_t __user *nmask,
850 compat_ulong_t maxnode,
851 compat_ulong_t addr, compat_ulong_t flags)
854 unsigned long __user *nm = NULL;
855 unsigned long nr_bits, alloc_size;
856 DECLARE_BITMAP(bm, MAX_NUMNODES);
858 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
859 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
862 nm = compat_alloc_user_space(alloc_size);
864 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
867 err = copy_from_user(bm, nm, alloc_size);
868 /* ensure entire bitmap is zeroed */
869 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
870 err |= compat_put_bitmap(nmask, bm, nr_bits);
876 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
877 compat_ulong_t maxnode)
880 unsigned long __user *nm = NULL;
881 unsigned long nr_bits, alloc_size;
882 DECLARE_BITMAP(bm, MAX_NUMNODES);
884 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
885 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
888 err = compat_get_bitmap(bm, nmask, nr_bits);
889 nm = compat_alloc_user_space(alloc_size);
890 err |= copy_to_user(nm, bm, alloc_size);
896 return sys_set_mempolicy(mode, nm, nr_bits+1);
899 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
900 compat_ulong_t mode, compat_ulong_t __user *nmask,
901 compat_ulong_t maxnode, compat_ulong_t flags)
904 unsigned long __user *nm = NULL;
905 unsigned long nr_bits, alloc_size;
908 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
909 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
912 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
913 nm = compat_alloc_user_space(alloc_size);
914 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
920 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
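/*
 * Worked example of the compat conversion above: a 32-bit task passing
 * maxnode == 65 on a 64-bit kernel (with MAX_NUMNODES >= 64) gives
 * nr_bits == 64 and alloc_size == ALIGN(64, 64)/8 == 8 bytes.
 * compat_get_bitmap() gathers the two 32-bit user words into one native
 * 64-bit word, which is staged back in compat-allocated user space and
 * handed to the native sys_mbind() with maxnode == nr_bits + 1.
 */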
925 /* Return effective policy for a VMA */
927 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
929 struct mempolicy *pol = task->mempolicy;
932 if (vma->vm_ops && vma->vm_ops->get_policy)
933 pol = vma->vm_ops->get_policy(vma, addr);
934 else if (vma->vm_policy &&
935 vma->vm_policy->policy != MPOL_DEFAULT)
936 pol = vma->vm_policy;
939 pol = &default_policy;
943 /* Return a zonelist representing a mempolicy */
944 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
948 switch (policy->policy) {
950 nd = policy->v.preferred_node;
955 /* Lower zones don't get a policy applied */
956 /* Careful: current->mems_allowed might have moved */
957 if (gfp_zone(gfp) >= policy_zone)
958 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
959 return policy->v.zonelist;
961 case MPOL_INTERLEAVE: /* should not happen */
969 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
972 /* Do dynamic interleaving for a process */
973 static unsigned interleave_nodes(struct mempolicy *policy)
976 struct task_struct *me = current;
979 next = next_node(nid, policy->v.nodes);
980 if (next >= MAX_NUMNODES)
981 next = first_node(policy->v.nodes);
986 /* Do static interleaving for a VMA with known offset. */
987 static unsigned offset_il_node(struct mempolicy *pol,
988 struct vm_area_struct *vma, unsigned long off)
990 unsigned nnodes = nodes_weight(pol->v.nodes);
991 unsigned target = (unsigned)off % nnodes;
997 nid = next_node(nid, pol->v.nodes);
999 } while (c <= target);
1003 /* Determine a node number for interleave */
1004 static inline unsigned interleave_nid(struct mempolicy *pol,
1005 struct vm_area_struct *vma, unsigned long addr, int shift)
1010 off = vma->vm_pgoff;
1011 off += (addr - vma->vm_start) >> shift;
1012 return offset_il_node(pol, vma, off);
1014 return interleave_nodes(pol);
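/*
 * Worked example of the static interleave above: a VMA with vm_pgoff 0
 * mapped at vm_start 0x100000, an interleave mask of {0, 2, 5} and a
 * fault at address 0x104000 gives, with 4K pages,
 * off = (0x104000 - 0x100000) >> PAGE_SHIFT = 4 and target = 4 % 3 = 1,
 * so offset_il_node() walks to the second set node and returns node 2.
 * The same page therefore always lands on the same node, no matter when
 * or by whom it is faulted in.
 */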
1017 /* Return a zonelist suitable for a huge page allocation. */
1018 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1020 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1022 if (pol->policy == MPOL_INTERLEAVE) {
1025 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1026 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1028 return zonelist_policy(GFP_HIGHUSER, pol);
1031 /* Allocate a page in interleaved policy.
1032 Own path because it needs to do special accounting. */
1033 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1036 struct zonelist *zl;
1039 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1040 page = __alloc_pages(gfp, order, zl);
1041 if (page && page_zone(page) == zl->zones[0]) {
1042 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1049 * alloc_page_vma - Allocate a page for a VMA.
1052 * %GFP_USER user allocation.
1053 * %GFP_KERNEL kernel allocation.
1054 * %GFP_HIGHMEM highmem/user allocation.
1055 * %GFP_FS allocation should not call back into a file system.
1056 * %GFP_ATOMIC don't sleep.
1058 * @vma: Pointer to VMA or NULL if not available.
1059 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1061 * This function allocates a page from the kernel page pool and applies
1062 * a NUMA policy associated with the VMA or the current process.
1063 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
1064 * mm_struct of the VMA to prevent it from going away. Should be used for
1065 * all allocations for pages that will be mapped into
1066 * user space. Returns NULL when no page can be allocated.
1068 * Should be called with the mmap_sem of the vma held.
1071 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1073 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1075 cpuset_update_current_mems_allowed();
1077 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1080 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1081 return alloc_page_interleave(gfp, 0, nid);
1083 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
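/*
 * Typical caller pattern, as a minimal sketch (the surrounding fault
 * handler is illustrative, not part of this file): the caller holds the
 * mmap_sem for read and lets the VMA policy pick the node:
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, address);
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		goto oom;
 *	...
 *	up_read(&mm->mmap_sem);
 */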
1087 * alloc_pages_current - Allocate pages.
1090 * %GFP_USER user allocation,
1091 * %GFP_KERNEL kernel allocation,
1092 * %GFP_HIGHMEM highmem allocation,
1093 * %GFP_FS don't call back into a file system.
1094 * %GFP_ATOMIC don't sleep.
1095 * @order: Power of two of allocation size in pages. 0 is a single page.
1097 * Allocate a page from the kernel page pool. When not in
1098 * interrupt context, apply the current process' NUMA policy.
1099 * Returns NULL when no page can be allocated.
1101 * Don't call cpuset_update_current_mems_allowed() unless
1102 * 1) it's ok to take cpuset_sem (can WAIT), and
1103 * 2) allocating for current task (not interrupt).
1105 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1107 struct mempolicy *pol = current->mempolicy;
1109 if ((gfp & __GFP_WAIT) && !in_interrupt())
1110 cpuset_update_current_mems_allowed();
1111 if (!pol || in_interrupt())
1112 pol = &default_policy;
1113 if (pol->policy == MPOL_INTERLEAVE)
1114 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1115 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1117 EXPORT_SYMBOL(alloc_pages_current);
1119 /* Slow path of a mempolicy copy */
1120 struct mempolicy *__mpol_copy(struct mempolicy *old)
1122 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1125 return ERR_PTR(-ENOMEM);
1127 atomic_set(&new->refcnt, 1);
1128 if (new->policy == MPOL_BIND) {
1129 int sz = ksize(old->v.zonelist);
1130 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1131 if (!new->v.zonelist) {
1132 kmem_cache_free(policy_cache, new);
1133 return ERR_PTR(-ENOMEM);
1135 memcpy(new->v.zonelist, old->v.zonelist, sz);
1140 /* Slow path of a mempolicy comparison */
1141 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1145 if (a->policy != b->policy)
1147 switch (a->policy) {
1150 case MPOL_INTERLEAVE:
1151 return nodes_equal(a->v.nodes, b->v.nodes);
1152 case MPOL_PREFERRED:
1153 return a->v.preferred_node == b->v.preferred_node;
1156 for (i = 0; a->v.zonelist->zones[i]; i++)
1157 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1159 return b->v.zonelist->zones[i] == NULL;
1167 /* Slow path of a mpol destructor. */
1168 void __mpol_free(struct mempolicy *p)
1170 if (!atomic_dec_and_test(&p->refcnt))
1172 if (p->policy == MPOL_BIND)
1173 kfree(p->v.zonelist);
1174 p->policy = MPOL_DEFAULT;
1175 kmem_cache_free(policy_cache, p);
1179 * Shared memory backing store policy support.
1181 * Remember policies even when nobody has shared memory mapped.
1182 * The policies are kept in a red-black tree linked from the inode.
1183 * They are protected by the sp->lock spinlock, which should be held
1184 * for any accesses to the tree.
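/*
 * Worked example: a tmpfs file whose pages 0-3 were mbind()-interleaved
 * and whose pages 4-7 were bound to node 1 is represented by two
 * sp_nodes covering [0,4) and [4,8). mpol_shared_policy_lookup() for
 * index 5 finds [4,8) in the tree and returns the MPOL_BIND policy with
 * its reference count raised.
 */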
1187 /* lookup first element intersecting start-end */
1188 /* Caller holds sp->lock */
1189 static struct sp_node *
1190 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1192 struct rb_node *n = sp->root.rb_node;
1195 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1197 if (start >= p->end)
1199 else if (end <= p->start)
1207 struct sp_node *w = NULL;
1208 struct rb_node *prev = rb_prev(n);
1211 w = rb_entry(prev, struct sp_node, nd);
1212 if (w->end <= start)
1216 return rb_entry(n, struct sp_node, nd);
1219 /* Insert a new shared policy into the tree. */
1220 /* Caller holds sp->lock */
1221 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1223 struct rb_node **p = &sp->root.rb_node;
1224 struct rb_node *parent = NULL;
1229 nd = rb_entry(parent, struct sp_node, nd);
1230 if (new->start < nd->start)
1232 else if (new->end > nd->end)
1233 p = &(*p)->rb_right;
1237 rb_link_node(&new->nd, parent, p);
1238 rb_insert_color(&new->nd, &sp->root);
1239 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1240 new->policy ? new->policy->policy : 0);
1243 /* Find shared policy intersecting idx */
1245 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1247 struct mempolicy *pol = NULL;
1250 if (!sp->root.rb_node)
1252 spin_lock(&sp->lock);
1253 sn = sp_lookup(sp, idx, idx+1);
1255 mpol_get(sn->policy);
1258 spin_unlock(&sp->lock);
1262 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1264 PDprintk("deleting %lx-%lx\n", n->start, n->end);
1265 rb_erase(&n->nd, &sp->root);
1266 mpol_free(n->policy);
1267 kmem_cache_free(sn_cache, n);
1271 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1273 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1284 /* Replace a policy range. */
1285 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1286 unsigned long end, struct sp_node *new)
1288 struct sp_node *n, *new2 = NULL;
1291 spin_lock(&sp->lock);
1292 n = sp_lookup(sp, start, end);
1293 /* Take care of old policies in the same range. */
1294 while (n && n->start < end) {
1295 struct rb_node *next = rb_next(&n->nd);
1296 if (n->start >= start) {
1302 /* Old policy spanning whole new range. */
1305 spin_unlock(&sp->lock);
1306 new2 = sp_alloc(end, n->end, n->policy);
1312 sp_insert(sp, new2);
1320 n = rb_entry(next, struct sp_node, nd);
1324 spin_unlock(&sp->lock);
1326 mpol_free(new2->policy);
1327 kmem_cache_free(sn_cache, new2);
1332 int mpol_set_shared_policy(struct shared_policy *info,
1333 struct vm_area_struct *vma, struct mempolicy *npol)
1336 struct sp_node *new = NULL;
1337 unsigned long sz = vma_pages(vma);
1339 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1341 sz, npol? npol->policy : -1,
1342 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1345 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1349 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1351 kmem_cache_free(sn_cache, new);
1355 /* Free a backing policy store on inode delete. */
1356 void mpol_free_shared_policy(struct shared_policy *p)
1359 struct rb_node *next;
1361 if (!p->root.rb_node)
1363 spin_lock(&p->lock);
1364 next = rb_first(&p->root);
1366 n = rb_entry(next, struct sp_node, nd);
1367 next = rb_next(&n->nd);
1368 rb_erase(&n->nd, &p->root);
1369 mpol_free(n->policy);
1370 kmem_cache_free(sn_cache, n);
1372 spin_unlock(&p->lock);
1375 /* assumes fs == KERNEL_DS */
1376 void __init numa_policy_init(void)
1378 policy_cache = kmem_cache_create("numa_policy",
1379 sizeof(struct mempolicy),
1380 0, SLAB_PANIC, NULL, NULL);
1382 sn_cache = kmem_cache_create("shared_policy_node",
1383 sizeof(struct sp_node),
1384 0, SLAB_PANIC, NULL, NULL);
1386 /* Set interleaving policy for system init. This way not all
1387 the data structures allocated at system boot end up in node zero. */
1389 if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1390 printk("numa_policy_init: interleaving failed\n");
1393 /* Reset policy of current process to default */
1394 void numa_default_policy(void)
1396 do_set_mempolicy(MPOL_DEFAULT, NULL);
1399 /* Migrate a policy to a different set of nodes */
1400 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1401 const nodemask_t *new)
1408 switch (pol->policy) {
1411 case MPOL_INTERLEAVE:
1412 nodes_remap(tmp, pol->v.nodes, *old, *new);
1414 current->il_next = node_remap(current->il_next, *old, *new);
1416 case MPOL_PREFERRED:
1417 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1423 struct zonelist *zonelist;
1426 for (z = pol->v.zonelist->zones; *z; z++)
1427 node_set((*z)->zone_pgdat->node_id, nodes);
1428 nodes_remap(tmp, nodes, *old, *new);
1431 zonelist = bind_zonelist(&nodes);
1433 /* If no mem, then zonelist is NULL and we keep old zonelist.
1434 * If that old zonelist has no remaining mems_allowed nodes,
1435 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1439 /* Good - got mem - substitute new zonelist */
1440 kfree(pol->v.zonelist);
1441 pol->v.zonelist = zonelist;
1452 * Someone moved this task to different nodes. Fixup mempolicies.
1454 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1455 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1457 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1459 rebind_policy(current->mempolicy, old, new);
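/*
 * Worked example: when a task's cpuset is moved from nodes {0,1} to
 * nodes {4,5}, an MPOL_INTERLEAVE mask of {0,1} is remapped to {4,5}
 * (and il_next with it), an MPOL_PREFERRED node of 1 becomes 5, and an
 * MPOL_BIND zonelist is rebuilt from the remapped node set via
 * bind_zonelist().
 */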