/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a per-process
 *		counter is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind truly restricted
 *		the allocation to those memory nodes instead.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non-default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA-aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied to memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
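
/*
 * Illustrative only (not part of the original file): a minimal userspace
 * sketch of how these policies are requested, assuming the mbind() and
 * set_mempolicy() syscall wrappers declared in libnuma's <numaif.h> and a
 * machine with at least two online memory nodes:
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 4UL << 20;
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	   nodes 0 and 1
 *
 *	mbind(buf, len, MPOL_BIND, &mask, 8 * sizeof(mask), MPOL_MF_STRICT);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 */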
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
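
/* Slab caches for struct mempolicy and for the shared-policy rb-tree nodes. */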
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;
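
/* Debug printk for policy decisions; compiled out by default. */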
#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
        int empty = nodes_empty(*nodes);
        case MPOL_INTERLEAVE:
                /* Preferred will only use the first bit, but allow
                   more for now. */
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        for_each_node_mask(nd, *nodes) {
                for (k = MAX_NR_ZONES-1; k >= 0; k--) {
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
                        if (!z->present_pages)
                        zl->zones[num++] = z;
        zl->zones[num] = NULL;

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
        if (mode == MPOL_DEFAULT)
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        case MPOL_INTERLEAVE:
                policy->v.nodes = *nodes;
                policy->v.preferred_node = first_node(*nodes);
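                /* first_node() of an empty nodemask returns MAX_NUMNODES;
                   treat that as -1, i.e. local allocation. */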
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
        policy->policy = mode;
/* Check that all existing pages in the range follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
                if (!pte_present(*pte))
                page = vm_normal_page(vma, addr, *pte);
                nid = page_to_nid(page);
                if (!node_isset(nid, *nodes))
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(orig_pte, ptl);

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
        pmd = pmd_offset(pud, addr);
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                if (check_pte_range(vma, pmd, addr, next, nodes))
        } while (pmd++, addr = next, addr != end);

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
        pud = pud_offset(pgd, addr);
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                if (check_pmd_range(vma, pud, addr, next, nodes))
        } while (pud++, addr = next, addr != end);

static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
        pgd = pgd_offset(vma->vm_mm, addr);
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                if (check_pud_range(vma, pgd, addr, next, nodes))
        } while (pgd++, addr = next, addr != end);
/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
            nodemask_t *nodes, unsigned long flags)
        struct vm_area_struct *first, *vma, *prev;

        first = find_vma(mm, start);
                return ERR_PTR(-EFAULT);
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!vma->vm_next && vma->vm_end < end)
                        return ERR_PTR(-EFAULT);
                if (prev && prev->vm_end < vma->vm_start)
                        return ERR_PTR(-EFAULT);
                if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
                        unsigned long endvma = vma->vm_end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes);
                                first = ERR_PTR(err);
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
                vma->vm_policy = new;

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, struct mempolicy *new)
        struct vm_area_struct *next;

        for (; vma && vma->vm_start < end; vma = next) {
                if (vma->vm_start < start)
                        err = split_vma(vma->vm_mm, vma, start, 1);
                if (!err && vma->vm_end > end)
                        err = split_vma(vma->vm_mm, vma, end, 0);
                        err = policy_vma(vma, new);

static int contextualize_policy(int mode, nodemask_t *nodes)
        /* Update current mems_allowed */
        cpuset_update_current_mems_allowed();
        /* Ignore nodes not set in current->mems_allowed */
        cpuset_restrict_to_mems_allowed(nodes->bits);
        return mpol_check_policy(mode, nodes);
long do_mbind(unsigned long start, unsigned long len,
              unsigned long mode, nodemask_t *nmask, unsigned long flags)
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;

        if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
        if (start & ~PAGE_MASK)
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;
        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        if (mpol_check_policy(mode, nmask))
        new = mpol_new(mode, nmask);

        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
                 mode, nodes_addr(*nmask)[0]);

        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, nmask, flags);
                err = mbind_range(vma, start, end, new);
        up_write(&mm->mmap_sem);

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
        struct mempolicy *new;

        if (contextualize_policy(mode, nodes))
        new = mpol_new(mode, nodes);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = first_node(new->v.nodes);
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
                for (i = 0; p->v.zonelist->zones[i]; i++)
                        node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
        case MPOL_INTERLEAVE:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
                        *nodes = node_online_map;
                        node_set(p->v.preferred_node, *nodes);

static int lookup_node(struct mm_struct *mm, unsigned long addr)
        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
                err = page_to_nid(p);
/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
                      unsigned long addr, unsigned long flags)
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        cpuset_update_current_mems_allowed();
        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                        up_read(&mm->mmap_sem);
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                        pol = vma->vm_policy;

                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        err = lookup_node(mm, addr);
                } else if (pol == current->mempolicy &&
                           pol->policy == MPOL_INTERLEAVE) {
                        *policy = current->il_next;
                *policy = pol->policy;

                up_read(&current->mm->mmap_sem);
                get_zonemask(pol, nmask);

                up_read(&current->mm->mmap_sem);
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
                     unsigned long maxnode)
        unsigned long nlongs;
        unsigned long endmask;

        if (maxnode == 0 || !nmask)

        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
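        /* e.g. maxnode == 5: nlongs == 1, endmask == 0x1f, so only node
           bits 0-4 survive the masking below. */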
        /* When the user specified more nodes than supported just check
           that the unsupported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        if (get_user(t, nmask + k))
                        if (k == nlongs - 1) {
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
        nodes_addr(*nodes)[nlongs-1] &= endmask;

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
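        /* e.g. maxnode == 1024: copy == 128 bytes, i.e. the user buffer
           size rounded up to a multiple of 64 bits. */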
        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

        if (copy > PAGE_SIZE)
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                          unsigned long __user *nmask, unsigned long maxnode,
        err = get_nodes(&nodes, nmask, maxnode);
        return do_mbind(start, len, mode, &nodes, flags);

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
                                  unsigned long maxnode)
        if (mode < 0 || mode > MPOL_MAX)
        err = get_nodes(&nodes, nmask, maxnode);
        return do_set_mempolicy(mode, &nodes);

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
                                  unsigned long __user *nmask,
                                  unsigned long maxnode,
                                  unsigned long addr, unsigned long flags)
        if (nmask != NULL && maxnode < MAX_NUMNODES)
        err = do_get_mempolicy(&pval, &nodes, addr, flags);
        if (policy && put_user(pval, policy))
                err = copy_nodes_to_user(nmask, maxnode, &nodes);
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                                         compat_ulong_t __user *nmask,
                                         compat_ulong_t maxnode,
                                         compat_ulong_t addr, compat_ulong_t flags)
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

                nm = compat_alloc_user_space(alloc_size);

        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

                err = copy_from_user(bm, nm, alloc_size);
                /* ensure entire bitmap is zeroed */
                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
                err |= compat_put_bitmap(nmask, bm, nr_bits);

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                                         compat_ulong_t maxnode)
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

                err = compat_get_bitmap(bm, nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, bm, alloc_size);

        return sys_set_mempolicy(mode, nm, nr_bits+1);

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                                 compat_ulong_t mode, compat_ulong_t __user *nmask,
                                 compat_ulong_t maxnode, compat_ulong_t flags)
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);

        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
/* Return effective policy for a VMA */
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
        struct mempolicy *pol = task->mempolicy;

                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                         vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
                pol = &default_policy;

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
        switch (policy->policy) {
                nd = policy->v.preferred_node;
                /* Lower zones don't get a policy applied */
                /* Careful: current->mems_allowed might have moved */
                if (gfp_zone(gfp) >= policy_zone)
                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
                                return policy->v.zonelist;
        case MPOL_INTERLEAVE: /* should not happen */
        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
        struct task_struct *me = current;

        next = next_node(nid, policy->v.nodes);
        if (next >= MAX_NUMNODES)
                next = first_node(policy->v.nodes);

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long off)
        unsigned nnodes = nodes_weight(pol->v.nodes);
        unsigned target = (unsigned)off % nnodes;
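        /* Walk the nodemask until the target'th set node is reached. */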
                nid = next_node(nid, pol->v.nodes);
        } while (c <= target);

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
        page = __alloc_pages(gfp, order, zl);
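        /* Account a per-cpu interleave hit when the page really came from
           the first zone of the intended node. */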
        if (page && page_zone(page) == zl->zones[0]) {
                zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @vma: Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        cpuset_update_current_mems_allowed();

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                off += (addr - vma->vm_start) >> PAGE_SHIFT;
                nid = offset_il_node(pol, vma, off);
                /* fall back to process interleaving */
                nid = interleave_nodes(pol);
                return alloc_page_interleave(gfp, 0, nid);
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
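
/*
 * Typical caller pattern (illustrative only), e.g. from a fault path that
 * already holds the mmap_sem for read:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */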
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in
 * interrupt context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
        struct mempolicy *pol = current->mempolicy;

        if ((gfp & __GFP_WAIT) && !in_interrupt())
                cpuset_update_current_mems_allowed();
        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
EXPORT_SYMBOL(alloc_pages_current);
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

                return ERR_PTR(-ENOMEM);
        atomic_set(&new->refcnt, 1);
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                memcpy(new->v.zonelist, old->v.zonelist, sz);

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
        if (a->policy != b->policy)
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
                return a->v.preferred_node == b->v.preferred_node;
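                /* MPOL_BIND: the zonelists must match entry for entry and
                   terminate at the same slot. */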
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                return b->v.zonelist->zones[i] == NULL;

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
        if (!atomic_dec_and_test(&p->refcnt))
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
                return numa_node_id();
                return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);
                return pol->v.preferred_node >= 0 ?
                        pol->v.preferred_node : numa_node_id();

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
        case MPOL_INTERLEAVE:
                for (z = pol->v.zonelist->zones; *z; z++)
                        if ((*z)->zone_pgdat->node_id == nid)

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
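
/* tmpfs wires this up through its vm_ops set_policy/get_policy callbacks
   (see mm/shmem.c), so an mbind() on a mapping of a tmpfs file updates the
   tree hanging off the inode. */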
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
        struct rb_node *n = sp->root.rb_node;

                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                else if (end <= p->start)

                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
        return rb_entry(n, struct sp_node, nd);

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;

                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);

/* Find shared policy intersecting idx */
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
        struct mempolicy *pol = NULL;

        if (!sp->root.rb_node)
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
                mpol_get(sn->policy);
        spin_unlock(&sp->lock);

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1072 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1073 rb_erase(&n->nd, &sp->root);
1074 mpol_free(n->policy);
1075 kmem_cache_free(sn_cache, n);
1079 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1081 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
        struct sp_node *n, *new2 = NULL;

        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* Old policy spanning whole new range. */
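                        /* sp_alloc() may sleep: drop the lock, allocate the
                           tail piece, then redo the lookup. */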
                        spin_unlock(&sp->lock);
                        new2 = sp_alloc(end, n->end, n->policy);
                        sp_insert(sp, new2);
                n = rb_entry(next, struct sp_node, nd);
        spin_unlock(&sp->lock);
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);

int mpol_set_shared_policy(struct shared_policy *info,
                           struct vm_area_struct *vma, struct mempolicy *npol)
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 sz, npol ? npol->policy : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
                kmem_cache_free(sn_cache, new);
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
        struct rb_node *next;

        if (!p->root.rb_node)
        spin_lock(&p->lock);
        next = rb_first(&p->root);
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        spin_unlock(&p->lock);

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */
        if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
                printk("numa_policy_init: interleaving failed\n");

/* Reset policy of current process to default */
void numa_default_policy(void)
        do_set_mempolicy(MPOL_DEFAULT, NULL);
/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
                          const nodemask_t *new)
        switch (pol->policy) {
        case MPOL_INTERLEAVE:
                nodes_remap(tmp, pol->v.nodes, *old, *new);
                current->il_next = node_remap(current->il_next, *old, *new);
        case MPOL_PREFERRED:
                pol->v.preferred_node = node_remap(pol->v.preferred_node,
                struct zonelist *zonelist;

                for (z = pol->v.zonelist->zones; *z; z++)
                        node_set((*z)->zone_pgdat->node_id, nodes);
                nodes_remap(tmp, nodes, *old, *new);

                zonelist = bind_zonelist(&nodes);

                /* If no mem, then zonelist is NULL and we keep old zonelist.
                 * If that old zonelist has no remaining mems_allowed nodes,
                 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
                 */
                        /* Good - got mem - substitute new zonelist */
                        kfree(pol->v.zonelist);
                        pol->v.zonelist = zonelist;

/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
        rebind_policy(current->mempolicy, old, new);