mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel does not always handle that gracefully.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <asm/tlbflush.h>
  87 #include <asm/uaccess.h>
  88
  89 static kmem_cache_t *policy_cache;
  90 static kmem_cache_t *sn_cache;
  91
  92 #define PDprintk(fmt...)
  93
  94 /* Highest zone. An specific allocation for a zone below that is not
  95    policied. */
  96 static int policy_zone;
  97
  98 struct mempolicy default_policy = {
  99         .refcnt = ATOMIC_INIT(1), /* never free it */
 100         .policy = MPOL_DEFAULT,
 101 };
 102
 103 /* Do sanity checking on a policy */
 104 static int mpol_check_policy(int mode, nodemask_t *nodes)
 105 {
 106         int empty = nodes_empty(*nodes);
 107
 108         switch (mode) {
 109         case MPOL_DEFAULT:
 110                 if (!empty)
 111                         return -EINVAL;
 112                 break;
 113         case MPOL_BIND:
 114         case MPOL_INTERLEAVE:
 115                 /* Preferred will only use the first bit, but allow
 116                    more for now. */
 117                 if (empty)
 118                         return -EINVAL;
 119                 break;
 120         }
 121         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 122 }
 123 /* Generate a custom zonelist for the BIND policy. */
 124 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 125 {
 126         struct zonelist *zl;
 127         int num, max, nd;
 128
 129         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 130         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 131         if (!zl)
 132                 return NULL;
 133         num = 0;
 134         for_each_node_mask(nd, *nodes) {
 135                 int k;
 136                 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
 137                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 138                         if (!z->present_pages)
 139                                 continue;
 140                         zl->zones[num++] = z;
 141                         if (k > policy_zone)
 142                                 policy_zone = k;
 143                 }
 144         }
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         return policy;
 184 }
 185
 186 /* Ensure all existing pages follow the policy. */
 187 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 188                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 189 {
 190         pte_t *orig_pte;
 191         pte_t *pte;
 192         spinlock_t *ptl;
 193
 194         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 195         do {
 196                 struct page *page;
 197                 unsigned int nid;
 198
 199                 if (!pte_present(*pte))
 200                         continue;
 201                 page = vm_normal_page(vma, addr, *pte);
 202                 if (!page)
 203                         continue;
 204                 nid = page_to_nid(page);
 205                 if (!node_isset(nid, *nodes))
 206                         break;
 207         } while (pte++, addr += PAGE_SIZE, addr != end);
 208         pte_unmap_unlock(orig_pte, ptl);
 209         return addr != end;
 210 }
 211
 212 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 213                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 214 {
 215         pmd_t *pmd;
 216         unsigned long next;
 217
 218         pmd = pmd_offset(pud, addr);
 219         do {
 220                 next = pmd_addr_end(addr, end);
 221                 if (pmd_none_or_clear_bad(pmd))
 222                         continue;
 223                 if (check_pte_range(vma, pmd, addr, next, nodes))
 224                         return -EIO;
 225         } while (pmd++, addr = next, addr != end);
 226         return 0;
 227 }
 228
 229 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 230                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 231 {
 232         pud_t *pud;
 233         unsigned long next;
 234
 235         pud = pud_offset(pgd, addr);
 236         do {
 237                 next = pud_addr_end(addr, end);
 238                 if (pud_none_or_clear_bad(pud))
 239                         continue;
 240                 if (check_pmd_range(vma, pud, addr, next, nodes))
 241                         return -EIO;
 242         } while (pud++, addr = next, addr != end);
 243         return 0;
 244 }
 245
 246 static inline int check_pgd_range(struct vm_area_struct *vma,
 247                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 248 {
 249         pgd_t *pgd;
 250         unsigned long next;
 251
 252         pgd = pgd_offset(vma->vm_mm, addr);
 253         do {
 254                 next = pgd_addr_end(addr, end);
 255                 if (pgd_none_or_clear_bad(pgd))
 256                         continue;
 257                 if (check_pud_range(vma, pgd, addr, next, nodes))
 258                         return -EIO;
 259         } while (pgd++, addr = next, addr != end);
 260         return 0;
 261 }
 262
 263 /* Step 1: check the range */
 264 static struct vm_area_struct *
 265 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 266             nodemask_t *nodes, unsigned long flags)
 267 {
 268         int err;
 269         struct vm_area_struct *first, *vma, *prev;
 270
 271         first = find_vma(mm, start);
 272         if (!first)
 273                 return ERR_PTR(-EFAULT);
 274         prev = NULL;
 275         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 276                 if (!vma->vm_next && vma->vm_end < end)
 277                         return ERR_PTR(-EFAULT);
 278                 if (prev && prev->vm_end < vma->vm_start)
 279                         return ERR_PTR(-EFAULT);
 280                 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
 281                         unsigned long endvma = vma->vm_end;
 282                         if (endvma > end)
 283                                 endvma = end;
 284                         if (vma->vm_start > start)
 285                                 start = vma->vm_start;
 286                         err = check_pgd_range(vma, start, endvma, nodes);
 287                         if (err) {
 288                                 first = ERR_PTR(err);
 289                                 break;
 290                         }
 291                 }
 292                 prev = vma;
 293         }
 294         return first;
 295 }
 296
 297 /* Apply policy to a single VMA */
 298 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 299 {
 300         int err = 0;
 301         struct mempolicy *old = vma->vm_policy;
 302
 303         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 304                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 305                  vma->vm_ops, vma->vm_file,
 306                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 307
 308         if (vma->vm_ops && vma->vm_ops->set_policy)
 309                 err = vma->vm_ops->set_policy(vma, new);
 310         if (!err) {
 311                 mpol_get(new);
 312                 vma->vm_policy = new;
 313                 mpol_free(old);
 314         }
 315         return err;
 316 }
 317
 318 /* Step 2: apply policy to a range and do splits. */
 319 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 320                        unsigned long end, struct mempolicy *new)
 321 {
 322         struct vm_area_struct *next;
 323         int err;
 324
 325         err = 0;
 326         for (; vma && vma->vm_start < end; vma = next) {
 327                 next = vma->vm_next;
 328                 if (vma->vm_start < start)
 329                         err = split_vma(vma->vm_mm, vma, start, 1);
 330                 if (!err && vma->vm_end > end)
 331                         err = split_vma(vma->vm_mm, vma, end, 0);
 332                 if (!err)
 333                         err = policy_vma(vma, new);
 334                 if (err)
 335                         break;
 336         }
 337         return err;
 338 }
 339
 340 static int contextualize_policy(int mode, nodemask_t *nodes)
 341 {
 342         if (!nodes)
 343                 return 0;
 344
 345         /* Update current mems_allowed */
 346         cpuset_update_current_mems_allowed();
 347         /* Ignore nodes not set in current->mems_allowed */
 348         cpuset_restrict_to_mems_allowed(nodes->bits);
 349         return mpol_check_policy(mode, nodes);
 350 }
 351
 352 long do_mbind(unsigned long start, unsigned long len,
 353                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 354 {
 355         struct vm_area_struct *vma;
 356         struct mm_struct *mm = current->mm;
 357         struct mempolicy *new;
 358         unsigned long end;
 359         int err;
 360
 361         if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
 362                 return -EINVAL;
 363         if (start & ~PAGE_MASK)
 364                 return -EINVAL;
 365         if (mode == MPOL_DEFAULT)
 366                 flags &= ~MPOL_MF_STRICT;
 367         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 368         end = start + len;
 369         if (end < start)
 370                 return -EINVAL;
 371         if (end == start)
 372                 return 0;
 373         if (mpol_check_policy(mode, nmask))
 374                 return -EINVAL;
 375         new = mpol_new(mode, nmask);
 376         if (IS_ERR(new))
 377                 return PTR_ERR(new);
 378
 379         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 380                         mode,nodes_addr(nodes)[0]);
 381
 382         down_write(&mm->mmap_sem);
 383         vma = check_range(mm, start, end, nmask, flags);
 384         err = PTR_ERR(vma);
 385         if (!IS_ERR(vma))
 386                 err = mbind_range(vma, start, end, new);
 387         up_write(&mm->mmap_sem);
 388         mpol_free(new);
 389         return err;
 390 }
 391
 392 /* Set the process memory policy */
 393 long do_set_mempolicy(int mode, nodemask_t *nodes)
 394 {
 395         struct mempolicy *new;
 396
 397         if (contextualize_policy(mode, nodes))
 398                 return -EINVAL;
 399         new = mpol_new(mode, nodes);
 400         if (IS_ERR(new))
 401                 return PTR_ERR(new);
 402         mpol_free(current->mempolicy);
 403         current->mempolicy = new;
 404         if (new && new->policy == MPOL_INTERLEAVE)
 405                 current->il_next = first_node(new->v.nodes);
 406         return 0;
 407 }
 408
 409 /* Fill a zone bitmap for a policy */
 410 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 411 {
 412         int i;
 413
 414         nodes_clear(*nodes);
 415         switch (p->policy) {
 416         case MPOL_BIND:
 417                 for (i = 0; p->v.zonelist->zones[i]; i++)
 418                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 419                                 *nodes);
 420                 break;
 421         case MPOL_DEFAULT:
 422                 break;
 423         case MPOL_INTERLEAVE:
 424                 *nodes = p->v.nodes;
 425                 break;
 426         case MPOL_PREFERRED:
 427                 /* or use current node instead of online map? */
 428                 if (p->v.preferred_node < 0)
 429                         *nodes = node_online_map;
 430                 else
 431                         node_set(p->v.preferred_node, *nodes);
 432                 break;
 433         default:
 434                 BUG();
 435         }
 436 }
 437
 438 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 439 {
 440         struct page *p;
 441         int err;
 442
 443         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 444         if (err >= 0) {
 445                 err = page_to_nid(p);
 446                 put_page(p);
 447         }
 448         return err;
 449 }
 450
 451 /* Retrieve NUMA policy */
 452 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 453                         unsigned long addr, unsigned long flags)
 454 {
 455         int err;
 456         struct mm_struct *mm = current->mm;
 457         struct vm_area_struct *vma = NULL;
 458         struct mempolicy *pol = current->mempolicy;
 459
 460         cpuset_update_current_mems_allowed();
 461         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 462                 return -EINVAL;
 463         if (flags & MPOL_F_ADDR) {
 464                 down_read(&mm->mmap_sem);
 465                 vma = find_vma_intersection(mm, addr, addr+1);
 466                 if (!vma) {
 467                         up_read(&mm->mmap_sem);
 468                         return -EFAULT;
 469                 }
 470                 if (vma->vm_ops && vma->vm_ops->get_policy)
 471                         pol = vma->vm_ops->get_policy(vma, addr);
 472                 else
 473                         pol = vma->vm_policy;
 474         } else if (addr)
 475                 return -EINVAL;
 476
 477         if (!pol)
 478                 pol = &default_policy;
 479
 480         if (flags & MPOL_F_NODE) {
 481                 if (flags & MPOL_F_ADDR) {
 482                         err = lookup_node(mm, addr);
 483                         if (err < 0)
 484                                 goto out;
 485                         *policy = err;
 486                 } else if (pol == current->mempolicy &&
 487                                 pol->policy == MPOL_INTERLEAVE) {
 488                         *policy = current->il_next;
 489                 } else {
 490                         err = -EINVAL;
 491                         goto out;
 492                 }
 493         } else
 494                 *policy = pol->policy;
 495
 496         if (vma) {
 497                 up_read(&current->mm->mmap_sem);
 498                 vma = NULL;
 499         }
 500
 501         err = 0;
 502         if (nmask)
 503                 get_zonemask(pol, nmask);
 504
 505  out:
 506         if (vma)
 507                 up_read(&current->mm->mmap_sem);
 508         return err;
 509 }
 510
 511 /*
 512  * User space interface with variable sized bitmaps for nodelists.
 513  */
 514
 515 /* Copy a node mask from user space. */
 516 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
 517                      unsigned long maxnode)
 518 {
 519         unsigned long k;
 520         unsigned long nlongs;
 521         unsigned long endmask;
 522
 523         --maxnode;
 524         nodes_clear(*nodes);
 525         if (maxnode == 0 || !nmask)
 526                 return 0;
 527
 528         nlongs = BITS_TO_LONGS(maxnode);
 529         if ((maxnode % BITS_PER_LONG) == 0)
 530                 endmask = ~0UL;
 531         else
 532                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 533
 534         /* When the user specified more nodes than supported just check
 535            if the non supported part is all zero. */
 536         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 537                 if (nlongs > PAGE_SIZE/sizeof(long))
 538                         return -EINVAL;
 539                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 540                         unsigned long t;
 541                         if (get_user(t, nmask + k))
 542                                 return -EFAULT;
 543                         if (k == nlongs - 1) {
 544                                 if (t & endmask)
 545                                         return -EINVAL;
 546                         } else if (t)
 547                                 return -EINVAL;
 548                 }
 549                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 550                 endmask = ~0UL;
 551         }
 552
 553         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 554                 return -EFAULT;
 555         nodes_addr(*nodes)[nlongs-1] &= endmask;
 556         return 0;
 557 }
 558
 559 /* Copy a kernel node mask to user space */
 560 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 561                               nodemask_t *nodes)
 562 {
 563         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 564         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 565
 566         if (copy > nbytes) {
 567                 if (copy > PAGE_SIZE)
 568                         return -EINVAL;
 569                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 570                         return -EFAULT;
 571                 copy = nbytes;
 572         }
 573         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 574 }
 575
 576 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 577                         unsigned long mode,
 578                         unsigned long __user *nmask, unsigned long maxnode,
 579                         unsigned flags)
 580 {
 581         nodemask_t nodes;
 582         int err;
 583
 584         err = get_nodes(&nodes, nmask, maxnode);
 585         if (err)
 586                 return err;
 587         return do_mbind(start, len, mode, &nodes, flags);
 588 }
 589
 590 /* Set the process memory policy */
 591 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 592                 unsigned long maxnode)
 593 {
 594         int err;
 595         nodemask_t nodes;
 596
 597         if (mode < 0 || mode > MPOL_MAX)
 598                 return -EINVAL;
 599         err = get_nodes(&nodes, nmask, maxnode);
 600         if (err)
 601                 return err;
 602         return do_set_mempolicy(mode, &nodes);
 603 }
 604
 605 /* Retrieve NUMA policy */
 606 asmlinkage long sys_get_mempolicy(int __user *policy,
 607                                 unsigned long __user *nmask,
 608                                 unsigned long maxnode,
 609                                 unsigned long addr, unsigned long flags)
 610 {
 611         int err, pval;
 612         nodemask_t nodes;
 613
 614         if (nmask != NULL && maxnode < MAX_NUMNODES)
 615                 return -EINVAL;
 616
 617         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 618
 619         if (err)
 620                 return err;
 621
 622         if (policy && put_user(pval, policy))
 623                 return -EFAULT;
 624
 625         if (nmask)
 626                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 627
 628         return err;
 629 }
 630
 631 #ifdef CONFIG_COMPAT
 632
 633 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 634                                      compat_ulong_t __user *nmask,
 635                                      compat_ulong_t maxnode,
 636                                      compat_ulong_t addr, compat_ulong_t flags)
 637 {
 638         long err;
 639         unsigned long __user *nm = NULL;
 640         unsigned long nr_bits, alloc_size;
 641         DECLARE_BITMAP(bm, MAX_NUMNODES);
 642
 643         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 644         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 645
 646         if (nmask)
 647                 nm = compat_alloc_user_space(alloc_size);
 648
 649         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 650
 651         if (!err && nmask) {
 652                 err = copy_from_user(bm, nm, alloc_size);
 653                 /* ensure entire bitmap is zeroed */
 654                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 655                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 656         }
 657
 658         return err;
 659 }
 660
 661 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 662                                      compat_ulong_t maxnode)
 663 {
 664         long err = 0;
 665         unsigned long __user *nm = NULL;
 666         unsigned long nr_bits, alloc_size;
 667         DECLARE_BITMAP(bm, MAX_NUMNODES);
 668
 669         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 670         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 671
 672         if (nmask) {
 673                 err = compat_get_bitmap(bm, nmask, nr_bits);
 674                 nm = compat_alloc_user_space(alloc_size);
 675                 err |= copy_to_user(nm, bm, alloc_size);
 676         }
 677
 678         if (err)
 679                 return -EFAULT;
 680
 681         return sys_set_mempolicy(mode, nm, nr_bits+1);
 682 }
 683
 684 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 685                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 686                              compat_ulong_t maxnode, compat_ulong_t flags)
 687 {
 688         long err = 0;
 689         unsigned long __user *nm = NULL;
 690         unsigned long nr_bits, alloc_size;
 691         nodemask_t bm;
 692
 693         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 694         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 695
 696         if (nmask) {
 697                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 698                 nm = compat_alloc_user_space(alloc_size);
 699                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 700         }
 701
 702         if (err)
 703                 return -EFAULT;
 704
 705         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 706 }
 707
 708 #endif
 709
 710 /* Return effective policy for a VMA */
 711 struct mempolicy *
 712 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
 713 {
 714         struct mempolicy *pol = task->mempolicy;
 715
 716         if (vma) {
 717                 if (vma->vm_ops && vma->vm_ops->get_policy)
 718                         pol = vma->vm_ops->get_policy(vma, addr);
 719                 else if (vma->vm_policy &&
 720                                 vma->vm_policy->policy != MPOL_DEFAULT)
 721                         pol = vma->vm_policy;
 722         }
 723         if (!pol)
 724                 pol = &default_policy;
 725         return pol;
 726 }
 727
 728 /* Return a zonelist representing a mempolicy */
 729 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 730 {
 731         int nd;
 732
 733         switch (policy->policy) {
 734         case MPOL_PREFERRED:
 735                 nd = policy->v.preferred_node;
 736                 if (nd < 0)
 737                         nd = numa_node_id();
 738                 break;
 739         case MPOL_BIND:
 740                 /* Lower zones don't get a policy applied */
 741                 /* Careful: current->mems_allowed might have moved */
 742                 if (gfp_zone(gfp) >= policy_zone)
 743                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 744                                 return policy->v.zonelist;
 745                 /*FALL THROUGH*/
 746         case MPOL_INTERLEAVE: /* should not happen */
 747         case MPOL_DEFAULT:
 748                 nd = numa_node_id();
 749                 break;
 750         default:
 751                 nd = 0;
 752                 BUG();
 753         }
 754         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 755 }
 756
 757 /* Do dynamic interleaving for a process */
 758 static unsigned interleave_nodes(struct mempolicy *policy)
 759 {
 760         unsigned nid, next;
 761         struct task_struct *me = current;
 762
 763         nid = me->il_next;
 764         next = next_node(nid, policy->v.nodes);
 765         if (next >= MAX_NUMNODES)
 766                 next = first_node(policy->v.nodes);
 767         me->il_next = next;
 768         return nid;
 769 }
 770
 771 /* Do static interleaving for a VMA with known offset. */
 772 static unsigned offset_il_node(struct mempolicy *pol,
 773                 struct vm_area_struct *vma, unsigned long off)
 774 {
 775         unsigned nnodes = nodes_weight(pol->v.nodes);
 776         unsigned target = (unsigned)off % nnodes;
 777         int c;
 778         int nid = -1;
 779
 780         c = 0;
 781         do {
 782                 nid = next_node(nid, pol->v.nodes);
 783                 c++;
 784         } while (c <= target);
 785         return nid;
 786 }
 787
 788 /* Determine a node number for interleave */
 789 static inline unsigned interleave_nid(struct mempolicy *pol,
 790                  struct vm_area_struct *vma, unsigned long addr, int shift)
 791 {
 792         if (vma) {
 793                 unsigned long off;
 794
 795                 off = vma->vm_pgoff;
 796                 off += (addr - vma->vm_start) >> shift;
 797                 return offset_il_node(pol, vma, off);
 798         } else
 799                 return interleave_nodes(pol);
 800 }
 801
 802 /* Return a zonelist suitable for a huge page allocation. */
 803 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 804 {
 805         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 806
 807         if (pol->policy == MPOL_INTERLEAVE) {
 808                 unsigned nid;
 809
 810                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
 811                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
 812         }
 813         return zonelist_policy(GFP_HIGHUSER, pol);
 814 }
 815
 816 /* Allocate a page in interleaved policy.
 817    Own path because it needs to do special accounting. */
 818 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 819                                         unsigned nid)
 820 {
 821         struct zonelist *zl;
 822         struct page *page;
 823
 824         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 825         page = __alloc_pages(gfp, order, zl);
 826         if (page && page_zone(page) == zl->zones[0]) {
 827                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
 828                 put_cpu();
 829         }
 830         return page;
 831 }
 832
 833 /**
 834  *      alloc_page_vma  - Allocate a page for a VMA.
 835  *
 836  *      @gfp:
 837  *      %GFP_USER    user allocation.
 838  *      %GFP_KERNEL  kernel allocations,
 839  *      %GFP_HIGHMEM highmem/user allocations,
 840  *      %GFP_FS      allocation should not call back into a file system.
 841  *      %GFP_ATOMIC  don't sleep.
 842  *
 843  *      @vma:  Pointer to VMA or NULL if not available.
 844  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
 845  *
 846  *      This function allocates a page from the kernel page pool and applies
 847  *      a NUMA policy associated with the VMA or the current process.
 848  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
 849  *      mm_struct of the VMA to prevent it from going away. Should be used for
 850  *      all allocations for pages that will be mapped into
 851  *      user space. Returns NULL when no page can be allocated.
 852  *
 853  *      Should be called with the mm_sem of the vma hold.
 854  */
 855 struct page *
 856 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 857 {
 858         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 859
 860         cpuset_update_current_mems_allowed();
 861
 862         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 863                 unsigned nid;
 864
 865                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 866                 return alloc_page_interleave(gfp, 0, nid);
 867         }
 868         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
 869 }
 870
 871 /**
 872  *      alloc_pages_current - Allocate pages.
 873  *
 874  *      @gfp:
 875  *              %GFP_USER   user allocation,
 876  *              %GFP_KERNEL kernel allocation,
 877  *              %GFP_HIGHMEM highmem allocation,
 878  *              %GFP_FS     don't call back into a file system.
 879  *              %GFP_ATOMIC don't sleep.
 880  *      @order: Power of two of allocation size in pages. 0 is a single page.
 881  *
 882  *      Allocate a page from the kernel page pool.  When not in
 883  *      interrupt context and apply the current process NUMA policy.
 884  *      Returns NULL when no page can be allocated.
 885  *
 886  *      Don't call cpuset_update_current_mems_allowed() unless
 887  *      1) it's ok to take cpuset_sem (can WAIT), and
 888  *      2) allocating for current task (not interrupt).
 889  */
 890 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 891 {
 892         struct mempolicy *pol = current->mempolicy;
 893
 894         if ((gfp & __GFP_WAIT) && !in_interrupt())
 895                 cpuset_update_current_mems_allowed();
 896         if (!pol || in_interrupt())
 897                 pol = &default_policy;
 898         if (pol->policy == MPOL_INTERLEAVE)
 899                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 900         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
 901 }
 902 EXPORT_SYMBOL(alloc_pages_current);
 903
 904 /* Slow path of a mempolicy copy */
 905 struct mempolicy *__mpol_copy(struct mempolicy *old)
 906 {
 907         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 908
 909         if (!new)
 910                 return ERR_PTR(-ENOMEM);
 911         *new = *old;
 912         atomic_set(&new->refcnt, 1);
 913         if (new->policy == MPOL_BIND) {
 914                 int sz = ksize(old->v.zonelist);
 915                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
 916                 if (!new->v.zonelist) {
 917                         kmem_cache_free(policy_cache, new);
 918                         return ERR_PTR(-ENOMEM);
 919                 }
 920                 memcpy(new->v.zonelist, old->v.zonelist, sz);
 921         }
 922         return new;
 923 }
 924
 925 /* Slow path of a mempolicy comparison */
 926 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 927 {
 928         if (!a || !b)
 929                 return 0;
 930         if (a->policy != b->policy)
 931                 return 0;
 932         switch (a->policy) {
 933         case MPOL_DEFAULT:
 934                 return 1;
 935         case MPOL_INTERLEAVE:
 936                 return nodes_equal(a->v.nodes, b->v.nodes);
 937         case MPOL_PREFERRED:
 938                 return a->v.preferred_node == b->v.preferred_node;
 939         case MPOL_BIND: {
 940                 int i;
 941                 for (i = 0; a->v.zonelist->zones[i]; i++)
 942                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
 943                                 return 0;
 944                 return b->v.zonelist->zones[i] == NULL;
 945         }
 946         default:
 947                 BUG();
 948                 return 0;
 949         }
 950 }
 951
 952 /* Slow path of a mpol destructor. */
 953 void __mpol_free(struct mempolicy *p)
 954 {
 955         if (!atomic_dec_and_test(&p->refcnt))
 956                 return;
 957         if (p->policy == MPOL_BIND)
 958                 kfree(p->v.zonelist);
 959         p->policy = MPOL_DEFAULT;
 960         kmem_cache_free(policy_cache, p);
 961 }
 962
 963 /*
 964  * Hugetlb policy. Same as above, just works with node numbers instead of
 965  * zonelists.
 966  */
 967
 968 /* Find first node suitable for an allocation */
 969 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
 970 {
 971         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 972
 973         switch (pol->policy) {
 974         case MPOL_DEFAULT:
 975                 return numa_node_id();
 976         case MPOL_BIND:
 977                 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
 978         case MPOL_INTERLEAVE:
 979                 return interleave_nodes(pol);
 980         case MPOL_PREFERRED:
 981                 return pol->v.preferred_node >= 0 ?
 982                                 pol->v.preferred_node : numa_node_id();
 983         }
 984         BUG();
 985         return 0;
 986 }
 987
 988 /* Find secondary valid nodes for an allocation */
 989 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
 990 {
 991         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 992
 993         switch (pol->policy) {
 994         case MPOL_PREFERRED:
 995         case MPOL_DEFAULT:
 996         case MPOL_INTERLEAVE:
 997                 return 1;
 998         case MPOL_BIND: {
 999                 struct zone **z;
1000                 for (z = pol->v.zonelist->zones; *z; z++)
1001                         if ((*z)->zone_pgdat->node_id == nid)
1002                                 return 1;
1003                 return 0;
1004         }
1005         default:
1006                 BUG();
1007                 return 0;
1008         }
1009 }
1010
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any access to the tree.
 */
1019
1020 /* lookup first element intersecting start-end */
1021 /* Caller holds sp->lock */
1022 static struct sp_node *
1023 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1024 {
1025         struct rb_node *n = sp->root.rb_node;
1026
1027         while (n) {
1028                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1029
1030                 if (start >= p->end)
1031                         n = n->rb_right;
1032                 else if (end <= p->start)
1033                         n = n->rb_left;
1034                 else
1035                         break;
1036         }
1037         if (!n)
1038                 return NULL;
1039         for (;;) {
1040                 struct sp_node *w = NULL;
1041                 struct rb_node *prev = rb_prev(n);
1042                 if (!prev)
1043                         break;
1044                 w = rb_entry(prev, struct sp_node, nd);
1045                 if (w->end <= start)
1046                         break;
1047                 n = prev;
1048         }
1049         return rb_entry(n, struct sp_node, nd);
1050 }
1051
1052 /* Insert a new shared policy into the list. */
1053 /* Caller holds sp->lock */
1054 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1055 {
1056         struct rb_node **p = &sp->root.rb_node;
1057         struct rb_node *parent = NULL;
1058         struct sp_node *nd;
1059
1060         while (*p) {
1061                 parent = *p;
1062                 nd = rb_entry(parent, struct sp_node, nd);
1063                 if (new->start < nd->start)
1064                         p = &(*p)->rb_left;
1065                 else if (new->end > nd->end)
1066                         p = &(*p)->rb_right;
1067                 else
1068                         BUG();
1069         }
1070         rb_link_node(&new->nd, parent, p);
1071         rb_insert_color(&new->nd, &sp->root);
1072         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1073                  new->policy ? new->policy->policy : 0);
1074 }
1075
1076 /* Find shared policy intersecting idx */
1077 struct mempolicy *
1078 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1079 {
1080         struct mempolicy *pol = NULL;
1081         struct sp_node *sn;
1082
1083         if (!sp->root.rb_node)
1084                 return NULL;
1085         spin_lock(&sp->lock);
1086         sn = sp_lookup(sp, idx, idx+1);
1087         if (sn) {
1088                 mpol_get(sn->policy);
1089                 pol = sn->policy;
1090         }
1091         spin_unlock(&sp->lock);
1092         return pol;
1093 }
1094
1095 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1096 {
1097         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1098         rb_erase(&n->nd, &sp->root);
1099         mpol_free(n->policy);
1100         kmem_cache_free(sn_cache, n);
1101 }
1102
1103 struct sp_node *
1104 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1105 {
1106         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1107
1108         if (!n)
1109                 return NULL;
1110         n->start = start;
1111         n->end = end;
1112         mpol_get(pol);
1113         n->policy = pol;
1114         return n;
1115 }
1116
1117 /* Replace a policy range. */
1118 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1119                                  unsigned long end, struct sp_node *new)
1120 {
1121         struct sp_node *n, *new2 = NULL;
1122
1123 restart:
1124         spin_lock(&sp->lock);
1125         n = sp_lookup(sp, start, end);
1126         /* Take care of old policies in the same range. */
1127         while (n && n->start < end) {
1128                 struct rb_node *next = rb_next(&n->nd);
1129                 if (n->start >= start) {
1130                         if (n->end <= end)
1131                                 sp_delete(sp, n);
1132                         else
1133                                 n->start = end;
1134                 } else {
1135                         /* Old policy spanning whole new range. */
1136                         if (n->end > end) {
1137                                 if (!new2) {
1138                                         spin_unlock(&sp->lock);
1139                                         new2 = sp_alloc(end, n->end, n->policy);
1140                                         if (!new2)
1141                                                 return -ENOMEM;
1142                                         goto restart;
1143                                 }
1144                                 n->end = start;
1145                                 sp_insert(sp, new2);
1146                                 new2 = NULL;
1147                                 break;
1148                         } else
1149                                 n->end = start;
1150                 }
1151                 if (!next)
1152                         break;
1153                 n = rb_entry(next, struct sp_node, nd);
1154         }
1155         if (new)
1156                 sp_insert(sp, new);
1157         spin_unlock(&sp->lock);
1158         if (new2) {
1159                 mpol_free(new2->policy);
1160                 kmem_cache_free(sn_cache, new2);
1161         }
1162         return 0;
1163 }
1164
1165 int mpol_set_shared_policy(struct shared_policy *info,
1166                         struct vm_area_struct *vma, struct mempolicy *npol)
1167 {
1168         int err;
1169         struct sp_node *new = NULL;
1170         unsigned long sz = vma_pages(vma);
1171
1172         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1173                  vma->vm_pgoff,
1174                  sz, npol? npol->policy : -1,
1175                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1176
1177         if (npol) {
1178                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1179                 if (!new)
1180                         return -ENOMEM;
1181         }
1182         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1183         if (err && new)
1184                 kmem_cache_free(sn_cache, new);
1185         return err;
1186 }
1187
1188 /* Free a backing policy store on inode delete. */
1189 void mpol_free_shared_policy(struct shared_policy *p)
1190 {
1191         struct sp_node *n;
1192         struct rb_node *next;
1193
1194         if (!p->root.rb_node)
1195                 return;
1196         spin_lock(&p->lock);
1197         next = rb_first(&p->root);
1198         while (next) {
1199                 n = rb_entry(next, struct sp_node, nd);
1200                 next = rb_next(&n->nd);
1201                 rb_erase(&n->nd, &p->root);
1202                 mpol_free(n->policy);
1203                 kmem_cache_free(sn_cache, n);
1204         }
1205         spin_unlock(&p->lock);
1206 }
1207
1208 /* assumes fs == KERNEL_DS */
1209 void __init numa_policy_init(void)
1210 {
1211         policy_cache = kmem_cache_create("numa_policy",
1212                                          sizeof(struct mempolicy),
1213                                          0, SLAB_PANIC, NULL, NULL);
1214
1215         sn_cache = kmem_cache_create("shared_policy_node",
1216                                      sizeof(struct sp_node),
1217                                      0, SLAB_PANIC, NULL, NULL);
1218
1219         /* Set interleaving policy for system init. This way not all
1220            the data structures allocated at system boot end up in node zero. */
1221
1222         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1223                 printk("numa_policy_init: interleaving failed\n");
1224 }
1225
1226 /* Reset policy of current process to default */
1227 void numa_default_policy(void)
1228 {
1229         do_set_mempolicy(MPOL_DEFAULT, NULL);
1230 }
1231
1232 /* Migrate a policy to a different set of nodes */
1233 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1234                                                         const nodemask_t *new)
1235 {
1236         nodemask_t tmp;
1237
1238         if (!pol)
1239                 return;
1240
1241         switch (pol->policy) {
1242         case MPOL_DEFAULT:
1243                 break;
1244         case MPOL_INTERLEAVE:
1245                 nodes_remap(tmp, pol->v.nodes, *old, *new);
1246                 pol->v.nodes = tmp;
1247                 current->il_next = node_remap(current->il_next, *old, *new);
1248                 break;
1249         case MPOL_PREFERRED:
1250                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1251                                                                 *old, *new);
1252                 break;
1253         case MPOL_BIND: {
1254                 nodemask_t nodes;
1255                 struct zone **z;
1256                 struct zonelist *zonelist;
1257
1258                 nodes_clear(nodes);
1259                 for (z = pol->v.zonelist->zones; *z; z++)
1260                         node_set((*z)->zone_pgdat->node_id, nodes);
1261                 nodes_remap(tmp, nodes, *old, *new);
1262                 nodes = tmp;
1263
1264                 zonelist = bind_zonelist(&nodes);
1265
1266                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1267                  * If that old zonelist has no remaining mems_allowed nodes,
1268                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1269                  */
1270
1271                 if (zonelist) {
1272                         /* Good - got mem - substitute new zonelist */
1273                         kfree(pol->v.zonelist);
1274                         pol->v.zonelist = zonelist;
1275                 }
1276                 break;
1277         }
1278         default:
1279                 BUG();
1280                 break;
1281         }
1282 }
1283
1284 /*
1285  * Someone moved this task to different nodes.  Fixup mempolicies.
1286  *
1287  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1288  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1289  */
1290 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1291 {
1292         rebind_policy(current->mempolicy, old, new);
1293 }