mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy an process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
  93 /* Internal flags */
  94 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  95 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  96 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  97
  98 static kmem_cache_t *policy_cache;
  99 static kmem_cache_t *sn_cache;
 100
 101 #define PDprintk(fmt...)
 102
 103 /* Highest zone. An specific allocation for a zone below that is not
 104    policied. */
 105 int policy_zone = ZONE_DMA;
 106
 107 struct mempolicy default_policy = {
 108         .refcnt = ATOMIC_INIT(1), /* never free it */
 109         .policy = MPOL_DEFAULT,
 110 };
 111
 112 /* Do sanity checking on a policy */
 113 static int mpol_check_policy(int mode, nodemask_t *nodes)
 114 {
 115         int empty = nodes_empty(*nodes);
 116
 117         switch (mode) {
 118         case MPOL_DEFAULT:
 119                 if (!empty)
 120                         return -EINVAL;
 121                 break;
 122         case MPOL_BIND:
 123         case MPOL_INTERLEAVE:
 124                 /* Preferred will only use the first bit, but allow
 125                    more for now. */
 126                 if (empty)
 127                         return -EINVAL;
 128                 break;
 129         }
 130         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 131 }
 132 /* Generate a custom zonelist for the BIND policy. */
 133 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 134 {
 135         struct zonelist *zl;
 136         int num, max, nd;
 137
 138         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 139         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 140         if (!zl)
 141                 return NULL;
 142         num = 0;
 143         for_each_node_mask(nd, *nodes)
 144                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 184         return policy;
 185 }
 186
 187 static void gather_stats(struct page *, void *);
 188 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 189                                 unsigned long flags);
 190
 191 /* Scan through pages checking if pages follow certain conditions. */
 192 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 193                 unsigned long addr, unsigned long end,
 194                 const nodemask_t *nodes, unsigned long flags,
 195                 void *private)
 196 {
 197         pte_t *orig_pte;
 198         pte_t *pte;
 199         spinlock_t *ptl;
 200
 201         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 202         do {
 203                 struct page *page;
 204                 unsigned int nid;
 205
 206                 if (!pte_present(*pte))
 207                         continue;
 208                 page = vm_normal_page(vma, addr, *pte);
 209                 if (!page)
 210                         continue;
 211                 /*
 212                  * The check for PageReserved here is important to avoid
 213                  * handling zero pages and other pages that may have been
 214                  * marked special by the system.
 215                  *
 216                  * If the PageReserved would not be checked here then f.e.
 217                  * the location of the zero page could have an influence
 218                  * on MPOL_MF_STRICT, zero pages would be counted for
 219                  * the per node stats, and there would be useless attempts
 220                  * to put zero pages on the migration list.
 221                  */
 222                 if (PageReserved(page))
 223                         continue;
 224                 nid = page_to_nid(page);
 225                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 226                         continue;
 227
 228                 if (flags & MPOL_MF_STATS)
 229                         gather_stats(page, private);
 230                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 231                         migrate_page_add(page, private, flags);
 232                 else
 233                         break;
 234         } while (pte++, addr += PAGE_SIZE, addr != end);
 235         pte_unmap_unlock(orig_pte, ptl);
 236         return addr != end;
 237 }
 238
 239 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 240                 unsigned long addr, unsigned long end,
 241                 const nodemask_t *nodes, unsigned long flags,
 242                 void *private)
 243 {
 244         pmd_t *pmd;
 245         unsigned long next;
 246
 247         pmd = pmd_offset(pud, addr);
 248         do {
 249                 next = pmd_addr_end(addr, end);
 250                 if (pmd_none_or_clear_bad(pmd))
 251                         continue;
 252                 if (check_pte_range(vma, pmd, addr, next, nodes,
 253                                     flags, private))
 254                         return -EIO;
 255         } while (pmd++, addr = next, addr != end);
 256         return 0;
 257 }
 258
 259 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 260                 unsigned long addr, unsigned long end,
 261                 const nodemask_t *nodes, unsigned long flags,
 262                 void *private)
 263 {
 264         pud_t *pud;
 265         unsigned long next;
 266
 267         pud = pud_offset(pgd, addr);
 268         do {
 269                 next = pud_addr_end(addr, end);
 270                 if (pud_none_or_clear_bad(pud))
 271                         continue;
 272                 if (check_pmd_range(vma, pud, addr, next, nodes,
 273                                     flags, private))
 274                         return -EIO;
 275         } while (pud++, addr = next, addr != end);
 276         return 0;
 277 }
 278
 279 static inline int check_pgd_range(struct vm_area_struct *vma,
 280                 unsigned long addr, unsigned long end,
 281                 const nodemask_t *nodes, unsigned long flags,
 282                 void *private)
 283 {
 284         pgd_t *pgd;
 285         unsigned long next;
 286
 287         pgd = pgd_offset(vma->vm_mm, addr);
 288         do {
 289                 next = pgd_addr_end(addr, end);
 290                 if (pgd_none_or_clear_bad(pgd))
 291                         continue;
 292                 if (check_pud_range(vma, pgd, addr, next, nodes,
 293                                     flags, private))
 294                         return -EIO;
 295         } while (pgd++, addr = next, addr != end);
 296         return 0;
 297 }
 298
 299 /* Check if a vma is migratable */
 300 static inline int vma_migratable(struct vm_area_struct *vma)
 301 {
 302         if (vma->vm_flags & (
 303                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 304                 return 0;
 305         return 1;
 306 }
 307
 308 /*
 309  * Check if all pages in a range are on a set of nodes.
 310  * If pagelist != NULL then isolate pages from the LRU and
 311  * put them on the pagelist.
 312  */
 313 static struct vm_area_struct *
 314 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 315                 const nodemask_t *nodes, unsigned long flags, void *private)
 316 {
 317         int err;
 318         struct vm_area_struct *first, *vma, *prev;
 319
 320         /* Clear the LRU lists so pages can be isolated */
 321         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 322                 lru_add_drain_all();
 323
 324         first = find_vma(mm, start);
 325         if (!first)
 326                 return ERR_PTR(-EFAULT);
 327         prev = NULL;
 328         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 329                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 330                         if (!vma->vm_next && vma->vm_end < end)
 331                                 return ERR_PTR(-EFAULT);
 332                         if (prev && prev->vm_end < vma->vm_start)
 333                                 return ERR_PTR(-EFAULT);
 334                 }
 335                 if (!is_vm_hugetlb_page(vma) &&
 336                     ((flags & MPOL_MF_STRICT) ||
 337                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 338                                 vma_migratable(vma)))) {
 339                         unsigned long endvma = vma->vm_end;
 340
 341                         if (endvma > end)
 342                                 endvma = end;
 343                         if (vma->vm_start > start)
 344                                 start = vma->vm_start;
 345                         err = check_pgd_range(vma, start, endvma, nodes,
 346                                                 flags, private);
 347                         if (err) {
 348                                 first = ERR_PTR(err);
 349                                 break;
 350                         }
 351                 }
 352                 prev = vma;
 353         }
 354         return first;
 355 }
 356
 357 /* Apply policy to a single VMA */
 358 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 359 {
 360         int err = 0;
 361         struct mempolicy *old = vma->vm_policy;
 362
 363         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 364                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 365                  vma->vm_ops, vma->vm_file,
 366                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 367
 368         if (vma->vm_ops && vma->vm_ops->set_policy)
 369                 err = vma->vm_ops->set_policy(vma, new);
 370         if (!err) {
 371                 mpol_get(new);
 372                 vma->vm_policy = new;
 373                 mpol_free(old);
 374         }
 375         return err;
 376 }
 377
 378 /* Step 2: apply policy to a range and do splits. */
 379 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 380                        unsigned long end, struct mempolicy *new)
 381 {
 382         struct vm_area_struct *next;
 383         int err;
 384
 385         err = 0;
 386         for (; vma && vma->vm_start < end; vma = next) {
 387                 next = vma->vm_next;
 388                 if (vma->vm_start < start)
 389                         err = split_vma(vma->vm_mm, vma, start, 1);
 390                 if (!err && vma->vm_end > end)
 391                         err = split_vma(vma->vm_mm, vma, end, 0);
 392                 if (!err)
 393                         err = policy_vma(vma, new);
 394                 if (err)
 395                         break;
 396         }
 397         return err;
 398 }
 399
 400 static int contextualize_policy(int mode, nodemask_t *nodes)
 401 {
 402         if (!nodes)
 403                 return 0;
 404
 405         cpuset_update_task_memory_state();
 406         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 407                 return -EINVAL;
 408         return mpol_check_policy(mode, nodes);
 409 }
 410
 411 /* Set the process memory policy */
 412 long do_set_mempolicy(int mode, nodemask_t *nodes)
 413 {
 414         struct mempolicy *new;
 415
 416         if (contextualize_policy(mode, nodes))
 417                 return -EINVAL;
 418         new = mpol_new(mode, nodes);
 419         if (IS_ERR(new))
 420                 return PTR_ERR(new);
 421         mpol_free(current->mempolicy);
 422         current->mempolicy = new;
 423         if (new && new->policy == MPOL_INTERLEAVE)
 424                 current->il_next = first_node(new->v.nodes);
 425         return 0;
 426 }
 427
 428 /* Fill a zone bitmap for a policy */
 429 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 430 {
 431         int i;
 432
 433         nodes_clear(*nodes);
 434         switch (p->policy) {
 435         case MPOL_BIND:
 436                 for (i = 0; p->v.zonelist->zones[i]; i++)
 437                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 438                                 *nodes);
 439                 break;
 440         case MPOL_DEFAULT:
 441                 break;
 442         case MPOL_INTERLEAVE:
 443                 *nodes = p->v.nodes;
 444                 break;
 445         case MPOL_PREFERRED:
 446                 /* or use current node instead of online map? */
 447                 if (p->v.preferred_node < 0)
 448                         *nodes = node_online_map;
 449                 else
 450                         node_set(p->v.preferred_node, *nodes);
 451                 break;
 452         default:
 453                 BUG();
 454         }
 455 }
 456
 457 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 458 {
 459         struct page *p;
 460         int err;
 461
 462         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 463         if (err >= 0) {
 464                 err = page_to_nid(p);
 465                 put_page(p);
 466         }
 467         return err;
 468 }
 469
 470 /* Retrieve NUMA policy */
 471 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 472                         unsigned long addr, unsigned long flags)
 473 {
 474         int err;
 475         struct mm_struct *mm = current->mm;
 476         struct vm_area_struct *vma = NULL;
 477         struct mempolicy *pol = current->mempolicy;
 478
 479         cpuset_update_task_memory_state();
 480         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 481                 return -EINVAL;
 482         if (flags & MPOL_F_ADDR) {
 483                 down_read(&mm->mmap_sem);
 484                 vma = find_vma_intersection(mm, addr, addr+1);
 485                 if (!vma) {
 486                         up_read(&mm->mmap_sem);
 487                         return -EFAULT;
 488                 }
 489                 if (vma->vm_ops && vma->vm_ops->get_policy)
 490                         pol = vma->vm_ops->get_policy(vma, addr);
 491                 else
 492                         pol = vma->vm_policy;
 493         } else if (addr)
 494                 return -EINVAL;
 495
 496         if (!pol)
 497                 pol = &default_policy;
 498
 499         if (flags & MPOL_F_NODE) {
 500                 if (flags & MPOL_F_ADDR) {
 501                         err = lookup_node(mm, addr);
 502                         if (err < 0)
 503                                 goto out;
 504                         *policy = err;
 505                 } else if (pol == current->mempolicy &&
 506                                 pol->policy == MPOL_INTERLEAVE) {
 507                         *policy = current->il_next;
 508                 } else {
 509                         err = -EINVAL;
 510                         goto out;
 511                 }
 512         } else
 513                 *policy = pol->policy;
 514
 515         if (vma) {
 516                 up_read(&current->mm->mmap_sem);
 517                 vma = NULL;
 518         }
 519
 520         err = 0;
 521         if (nmask)
 522                 get_zonemask(pol, nmask);
 523
 524  out:
 525         if (vma)
 526                 up_read(&current->mm->mmap_sem);
 527         return err;
 528 }
 529
 530 /*
 531  * page migration
 532  */
 533
 534 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 535                                 unsigned long flags)
 536 {
 537         /*
 538          * Avoid migrating a page that is shared with others.
 539          */
 540         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 541                 if (isolate_lru_page(page))
 542                         list_add(&page->lru, pagelist);
 543         }
 544 }
 545
 546 static int swap_pages(struct list_head *pagelist)
 547 {
 548         LIST_HEAD(moved);
 549         LIST_HEAD(failed);
 550         int n;
 551
 552         n = migrate_pages(pagelist, NULL, &moved, &failed);
 553         putback_lru_pages(&failed);
 554         putback_lru_pages(&moved);
 555
 556         return n;
 557 }
 558
 559 /*
 560  * For now migrate_pages simply swaps out the pages from nodes that are in
 561  * the source set but not in the target set. In the future, we would
 562  * want a function that moves pages between the two nodesets in such
 563  * a way as to preserve the physical layout as much as possible.
 564  *
 565  * Returns the number of page that could not be moved.
 566  */
 567 int do_migrate_pages(struct mm_struct *mm,
 568         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 569 {
 570         LIST_HEAD(pagelist);
 571         int count = 0;
 572         nodemask_t nodes;
 573
 574         nodes_andnot(nodes, *from_nodes, *to_nodes);
 575
 576         down_read(&mm->mmap_sem);
 577         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 578                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 579
 580         if (!list_empty(&pagelist)) {
 581                 count = swap_pages(&pagelist);
 582                 putback_lru_pages(&pagelist);
 583         }
 584
 585         up_read(&mm->mmap_sem);
 586         return count;
 587 }
 588
 589 long do_mbind(unsigned long start, unsigned long len,
 590                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 591 {
 592         struct vm_area_struct *vma;
 593         struct mm_struct *mm = current->mm;
 594         struct mempolicy *new;
 595         unsigned long end;
 596         int err;
 597         LIST_HEAD(pagelist);
 598
 599         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 600                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 601             || mode > MPOL_MAX)
 602                 return -EINVAL;
 603         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 604                 return -EPERM;
 605
 606         if (start & ~PAGE_MASK)
 607                 return -EINVAL;
 608
 609         if (mode == MPOL_DEFAULT)
 610                 flags &= ~MPOL_MF_STRICT;
 611
 612         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 613         end = start + len;
 614
 615         if (end < start)
 616                 return -EINVAL;
 617         if (end == start)
 618                 return 0;
 619
 620         if (mpol_check_policy(mode, nmask))
 621                 return -EINVAL;
 622
 623         new = mpol_new(mode, nmask);
 624         if (IS_ERR(new))
 625                 return PTR_ERR(new);
 626
 627         /*
 628          * If we are using the default policy then operation
 629          * on discontinuous address spaces is okay after all
 630          */
 631         if (!new)
 632                 flags |= MPOL_MF_DISCONTIG_OK;
 633
 634         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 635                         mode,nodes_addr(nodes)[0]);
 636
 637         down_write(&mm->mmap_sem);
 638         vma = check_range(mm, start, end, nmask,
 639                           flags | MPOL_MF_INVERT, &pagelist);
 640
 641         err = PTR_ERR(vma);
 642         if (!IS_ERR(vma)) {
 643                 int nr_failed = 0;
 644
 645                 err = mbind_range(vma, start, end, new);
 646                 if (!list_empty(&pagelist))
 647                         nr_failed = swap_pages(&pagelist);
 648
 649                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 650                         err = -EIO;
 651         }
 652         if (!list_empty(&pagelist))
 653                 putback_lru_pages(&pagelist);
 654
 655         up_write(&mm->mmap_sem);
 656         mpol_free(new);
 657         return err;
 658 }
 659
 660 /*
 661  * User space interface with variable sized bitmaps for nodelists.
 662  */
 663
 664 /* Copy a node mask from user space. */
 665 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 666                      unsigned long maxnode)
 667 {
 668         unsigned long k;
 669         unsigned long nlongs;
 670         unsigned long endmask;
 671
 672         --maxnode;
 673         nodes_clear(*nodes);
 674         if (maxnode == 0 || !nmask)
 675                 return 0;
 676
 677         nlongs = BITS_TO_LONGS(maxnode);
 678         if ((maxnode % BITS_PER_LONG) == 0)
 679                 endmask = ~0UL;
 680         else
 681                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 682
 683         /* When the user specified more nodes than supported just check
 684            if the non supported part is all zero. */
 685         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 686                 if (nlongs > PAGE_SIZE/sizeof(long))
 687                         return -EINVAL;
 688                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 689                         unsigned long t;
 690                         if (get_user(t, nmask + k))
 691                                 return -EFAULT;
 692                         if (k == nlongs - 1) {
 693                                 if (t & endmask)
 694                                         return -EINVAL;
 695                         } else if (t)
 696                                 return -EINVAL;
 697                 }
 698                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 699                 endmask = ~0UL;
 700         }
 701
 702         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 703                 return -EFAULT;
 704         nodes_addr(*nodes)[nlongs-1] &= endmask;
 705         return 0;
 706 }
 707
 708 /* Copy a kernel node mask to user space */
 709 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 710                               nodemask_t *nodes)
 711 {
 712         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 713         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 714
 715         if (copy > nbytes) {
 716                 if (copy > PAGE_SIZE)
 717                         return -EINVAL;
 718                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 719                         return -EFAULT;
 720                 copy = nbytes;
 721         }
 722         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 723 }
 724
 725 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 726                         unsigned long mode,
 727                         unsigned long __user *nmask, unsigned long maxnode,
 728                         unsigned flags)
 729 {
 730         nodemask_t nodes;
 731         int err;
 732
 733         err = get_nodes(&nodes, nmask, maxnode);
 734         if (err)
 735                 return err;
 736         return do_mbind(start, len, mode, &nodes, flags);
 737 }
 738
 739 /* Set the process memory policy */
 740 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 741                 unsigned long maxnode)
 742 {
 743         int err;
 744         nodemask_t nodes;
 745
 746         if (mode < 0 || mode > MPOL_MAX)
 747                 return -EINVAL;
 748         err = get_nodes(&nodes, nmask, maxnode);
 749         if (err)
 750                 return err;
 751         return do_set_mempolicy(mode, &nodes);
 752 }
 753
 754 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 755                 const unsigned long __user *old_nodes,
 756                 const unsigned long __user *new_nodes)
 757 {
 758         struct mm_struct *mm;
 759         struct task_struct *task;
 760         nodemask_t old;
 761         nodemask_t new;
 762         nodemask_t task_nodes;
 763         int err;
 764
 765         err = get_nodes(&old, old_nodes, maxnode);
 766         if (err)
 767                 return err;
 768
 769         err = get_nodes(&new, new_nodes, maxnode);
 770         if (err)
 771                 return err;
 772
 773         /* Find the mm_struct */
 774         read_lock(&tasklist_lock);
 775         task = pid ? find_task_by_pid(pid) : current;
 776         if (!task) {
 777                 read_unlock(&tasklist_lock);
 778                 return -ESRCH;
 779         }
 780         mm = get_task_mm(task);
 781         read_unlock(&tasklist_lock);
 782
 783         if (!mm)
 784                 return -EINVAL;
 785
 786         /*
 787          * Check if this process has the right to modify the specified
 788          * process. The right exists if the process has administrative
 789          * capabilities, superuser priviledges or the same
 790          * userid as the target process.
 791          */
 792         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 793             (current->uid != task->suid) && (current->uid != task->uid) &&
 794             !capable(CAP_SYS_ADMIN)) {
 795                 err = -EPERM;
 796                 goto out;
 797         }
 798
 799         task_nodes = cpuset_mems_allowed(task);
 800         /* Is the user allowed to access the target nodes? */
 801         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 802                 err = -EPERM;
 803                 goto out;
 804         }
 805
 806         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 807 out:
 808         mmput(mm);
 809         return err;
 810 }
 811
 812
 813 /* Retrieve NUMA policy */
 814 asmlinkage long sys_get_mempolicy(int __user *policy,
 815                                 unsigned long __user *nmask,
 816                                 unsigned long maxnode,
 817                                 unsigned long addr, unsigned long flags)
 818 {
 819         int err, pval;
 820         nodemask_t nodes;
 821
 822         if (nmask != NULL && maxnode < MAX_NUMNODES)
 823                 return -EINVAL;
 824
 825         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 826
 827         if (err)
 828                 return err;
 829
 830         if (policy && put_user(pval, policy))
 831                 return -EFAULT;
 832
 833         if (nmask)
 834                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 835
 836         return err;
 837 }
 838
 839 #ifdef CONFIG_COMPAT
 840
 841 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 842                                      compat_ulong_t __user *nmask,
 843                                      compat_ulong_t maxnode,
 844                                      compat_ulong_t addr, compat_ulong_t flags)
 845 {
 846         long err;
 847         unsigned long __user *nm = NULL;
 848         unsigned long nr_bits, alloc_size;
 849         DECLARE_BITMAP(bm, MAX_NUMNODES);
 850
 851         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 852         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 853
 854         if (nmask)
 855                 nm = compat_alloc_user_space(alloc_size);
 856
 857         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 858
 859         if (!err && nmask) {
 860                 err = copy_from_user(bm, nm, alloc_size);
 861                 /* ensure entire bitmap is zeroed */
 862                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 863                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 864         }
 865
 866         return err;
 867 }
 868
 869 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 870                                      compat_ulong_t maxnode)
 871 {
 872         long err = 0;
 873         unsigned long __user *nm = NULL;
 874         unsigned long nr_bits, alloc_size;
 875         DECLARE_BITMAP(bm, MAX_NUMNODES);
 876
 877         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 878         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 879
 880         if (nmask) {
 881                 err = compat_get_bitmap(bm, nmask, nr_bits);
 882                 nm = compat_alloc_user_space(alloc_size);
 883                 err |= copy_to_user(nm, bm, alloc_size);
 884         }
 885
 886         if (err)
 887                 return -EFAULT;
 888
 889         return sys_set_mempolicy(mode, nm, nr_bits+1);
 890 }
 891
 892 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 893                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 894                              compat_ulong_t maxnode, compat_ulong_t flags)
 895 {
 896         long err = 0;
 897         unsigned long __user *nm = NULL;
 898         unsigned long nr_bits, alloc_size;
 899         nodemask_t bm;
 900
 901         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 902         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 903
 904         if (nmask) {
 905                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 906                 nm = compat_alloc_user_space(alloc_size);
 907                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 908         }
 909
 910         if (err)
 911                 return -EFAULT;
 912
 913         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 914 }
 915
 916 #endif
 917
 918 /* Return effective policy for a VMA */
 919 static struct mempolicy * get_vma_policy(struct task_struct *task,
 920                 struct vm_area_struct *vma, unsigned long addr)
 921 {
 922         struct mempolicy *pol = task->mempolicy;
 923
 924         if (vma) {
 925                 if (vma->vm_ops && vma->vm_ops->get_policy)
 926                         pol = vma->vm_ops->get_policy(vma, addr);
 927                 else if (vma->vm_policy &&
 928                                 vma->vm_policy->policy != MPOL_DEFAULT)
 929                         pol = vma->vm_policy;
 930         }
 931         if (!pol)
 932                 pol = &default_policy;
 933         return pol;
 934 }
 935
 936 /* Return a zonelist representing a mempolicy */
 937 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 938 {
 939         int nd;
 940
 941         switch (policy->policy) {
 942         case MPOL_PREFERRED:
 943                 nd = policy->v.preferred_node;
 944                 if (nd < 0)
 945                         nd = numa_node_id();
 946                 break;
 947         case MPOL_BIND:
 948                 /* Lower zones don't get a policy applied */
 949                 /* Careful: current->mems_allowed might have moved */
 950                 if (gfp_zone(gfp) >= policy_zone)
 951                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 952                                 return policy->v.zonelist;
 953                 /*FALL THROUGH*/
 954         case MPOL_INTERLEAVE: /* should not happen */
 955         case MPOL_DEFAULT:
 956                 nd = numa_node_id();
 957                 break;
 958         default:
 959                 nd = 0;
 960                 BUG();
 961         }
 962         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 963 }
 964
 965 /* Do dynamic interleaving for a process */
 966 static unsigned interleave_nodes(struct mempolicy *policy)
 967 {
 968         unsigned nid, next;
 969         struct task_struct *me = current;
 970
 971         nid = me->il_next;
 972         next = next_node(nid, policy->v.nodes);
 973         if (next >= MAX_NUMNODES)
 974                 next = first_node(policy->v.nodes);
 975         me->il_next = next;
 976         return nid;
 977 }
 978
 979 /*
 980  * Depending on the memory policy provide a node from which to allocate the
 981  * next slab entry.
 982  */
 983 unsigned slab_node(struct mempolicy *policy)
 984 {
 985         if (in_interrupt())
 986                 return numa_node_id();
 987
 988         switch (policy->policy) {
 989         case MPOL_INTERLEAVE:
 990                 return interleave_nodes(policy);
 991
 992         case MPOL_BIND:
 993                 /*
 994                  * Follow bind policy behavior and start allocation at the
 995                  * first node.
 996                  */
 997                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
 998
 999         case MPOL_PREFERRED:
1000                 if (policy->v.preferred_node >= 0)
1001                         return policy->v.preferred_node;
1002                 /* Fall through */
1003
1004         default:
1005                 return numa_node_id();
1006         }
1007 }
1008
1009 /* Do static interleaving for a VMA with known offset. */
1010 static unsigned offset_il_node(struct mempolicy *pol,
1011                 struct vm_area_struct *vma, unsigned long off)
1012 {
1013         unsigned nnodes = nodes_weight(pol->v.nodes);
1014         unsigned target = (unsigned)off % nnodes;
1015         int c;
1016         int nid = -1;
1017
1018         c = 0;
1019         do {
1020                 nid = next_node(nid, pol->v.nodes);
1021                 c++;
1022         } while (c <= target);
1023         return nid;
1024 }
1025
1026 /* Determine a node number for interleave */
1027 static inline unsigned interleave_nid(struct mempolicy *pol,
1028                  struct vm_area_struct *vma, unsigned long addr, int shift)
1029 {
1030         if (vma) {
1031                 unsigned long off;
1032
1033                 off = vma->vm_pgoff;
1034                 off += (addr - vma->vm_start) >> shift;
1035                 return offset_il_node(pol, vma, off);
1036         } else
1037                 return interleave_nodes(pol);
1038 }
1039
1040 /* Return a zonelist suitable for a huge page allocation. */
1041 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1042 {
1043         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1044
1045         if (pol->policy == MPOL_INTERLEAVE) {
1046                 unsigned nid;
1047
1048                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1049                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1050         }
1051         return zonelist_policy(GFP_HIGHUSER, pol);
1052 }
1053
1054 /* Allocate a page in interleaved policy.
1055    Own path because it needs to do special accounting. */
1056 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1057                                         unsigned nid)
1058 {
1059         struct zonelist *zl;
1060         struct page *page;
1061
1062         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1063         page = __alloc_pages(gfp, order, zl);
1064         if (page && page_zone(page) == zl->zones[0]) {
1065                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1066                 put_cpu();
1067         }
1068         return page;
1069 }
1070
1071 /**
1072  *      alloc_page_vma  - Allocate a page for a VMA.
1073  *
1074  *      @gfp:
1075  *      %GFP_USER    user allocation.
1076  *      %GFP_KERNEL  kernel allocations,
1077  *      %GFP_HIGHMEM highmem/user allocations,
1078  *      %GFP_FS      allocation should not call back into a file system.
1079  *      %GFP_ATOMIC  don't sleep.
1080  *
1081  *      @vma:  Pointer to VMA or NULL if not available.
1082  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1083  *
1084  *      This function allocates a page from the kernel page pool and applies
1085  *      a NUMA policy associated with the VMA or the current process.
1086  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1087  *      mm_struct of the VMA to prevent it from going away. Should be used for
1088  *      all allocations for pages that will be mapped into
1089  *      user space. Returns NULL when no page can be allocated.
1090  *
1091  *      Should be called with the mm_sem of the vma hold.
1092  */
1093 struct page *
1094 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1095 {
1096         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1097
1098         cpuset_update_task_memory_state();
1099
1100         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1101                 unsigned nid;
1102
1103                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1104                 return alloc_page_interleave(gfp, 0, nid);
1105         }
1106         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1107 }
1108
1109 /**
1110  *      alloc_pages_current - Allocate pages.
1111  *
1112  *      @gfp:
1113  *              %GFP_USER   user allocation,
1114  *              %GFP_KERNEL kernel allocation,
1115  *              %GFP_HIGHMEM highmem allocation,
1116  *              %GFP_FS     don't call back into a file system.
1117  *              %GFP_ATOMIC don't sleep.
1118  *      @order: Power of two of allocation size in pages. 0 is a single page.
1119  *
1120  *      Allocate a page from the kernel page pool.  When not in
1121  *      interrupt context and apply the current process NUMA policy.
1122  *      Returns NULL when no page can be allocated.
1123  *
1124  *      Don't call cpuset_update_task_memory_state() unless
1125  *      1) it's ok to take cpuset_sem (can WAIT), and
1126  *      2) allocating for current task (not interrupt).
1127  */
1128 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1129 {
1130         struct mempolicy *pol = current->mempolicy;
1131
1132         if ((gfp & __GFP_WAIT) && !in_interrupt())
1133                 cpuset_update_task_memory_state();
1134         if (!pol || in_interrupt())
1135                 pol = &default_policy;
1136         if (pol->policy == MPOL_INTERLEAVE)
1137                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1138         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1139 }
1140 EXPORT_SYMBOL(alloc_pages_current);
1141
1142 /*
1143  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1144  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1145  * with the mems_allowed returned by cpuset_mems_allowed().  This
1146  * keeps mempolicies cpuset relative after its cpuset moves.  See
1147  * further kernel/cpuset.c update_nodemask().
1148  */
1149 void *cpuset_being_rebound;
1150
1151 /* Slow path of a mempolicy copy */
1152 struct mempolicy *__mpol_copy(struct mempolicy *old)
1153 {
1154         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1155
1156         if (!new)
1157                 return ERR_PTR(-ENOMEM);
1158         if (current_cpuset_is_being_rebound()) {
1159                 nodemask_t mems = cpuset_mems_allowed(current);
1160                 mpol_rebind_policy(old, &mems);
1161         }
1162         *new = *old;
1163         atomic_set(&new->refcnt, 1);
1164         if (new->policy == MPOL_BIND) {
1165                 int sz = ksize(old->v.zonelist);
1166                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1167                 if (!new->v.zonelist) {
1168                         kmem_cache_free(policy_cache, new);
1169                         return ERR_PTR(-ENOMEM);
1170                 }
1171                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1172         }
1173         return new;
1174 }
1175
1176 /* Slow path of a mempolicy comparison */
1177 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1178 {
1179         if (!a || !b)
1180                 return 0;
1181         if (a->policy != b->policy)
1182                 return 0;
1183         switch (a->policy) {
1184         case MPOL_DEFAULT:
1185                 return 1;
1186         case MPOL_INTERLEAVE:
1187                 return nodes_equal(a->v.nodes, b->v.nodes);
1188         case MPOL_PREFERRED:
1189                 return a->v.preferred_node == b->v.preferred_node;
1190         case MPOL_BIND: {
1191                 int i;
1192                 for (i = 0; a->v.zonelist->zones[i]; i++)
1193                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1194                                 return 0;
1195                 return b->v.zonelist->zones[i] == NULL;
1196         }
1197         default:
1198                 BUG();
1199                 return 0;
1200         }
1201 }
1202
1203 /* Slow path of a mpol destructor. */
1204 void __mpol_free(struct mempolicy *p)
1205 {
1206         if (!atomic_dec_and_test(&p->refcnt))
1207                 return;
1208         if (p->policy == MPOL_BIND)
1209                 kfree(p->v.zonelist);
1210         p->policy = MPOL_DEFAULT;
1211         kmem_cache_free(policy_cache, p);
1212 }
1213
1214 /*
1215  * Shared memory backing store policy support.
1216  *
1217  * Remember policies even when nobody has shared memory mapped.
1218  * The policies are kept in Red-Black tree linked from the inode.
1219  * They are protected by the sp->lock spinlock, which should be held
1220  * for any accesses to the tree.
1221  */
1222
1223 /* lookup first element intersecting start-end */
1224 /* Caller holds sp->lock */
1225 static struct sp_node *
1226 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1227 {
1228         struct rb_node *n = sp->root.rb_node;
1229
1230         while (n) {
1231                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1232
1233                 if (start >= p->end)
1234                         n = n->rb_right;
1235                 else if (end <= p->start)
1236                         n = n->rb_left;
1237                 else
1238                         break;
1239         }
1240         if (!n)
1241                 return NULL;
1242         for (;;) {
1243                 struct sp_node *w = NULL;
1244                 struct rb_node *prev = rb_prev(n);
1245                 if (!prev)
1246                         break;
1247                 w = rb_entry(prev, struct sp_node, nd);
1248                 if (w->end <= start)
1249                         break;
1250                 n = prev;
1251         }
1252         return rb_entry(n, struct sp_node, nd);
1253 }
1254
1255 /* Insert a new shared policy into the list. */
1256 /* Caller holds sp->lock */
1257 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1258 {
1259         struct rb_node **p = &sp->root.rb_node;
1260         struct rb_node *parent = NULL;
1261         struct sp_node *nd;
1262
1263         while (*p) {
1264                 parent = *p;
1265                 nd = rb_entry(parent, struct sp_node, nd);
1266                 if (new->start < nd->start)
1267                         p = &(*p)->rb_left;
1268                 else if (new->end > nd->end)
1269                         p = &(*p)->rb_right;
1270                 else
1271                         BUG();
1272         }
1273         rb_link_node(&new->nd, parent, p);
1274         rb_insert_color(&new->nd, &sp->root);
1275         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1276                  new->policy ? new->policy->policy : 0);
1277 }
1278
1279 /* Find shared policy intersecting idx */
1280 struct mempolicy *
1281 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1282 {
1283         struct mempolicy *pol = NULL;
1284         struct sp_node *sn;
1285
1286         if (!sp->root.rb_node)
1287                 return NULL;
1288         spin_lock(&sp->lock);
1289         sn = sp_lookup(sp, idx, idx+1);
1290         if (sn) {
1291                 mpol_get(sn->policy);
1292                 pol = sn->policy;
1293         }
1294         spin_unlock(&sp->lock);
1295         return pol;
1296 }
1297
1298 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1299 {
1300         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1301         rb_erase(&n->nd, &sp->root);
1302         mpol_free(n->policy);
1303         kmem_cache_free(sn_cache, n);
1304 }
1305
1306 struct sp_node *
1307 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1308 {
1309         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1310
1311         if (!n)
1312                 return NULL;
1313         n->start = start;
1314         n->end = end;
1315         mpol_get(pol);
1316         n->policy = pol;
1317         return n;
1318 }
1319
1320 /* Replace a policy range. */
1321 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1322                                  unsigned long end, struct sp_node *new)
1323 {
1324         struct sp_node *n, *new2 = NULL;
1325
1326 restart:
1327         spin_lock(&sp->lock);
1328         n = sp_lookup(sp, start, end);
1329         /* Take care of old policies in the same range. */
1330         while (n && n->start < end) {
1331                 struct rb_node *next = rb_next(&n->nd);
1332                 if (n->start >= start) {
1333                         if (n->end <= end)
1334                                 sp_delete(sp, n);
1335                         else
1336                                 n->start = end;
1337                 } else {
1338                         /* Old policy spanning whole new range. */
1339                         if (n->end > end) {
1340                                 if (!new2) {
1341                                         spin_unlock(&sp->lock);
1342                                         new2 = sp_alloc(end, n->end, n->policy);
1343                                         if (!new2)
1344                                                 return -ENOMEM;
1345                                         goto restart;
1346                                 }
1347                                 n->end = start;
1348                                 sp_insert(sp, new2);
1349                                 new2 = NULL;
1350                                 break;
1351                         } else
1352                                 n->end = start;
1353                 }
1354                 if (!next)
1355                         break;
1356                 n = rb_entry(next, struct sp_node, nd);
1357         }
1358         if (new)
1359                 sp_insert(sp, new);
1360         spin_unlock(&sp->lock);
1361         if (new2) {
1362                 mpol_free(new2->policy);
1363                 kmem_cache_free(sn_cache, new2);
1364         }
1365         return 0;
1366 }
1367
1368 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1369                                 nodemask_t *policy_nodes)
1370 {
1371         info->root = RB_ROOT;
1372         spin_lock_init(&info->lock);
1373
1374         if (policy != MPOL_DEFAULT) {
1375                 struct mempolicy *newpol;
1376
1377                 /* Falls back to MPOL_DEFAULT on any error */
1378                 newpol = mpol_new(policy, policy_nodes);
1379                 if (!IS_ERR(newpol)) {
1380                         /* Create pseudo-vma that contains just the policy */
1381                         struct vm_area_struct pvma;
1382
1383                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1384                         /* Policy covers entire file */
1385                         pvma.vm_end = TASK_SIZE;
1386                         mpol_set_shared_policy(info, &pvma, newpol);
1387                         mpol_free(newpol);
1388                 }
1389         }
1390 }
1391
1392 int mpol_set_shared_policy(struct shared_policy *info,
1393                         struct vm_area_struct *vma, struct mempolicy *npol)
1394 {
1395         int err;
1396         struct sp_node *new = NULL;
1397         unsigned long sz = vma_pages(vma);
1398
1399         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1400                  vma->vm_pgoff,
1401                  sz, npol? npol->policy : -1,
1402                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1403
1404         if (npol) {
1405                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1406                 if (!new)
1407                         return -ENOMEM;
1408         }
1409         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1410         if (err && new)
1411                 kmem_cache_free(sn_cache, new);
1412         return err;
1413 }
1414
1415 /* Free a backing policy store on inode delete. */
1416 void mpol_free_shared_policy(struct shared_policy *p)
1417 {
1418         struct sp_node *n;
1419         struct rb_node *next;
1420
1421         if (!p->root.rb_node)
1422                 return;
1423         spin_lock(&p->lock);
1424         next = rb_first(&p->root);
1425         while (next) {
1426                 n = rb_entry(next, struct sp_node, nd);
1427                 next = rb_next(&n->nd);
1428                 rb_erase(&n->nd, &p->root);
1429                 mpol_free(n->policy);
1430                 kmem_cache_free(sn_cache, n);
1431         }
1432         spin_unlock(&p->lock);
1433 }
1434
1435 /* assumes fs == KERNEL_DS */
1436 void __init numa_policy_init(void)
1437 {
1438         policy_cache = kmem_cache_create("numa_policy",
1439                                          sizeof(struct mempolicy),
1440                                          0, SLAB_PANIC, NULL, NULL);
1441
1442         sn_cache = kmem_cache_create("shared_policy_node",
1443                                      sizeof(struct sp_node),
1444                                      0, SLAB_PANIC, NULL, NULL);
1445
1446         /* Set interleaving policy for system init. This way not all
1447            the data structures allocated at system boot end up in node zero. */
1448
1449         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1450                 printk("numa_policy_init: interleaving failed\n");
1451 }
1452
1453 /* Reset policy of current process to default */
1454 void numa_default_policy(void)
1455 {
1456         do_set_mempolicy(MPOL_DEFAULT, NULL);
1457 }
1458
1459 /* Migrate a policy to a different set of nodes */
1460 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1461 {
1462         nodemask_t *mpolmask;
1463         nodemask_t tmp;
1464
1465         if (!pol)
1466                 return;
1467         mpolmask = &pol->cpuset_mems_allowed;
1468         if (nodes_equal(*mpolmask, *newmask))
1469                 return;
1470
1471         switch (pol->policy) {
1472         case MPOL_DEFAULT:
1473                 break;
1474         case MPOL_INTERLEAVE:
1475                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1476                 pol->v.nodes = tmp;
1477                 *mpolmask = *newmask;
1478                 current->il_next = node_remap(current->il_next,
1479                                                 *mpolmask, *newmask);
1480                 break;
1481         case MPOL_PREFERRED:
1482                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1483                                                 *mpolmask, *newmask);
1484                 *mpolmask = *newmask;
1485                 break;
1486         case MPOL_BIND: {
1487                 nodemask_t nodes;
1488                 struct zone **z;
1489                 struct zonelist *zonelist;
1490
1491                 nodes_clear(nodes);
1492                 for (z = pol->v.zonelist->zones; *z; z++)
1493                         node_set((*z)->zone_pgdat->node_id, nodes);
1494                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1495                 nodes = tmp;
1496
1497                 zonelist = bind_zonelist(&nodes);
1498
1499                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1500                  * If that old zonelist has no remaining mems_allowed nodes,
1501                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1502                  */
1503
1504                 if (zonelist) {
1505                         /* Good - got mem - substitute new zonelist */
1506                         kfree(pol->v.zonelist);
1507                         pol->v.zonelist = zonelist;
1508                 }
1509                 *mpolmask = *newmask;
1510                 break;
1511         }
1512         default:
1513                 BUG();
1514                 break;
1515         }
1516 }
1517
1518 /*
1519  * Wrapper for mpol_rebind_policy() that just requires task
1520  * pointer, and updates task mempolicy.
1521  */
1522
1523 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1524 {
1525         mpol_rebind_policy(tsk->mempolicy, new);
1526 }
1527
1528 /*
1529  * Rebind each vma in mm to new nodemask.
1530  *
1531  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1532  */
1533
1534 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1535 {
1536         struct vm_area_struct *vma;
1537
1538         down_write(&mm->mmap_sem);
1539         for (vma = mm->mmap; vma; vma = vma->vm_next)
1540                 mpol_rebind_policy(vma->vm_policy, new);
1541         up_write(&mm->mmap_sem);
1542 }
1543
1544 /*
1545  * Display pages allocated per node and memory policy via /proc.
1546  */
1547
1548 static const char *policy_types[] = { "default", "prefer", "bind",
1549                                       "interleave" };
1550
1551 /*
1552  * Convert a mempolicy into a string.
1553  * Returns the number of characters in buffer (if positive)
1554  * or an error (negative)
1555  */
1556 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1557 {
1558         char *p = buffer;
1559         int l;
1560         nodemask_t nodes;
1561         int mode = pol ? pol->policy : MPOL_DEFAULT;
1562
1563         switch (mode) {
1564         case MPOL_DEFAULT:
1565                 nodes_clear(nodes);
1566                 break;
1567
1568         case MPOL_PREFERRED:
1569                 nodes_clear(nodes);
1570                 node_set(pol->v.preferred_node, nodes);
1571                 break;
1572
1573         case MPOL_BIND:
1574                 get_zonemask(pol, &nodes);
1575                 break;
1576
1577         case MPOL_INTERLEAVE:
1578                 nodes = pol->v.nodes;
1579                 break;
1580
1581         default:
1582                 BUG();
1583                 return -EFAULT;
1584         }
1585
1586         l = strlen(policy_types[mode]);
1587         if (buffer + maxlen < p + l + 1)
1588                 return -ENOSPC;
1589
1590         strcpy(p, policy_types[mode]);
1591         p += l;
1592
1593         if (!nodes_empty(nodes)) {
1594                 if (buffer + maxlen < p + 2)
1595                         return -ENOSPC;
1596                 *p++ = '=';
1597                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1598         }
1599         return p - buffer;
1600 }
1601
1602 struct numa_maps {
1603         unsigned long pages;
1604         unsigned long anon;
1605         unsigned long mapped;
1606         unsigned long mapcount_max;
1607         unsigned long node[MAX_NUMNODES];
1608 };
1609
1610 static void gather_stats(struct page *page, void *private)
1611 {
1612         struct numa_maps *md = private;
1613         int count = page_mapcount(page);
1614
1615         if (count)
1616                 md->mapped++;
1617
1618         if (count > md->mapcount_max)
1619                 md->mapcount_max = count;
1620
1621         md->pages++;
1622
1623         if (PageAnon(page))
1624                 md->anon++;
1625
1626         md->node[page_to_nid(page)]++;
1627         cond_resched();
1628 }
1629
1630 int show_numa_map(struct seq_file *m, void *v)
1631 {
1632         struct task_struct *task = m->private;
1633         struct vm_area_struct *vma = v;
1634         struct numa_maps *md;
1635         int n;
1636         char buffer[50];
1637
1638         if (!vma->vm_mm)
1639                 return 0;
1640
1641         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1642         if (!md)
1643                 return 0;
1644
1645         check_pgd_range(vma, vma->vm_start, vma->vm_end,
1646                     &node_online_map, MPOL_MF_STATS, md);
1647
1648         if (md->pages) {
1649                 mpol_to_str(buffer, sizeof(buffer),
1650                             get_vma_policy(task, vma, vma->vm_start));
1651
1652                 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1653                            vma->vm_start, buffer, md->pages,
1654                            md->mapped, md->mapcount_max);
1655
1656                 if (md->anon)
1657                         seq_printf(m," anon=%lu",md->anon);
1658
1659                 for_each_online_node(n)
1660                         if (md->node[n])
1661                                 seq_printf(m, " N%d=%lu", n, md->node[n]);
1662
1663                 seq_putc(m, '\n');
1664         }
1665         kfree(md);
1666
1667         if (m->count < m->size)
1668                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1669         return 0;
1670 }
1671