mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
   65    kernel is not always graceful about that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
  93 /* Internal flags */
  94 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  95 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  96 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  97
  98 static kmem_cache_t *policy_cache;
  99 static kmem_cache_t *sn_cache;
 100
 101 #define PDprintk(fmt...)
 102
 103 /* Highest zone. An specific allocation for a zone below that is not
 104    policied. */
 105 int policy_zone = ZONE_DMA;
 106
 107 struct mempolicy default_policy = {
 108         .refcnt = ATOMIC_INIT(1), /* never free it */
 109         .policy = MPOL_DEFAULT,
 110 };
 111
 112 /* Do sanity checking on a policy */
 113 static int mpol_check_policy(int mode, nodemask_t *nodes)
 114 {
 115         int empty = nodes_empty(*nodes);
 116
 117         switch (mode) {
 118         case MPOL_DEFAULT:
 119                 if (!empty)
 120                         return -EINVAL;
 121                 break;
 122         case MPOL_BIND:
 123         case MPOL_INTERLEAVE:
 124                 /* Preferred will only use the first bit, but allow
 125                    more for now. */
 126                 if (empty)
 127                         return -EINVAL;
 128                 break;
 129         }
 130         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 131 }
 132 /* Generate a custom zonelist for the BIND policy. */
 133 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 134 {
 135         struct zonelist *zl;
 136         int num, max, nd;
 137
 138         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 139         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 140         if (!zl)
 141                 return NULL;
 142         num = 0;
 143         for_each_node_mask(nd, *nodes)
 144                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 184         return policy;
 185 }
 186
 187 static void gather_stats(struct page *, void *);
 188 static void migrate_page_add(struct vm_area_struct *vma,
 189         struct page *page, struct list_head *pagelist, unsigned long flags);
 190
 191 /* Scan through pages checking if pages follow certain conditions. */
 192 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 193                 unsigned long addr, unsigned long end,
 194                 const nodemask_t *nodes, unsigned long flags,
 195                 void *private)
 196 {
 197         pte_t *orig_pte;
 198         pte_t *pte;
 199         spinlock_t *ptl;
 200
 201         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 202         do {
 203                 struct page *page;
 204                 unsigned int nid;
 205
 206                 if (!pte_present(*pte))
 207                         continue;
 208                 page = vm_normal_page(vma, addr, *pte);
 209                 if (!page)
 210                         continue;
 211                 if (PageReserved(page))
 212                         continue;
 213                 nid = page_to_nid(page);
 214                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 215                         continue;
 216
 217                 if (flags & MPOL_MF_STATS)
 218                         gather_stats(page, private);
 219                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 220                         spin_unlock(ptl);
 221                         migrate_page_add(vma, page, private, flags);
 222                         spin_lock(ptl);
 223                 }
 224                 else
 225                         break;
 226         } while (pte++, addr += PAGE_SIZE, addr != end);
 227         pte_unmap_unlock(orig_pte, ptl);
 228         return addr != end;
 229 }
 230
 231 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 232                 unsigned long addr, unsigned long end,
 233                 const nodemask_t *nodes, unsigned long flags,
 234                 void *private)
 235 {
 236         pmd_t *pmd;
 237         unsigned long next;
 238
 239         pmd = pmd_offset(pud, addr);
 240         do {
 241                 next = pmd_addr_end(addr, end);
 242                 if (pmd_none_or_clear_bad(pmd))
 243                         continue;
 244                 if (check_pte_range(vma, pmd, addr, next, nodes,
 245                                     flags, private))
 246                         return -EIO;
 247         } while (pmd++, addr = next, addr != end);
 248         return 0;
 249 }
 250
 251 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 252                 unsigned long addr, unsigned long end,
 253                 const nodemask_t *nodes, unsigned long flags,
 254                 void *private)
 255 {
 256         pud_t *pud;
 257         unsigned long next;
 258
 259         pud = pud_offset(pgd, addr);
 260         do {
 261                 next = pud_addr_end(addr, end);
 262                 if (pud_none_or_clear_bad(pud))
 263                         continue;
 264                 if (check_pmd_range(vma, pud, addr, next, nodes,
 265                                     flags, private))
 266                         return -EIO;
 267         } while (pud++, addr = next, addr != end);
 268         return 0;
 269 }
 270
 271 static inline int check_pgd_range(struct vm_area_struct *vma,
 272                 unsigned long addr, unsigned long end,
 273                 const nodemask_t *nodes, unsigned long flags,
 274                 void *private)
 275 {
 276         pgd_t *pgd;
 277         unsigned long next;
 278
 279         pgd = pgd_offset(vma->vm_mm, addr);
 280         do {
 281                 next = pgd_addr_end(addr, end);
 282                 if (pgd_none_or_clear_bad(pgd))
 283                         continue;
 284                 if (check_pud_range(vma, pgd, addr, next, nodes,
 285                                     flags, private))
 286                         return -EIO;
 287         } while (pgd++, addr = next, addr != end);
 288         return 0;
 289 }
 290
 291 /* Check if a vma is migratable */
 292 static inline int vma_migratable(struct vm_area_struct *vma)
 293 {
 294         if (vma->vm_flags & (
 295                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 296                 return 0;
 297         return 1;
 298 }
 299
 300 /*
 301  * Check if all pages in a range are on a set of nodes.
 302  * If pagelist != NULL then isolate pages from the LRU and
 303  * put them on the pagelist.
 304  */
 305 static struct vm_area_struct *
 306 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 307                 const nodemask_t *nodes, unsigned long flags, void *private)
 308 {
 309         int err;
 310         struct vm_area_struct *first, *vma, *prev;
 311
 312         first = find_vma(mm, start);
 313         if (!first)
 314                 return ERR_PTR(-EFAULT);
 315         prev = NULL;
 316         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 317                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 318                         if (!vma->vm_next && vma->vm_end < end)
 319                                 return ERR_PTR(-EFAULT);
 320                         if (prev && prev->vm_end < vma->vm_start)
 321                                 return ERR_PTR(-EFAULT);
 322                 }
 323                 if (!is_vm_hugetlb_page(vma) &&
 324                     ((flags & MPOL_MF_STRICT) ||
 325                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 326                                 vma_migratable(vma)))) {
 327                         unsigned long endvma = vma->vm_end;
 328
 329                         if (endvma > end)
 330                                 endvma = end;
 331                         if (vma->vm_start > start)
 332                                 start = vma->vm_start;
 333                         err = check_pgd_range(vma, start, endvma, nodes,
 334                                                 flags, private);
 335                         if (err) {
 336                                 first = ERR_PTR(err);
 337                                 break;
 338                         }
 339                 }
 340                 prev = vma;
 341         }
 342         return first;
 343 }
 344
 345 /* Apply policy to a single VMA */
 346 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 347 {
 348         int err = 0;
 349         struct mempolicy *old = vma->vm_policy;
 350
 351         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 352                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 353                  vma->vm_ops, vma->vm_file,
 354                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 355
 356         if (vma->vm_ops && vma->vm_ops->set_policy)
 357                 err = vma->vm_ops->set_policy(vma, new);
 358         if (!err) {
 359                 mpol_get(new);
 360                 vma->vm_policy = new;
 361                 mpol_free(old);
 362         }
 363         return err;
 364 }
 365
 366 /* Step 2: apply policy to a range and do splits. */
 367 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 368                        unsigned long end, struct mempolicy *new)
 369 {
 370         struct vm_area_struct *next;
 371         int err;
 372
 373         err = 0;
 374         for (; vma && vma->vm_start < end; vma = next) {
 375                 next = vma->vm_next;
 376                 if (vma->vm_start < start)
 377                         err = split_vma(vma->vm_mm, vma, start, 1);
 378                 if (!err && vma->vm_end > end)
 379                         err = split_vma(vma->vm_mm, vma, end, 0);
 380                 if (!err)
 381                         err = policy_vma(vma, new);
 382                 if (err)
 383                         break;
 384         }
 385         return err;
 386 }
 387
 388 static int contextualize_policy(int mode, nodemask_t *nodes)
 389 {
 390         if (!nodes)
 391                 return 0;
 392
 393         cpuset_update_task_memory_state();
 394         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 395                 return -EINVAL;
 396         return mpol_check_policy(mode, nodes);
 397 }
 398
 399 /* Set the process memory policy */
 400 long do_set_mempolicy(int mode, nodemask_t *nodes)
 401 {
 402         struct mempolicy *new;
 403
 404         if (contextualize_policy(mode, nodes))
 405                 return -EINVAL;
 406         new = mpol_new(mode, nodes);
 407         if (IS_ERR(new))
 408                 return PTR_ERR(new);
 409         mpol_free(current->mempolicy);
 410         current->mempolicy = new;
 411         if (new && new->policy == MPOL_INTERLEAVE)
 412                 current->il_next = first_node(new->v.nodes);
 413         return 0;
 414 }
 415
 416 /* Fill a zone bitmap for a policy */
 417 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 418 {
 419         int i;
 420
 421         nodes_clear(*nodes);
 422         switch (p->policy) {
 423         case MPOL_BIND:
 424                 for (i = 0; p->v.zonelist->zones[i]; i++)
 425                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 426                                 *nodes);
 427                 break;
 428         case MPOL_DEFAULT:
 429                 break;
 430         case MPOL_INTERLEAVE:
 431                 *nodes = p->v.nodes;
 432                 break;
 433         case MPOL_PREFERRED:
 434                 /* or use current node instead of online map? */
 435                 if (p->v.preferred_node < 0)
 436                         *nodes = node_online_map;
 437                 else
 438                         node_set(p->v.preferred_node, *nodes);
 439                 break;
 440         default:
 441                 BUG();
 442         }
 443 }
 444
 445 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 446 {
 447         struct page *p;
 448         int err;
 449
 450         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 451         if (err >= 0) {
 452                 err = page_to_nid(p);
 453                 put_page(p);
 454         }
 455         return err;
 456 }
 457
 458 /* Retrieve NUMA policy */
 459 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 460                         unsigned long addr, unsigned long flags)
 461 {
 462         int err;
 463         struct mm_struct *mm = current->mm;
 464         struct vm_area_struct *vma = NULL;
 465         struct mempolicy *pol = current->mempolicy;
 466
 467         cpuset_update_task_memory_state();
 468         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 469                 return -EINVAL;
 470         if (flags & MPOL_F_ADDR) {
 471                 down_read(&mm->mmap_sem);
 472                 vma = find_vma_intersection(mm, addr, addr+1);
 473                 if (!vma) {
 474                         up_read(&mm->mmap_sem);
 475                         return -EFAULT;
 476                 }
 477                 if (vma->vm_ops && vma->vm_ops->get_policy)
 478                         pol = vma->vm_ops->get_policy(vma, addr);
 479                 else
 480                         pol = vma->vm_policy;
 481         } else if (addr)
 482                 return -EINVAL;
 483
 484         if (!pol)
 485                 pol = &default_policy;
 486
 487         if (flags & MPOL_F_NODE) {
 488                 if (flags & MPOL_F_ADDR) {
 489                         err = lookup_node(mm, addr);
 490                         if (err < 0)
 491                                 goto out;
 492                         *policy = err;
 493                 } else if (pol == current->mempolicy &&
 494                                 pol->policy == MPOL_INTERLEAVE) {
 495                         *policy = current->il_next;
 496                 } else {
 497                         err = -EINVAL;
 498                         goto out;
 499                 }
 500         } else
 501                 *policy = pol->policy;
 502
 503         if (vma) {
 504                 up_read(&current->mm->mmap_sem);
 505                 vma = NULL;
 506         }
 507
 508         err = 0;
 509         if (nmask)
 510                 get_zonemask(pol, nmask);
 511
 512  out:
 513         if (vma)
 514                 up_read(&current->mm->mmap_sem);
 515         return err;
 516 }
 517
 518 /*
 519  * page migration
 520  */
 521
 522 /* Check if we are the only process mapping the page in question */
 523 static inline int single_mm_mapping(struct mm_struct *mm,
 524                         struct address_space *mapping)
 525 {
 526         struct vm_area_struct *vma;
 527         struct prio_tree_iter iter;
 528         int rc = 1;
 529
 530         spin_lock(&mapping->i_mmap_lock);
 531         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
 532                 if (mm != vma->vm_mm) {
 533                         rc = 0;
 534                         goto out;
 535                 }
 536         list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
 537                 if (mm != vma->vm_mm) {
 538                         rc = 0;
 539                         goto out;
 540                 }
 541 out:
 542         spin_unlock(&mapping->i_mmap_lock);
 543         return rc;
 544 }
 545
 546 /*
 547  * Add a page to be migrated to the pagelist
 548  */
 549 static void migrate_page_add(struct vm_area_struct *vma,
 550         struct page *page, struct list_head *pagelist, unsigned long flags)
 551 {
 552         /*
 553          * Avoid migrating a page that is shared by others and not writable.
 554          */
 555         if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 556             mapping_writably_mapped(page->mapping) ||
 557             single_mm_mapping(vma->vm_mm, page->mapping)) {
 558                 int rc = isolate_lru_page(page);
 559
 560                 if (rc == 1)
 561                         list_add(&page->lru, pagelist);
 562                 /*
 563                  * If the isolate attempt was not successful then we just
 564                  * encountered an unswappable page. Something must be wrong.
 565                  */
 566                 WARN_ON(rc == 0);
 567         }
 568 }
 569
 570 static int swap_pages(struct list_head *pagelist)
 571 {
 572         LIST_HEAD(moved);
 573         LIST_HEAD(failed);
 574         int n;
 575
 576         n = migrate_pages(pagelist, NULL, &moved, &failed);
 577         putback_lru_pages(&failed);
 578         putback_lru_pages(&moved);
 579
 580         return n;
 581 }
 582
 583 /*
 584  * For now migrate_pages simply swaps out the pages from nodes that are in
 585  * the source set but not in the target set. In the future, we would
 586  * want a function that moves pages between the two nodesets in such
 587  * a way as to preserve the physical layout as much as possible.
 588  *
 589  * Returns the number of page that could not be moved.
 590  */
 591 int do_migrate_pages(struct mm_struct *mm,
 592         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 593 {
 594         LIST_HEAD(pagelist);
 595         int count = 0;
 596         nodemask_t nodes;
 597
 598         nodes_andnot(nodes, *from_nodes, *to_nodes);
 599
 600         down_read(&mm->mmap_sem);
 601         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 602                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 603
 604         if (!list_empty(&pagelist)) {
 605                 count = swap_pages(&pagelist);
 606                 putback_lru_pages(&pagelist);
 607         }
 608
 609         up_read(&mm->mmap_sem);
 610         return count;
 611 }
 612
 613 long do_mbind(unsigned long start, unsigned long len,
 614                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 615 {
 616         struct vm_area_struct *vma;
 617         struct mm_struct *mm = current->mm;
 618         struct mempolicy *new;
 619         unsigned long end;
 620         int err;
 621         LIST_HEAD(pagelist);
 622
 623         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 624                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 625             || mode > MPOL_MAX)
 626                 return -EINVAL;
 627         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 628                 return -EPERM;
 629
 630         if (start & ~PAGE_MASK)
 631                 return -EINVAL;
 632
 633         if (mode == MPOL_DEFAULT)
 634                 flags &= ~MPOL_MF_STRICT;
 635
 636         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 637         end = start + len;
 638
 639         if (end < start)
 640                 return -EINVAL;
 641         if (end == start)
 642                 return 0;
 643
 644         if (mpol_check_policy(mode, nmask))
 645                 return -EINVAL;
 646
 647         new = mpol_new(mode, nmask);
 648         if (IS_ERR(new))
 649                 return PTR_ERR(new);
 650
 651         /*
 652          * If we are using the default policy then operation
 653          * on discontinuous address spaces is okay after all
 654          */
 655         if (!new)
 656                 flags |= MPOL_MF_DISCONTIG_OK;
 657
 658         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 659                         mode,nodes_addr(nodes)[0]);
 660
 661         down_write(&mm->mmap_sem);
 662         vma = check_range(mm, start, end, nmask,
 663                           flags | MPOL_MF_INVERT, &pagelist);
 664
 665         err = PTR_ERR(vma);
 666         if (!IS_ERR(vma)) {
 667                 int nr_failed = 0;
 668
 669                 err = mbind_range(vma, start, end, new);
 670                 if (!list_empty(&pagelist))
 671                         nr_failed = swap_pages(&pagelist);
 672
 673                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 674                         err = -EIO;
 675         }
 676         if (!list_empty(&pagelist))
 677                 putback_lru_pages(&pagelist);
 678
 679         up_write(&mm->mmap_sem);
 680         mpol_free(new);
 681         return err;
 682 }
 683
 684 /*
 685  * User space interface with variable sized bitmaps for nodelists.
 686  */
 687
 688 /* Copy a node mask from user space. */
 689 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 690                      unsigned long maxnode)
 691 {
 692         unsigned long k;
 693         unsigned long nlongs;
 694         unsigned long endmask;
 695
 696         --maxnode;
 697         nodes_clear(*nodes);
 698         if (maxnode == 0 || !nmask)
 699                 return 0;
 700
 701         nlongs = BITS_TO_LONGS(maxnode);
 702         if ((maxnode % BITS_PER_LONG) == 0)
 703                 endmask = ~0UL;
 704         else
 705                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 706
 707         /* When the user specified more nodes than supported just check
 708            if the non supported part is all zero. */
 709         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 710                 if (nlongs > PAGE_SIZE/sizeof(long))
 711                         return -EINVAL;
 712                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 713                         unsigned long t;
 714                         if (get_user(t, nmask + k))
 715                                 return -EFAULT;
 716                         if (k == nlongs - 1) {
 717                                 if (t & endmask)
 718                                         return -EINVAL;
 719                         } else if (t)
 720                                 return -EINVAL;
 721                 }
 722                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 723                 endmask = ~0UL;
 724         }
 725
 726         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 727                 return -EFAULT;
 728         nodes_addr(*nodes)[nlongs-1] &= endmask;
 729         return 0;
 730 }
 731
 732 /* Copy a kernel node mask to user space */
 733 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 734                               nodemask_t *nodes)
 735 {
 736         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 737         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 738
 739         if (copy > nbytes) {
 740                 if (copy > PAGE_SIZE)
 741                         return -EINVAL;
 742                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 743                         return -EFAULT;
 744                 copy = nbytes;
 745         }
 746         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 747 }
 748
 749 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 750                         unsigned long mode,
 751                         unsigned long __user *nmask, unsigned long maxnode,
 752                         unsigned flags)
 753 {
 754         nodemask_t nodes;
 755         int err;
 756
 757         err = get_nodes(&nodes, nmask, maxnode);
 758         if (err)
 759                 return err;
 760         return do_mbind(start, len, mode, &nodes, flags);
 761 }
 762
 763 /* Set the process memory policy */
 764 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 765                 unsigned long maxnode)
 766 {
 767         int err;
 768         nodemask_t nodes;
 769
 770         if (mode < 0 || mode > MPOL_MAX)
 771                 return -EINVAL;
 772         err = get_nodes(&nodes, nmask, maxnode);
 773         if (err)
 774                 return err;
 775         return do_set_mempolicy(mode, &nodes);
 776 }
 777
 778 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 779                 const unsigned long __user *old_nodes,
 780                 const unsigned long __user *new_nodes)
 781 {
 782         struct mm_struct *mm;
 783         struct task_struct *task;
 784         nodemask_t old;
 785         nodemask_t new;
 786         nodemask_t task_nodes;
 787         int err;
 788
 789         err = get_nodes(&old, old_nodes, maxnode);
 790         if (err)
 791                 return err;
 792
 793         err = get_nodes(&new, new_nodes, maxnode);
 794         if (err)
 795                 return err;
 796
 797         /* Find the mm_struct */
 798         read_lock(&tasklist_lock);
 799         task = pid ? find_task_by_pid(pid) : current;
 800         if (!task) {
 801                 read_unlock(&tasklist_lock);
 802                 return -ESRCH;
 803         }
 804         mm = get_task_mm(task);
 805         read_unlock(&tasklist_lock);
 806
 807         if (!mm)
 808                 return -EINVAL;
 809
 810         /*
 811          * Check if this process has the right to modify the specified
 812          * process. The right exists if the process has administrative
 813          * capabilities, superuser priviledges or the same
 814          * userid as the target process.
 815          */
 816         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 817             (current->uid != task->suid) && (current->uid != task->uid) &&
 818             !capable(CAP_SYS_ADMIN)) {
 819                 err = -EPERM;
 820                 goto out;
 821         }
 822
 823         task_nodes = cpuset_mems_allowed(task);
 824         /* Is the user allowed to access the target nodes? */
 825         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 826                 err = -EPERM;
 827                 goto out;
 828         }
 829
 830         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 831 out:
 832         mmput(mm);
 833         return err;
 834 }
 835
 836
 837 /* Retrieve NUMA policy */
 838 asmlinkage long sys_get_mempolicy(int __user *policy,
 839                                 unsigned long __user *nmask,
 840                                 unsigned long maxnode,
 841                                 unsigned long addr, unsigned long flags)
 842 {
 843         int err, pval;
 844         nodemask_t nodes;
 845
 846         if (nmask != NULL && maxnode < MAX_NUMNODES)
 847                 return -EINVAL;
 848
 849         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 850
 851         if (err)
 852                 return err;
 853
 854         if (policy && put_user(pval, policy))
 855                 return -EFAULT;
 856
 857         if (nmask)
 858                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 859
 860         return err;
 861 }
 862
 863 #ifdef CONFIG_COMPAT
 864
 865 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 866                                      compat_ulong_t __user *nmask,
 867                                      compat_ulong_t maxnode,
 868                                      compat_ulong_t addr, compat_ulong_t flags)
 869 {
 870         long err;
 871         unsigned long __user *nm = NULL;
 872         unsigned long nr_bits, alloc_size;
 873         DECLARE_BITMAP(bm, MAX_NUMNODES);
 874
 875         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 876         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 877
 878         if (nmask)
 879                 nm = compat_alloc_user_space(alloc_size);
 880
 881         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 882
 883         if (!err && nmask) {
 884                 err = copy_from_user(bm, nm, alloc_size);
 885                 /* ensure entire bitmap is zeroed */
 886                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 887                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 888         }
 889
 890         return err;
 891 }
 892
 893 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 894                                      compat_ulong_t maxnode)
 895 {
 896         long err = 0;
 897         unsigned long __user *nm = NULL;
 898         unsigned long nr_bits, alloc_size;
 899         DECLARE_BITMAP(bm, MAX_NUMNODES);
 900
 901         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 902         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 903
 904         if (nmask) {
 905                 err = compat_get_bitmap(bm, nmask, nr_bits);
 906                 nm = compat_alloc_user_space(alloc_size);
 907                 err |= copy_to_user(nm, bm, alloc_size);
 908         }
 909
 910         if (err)
 911                 return -EFAULT;
 912
 913         return sys_set_mempolicy(mode, nm, nr_bits+1);
 914 }
 915
 916 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 917                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 918                              compat_ulong_t maxnode, compat_ulong_t flags)
 919 {
 920         long err = 0;
 921         unsigned long __user *nm = NULL;
 922         unsigned long nr_bits, alloc_size;
 923         nodemask_t bm;
 924
 925         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 926         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 927
 928         if (nmask) {
 929                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 930                 nm = compat_alloc_user_space(alloc_size);
 931                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 932         }
 933
 934         if (err)
 935                 return -EFAULT;
 936
 937         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 938 }
 939
 940 #endif
 941
 942 /* Return effective policy for a VMA */
 943 static struct mempolicy * get_vma_policy(struct task_struct *task,
 944                 struct vm_area_struct *vma, unsigned long addr)
 945 {
 946         struct mempolicy *pol = task->mempolicy;
 947
 948         if (vma) {
 949                 if (vma->vm_ops && vma->vm_ops->get_policy)
 950                         pol = vma->vm_ops->get_policy(vma, addr);
 951                 else if (vma->vm_policy &&
 952                                 vma->vm_policy->policy != MPOL_DEFAULT)
 953                         pol = vma->vm_policy;
 954         }
 955         if (!pol)
 956                 pol = &default_policy;
 957         return pol;
 958 }
 959
 960 /* Return a zonelist representing a mempolicy */
 961 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 962 {
 963         int nd;
 964
 965         switch (policy->policy) {
 966         case MPOL_PREFERRED:
 967                 nd = policy->v.preferred_node;
 968                 if (nd < 0)
 969                         nd = numa_node_id();
 970                 break;
 971         case MPOL_BIND:
 972                 /* Lower zones don't get a policy applied */
 973                 /* Careful: current->mems_allowed might have moved */
 974                 if (gfp_zone(gfp) >= policy_zone)
 975                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 976                                 return policy->v.zonelist;
 977                 /*FALL THROUGH*/
 978         case MPOL_INTERLEAVE: /* should not happen */
 979         case MPOL_DEFAULT:
 980                 nd = numa_node_id();
 981                 break;
 982         default:
 983                 nd = 0;
 984                 BUG();
 985         }
 986         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 987 }
 988
 989 /* Do dynamic interleaving for a process */
 990 static unsigned interleave_nodes(struct mempolicy *policy)
 991 {
 992         unsigned nid, next;
 993         struct task_struct *me = current;
 994
 995         nid = me->il_next;
 996         next = next_node(nid, policy->v.nodes);
 997         if (next >= MAX_NUMNODES)
 998                 next = first_node(policy->v.nodes);
 999         me->il_next = next;
1000         return nid;
1001 }
1002
1003 /* Do static interleaving for a VMA with known offset. */
1004 static unsigned offset_il_node(struct mempolicy *pol,
1005                 struct vm_area_struct *vma, unsigned long off)
1006 {
1007         unsigned nnodes = nodes_weight(pol->v.nodes);
1008         unsigned target = (unsigned)off % nnodes;
1009         int c;
1010         int nid = -1;
1011
1012         c = 0;
1013         do {
1014                 nid = next_node(nid, pol->v.nodes);
1015                 c++;
1016         } while (c <= target);
1017         return nid;
1018 }
1019
1020 /* Determine a node number for interleave */
1021 static inline unsigned interleave_nid(struct mempolicy *pol,
1022                  struct vm_area_struct *vma, unsigned long addr, int shift)
1023 {
1024         if (vma) {
1025                 unsigned long off;
1026
1027                 off = vma->vm_pgoff;
1028                 off += (addr - vma->vm_start) >> shift;
1029                 return offset_il_node(pol, vma, off);
1030         } else
1031                 return interleave_nodes(pol);
1032 }
1033
1034 /* Return a zonelist suitable for a huge page allocation. */
1035 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1036 {
1037         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1038
1039         if (pol->policy == MPOL_INTERLEAVE) {
1040                 unsigned nid;
1041
1042                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1043                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1044         }
1045         return zonelist_policy(GFP_HIGHUSER, pol);
1046 }
1047
1048 /* Allocate a page in interleaved policy.
1049    Own path because it needs to do special accounting. */
1050 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1051                                         unsigned nid)
1052 {
1053         struct zonelist *zl;
1054         struct page *page;
1055
1056         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1057         page = __alloc_pages(gfp, order, zl);
1058         if (page && page_zone(page) == zl->zones[0]) {
1059                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1060                 put_cpu();
1061         }
1062         return page;
1063 }
1064
1065 /**
1066  *      alloc_page_vma  - Allocate a page for a VMA.
1067  *
1068  *      @gfp:
1069  *      %GFP_USER    user allocation.
1070  *      %GFP_KERNEL  kernel allocations,
1071  *      %GFP_HIGHMEM highmem/user allocations,
1072  *      %GFP_FS      allocation should not call back into a file system.
1073  *      %GFP_ATOMIC  don't sleep.
1074  *
1075  *      @vma:  Pointer to VMA or NULL if not available.
1076  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1077  *
1078  *      This function allocates a page from the kernel page pool and applies
1079  *      a NUMA policy associated with the VMA or the current process.
1080  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1081  *      mm_struct of the VMA to prevent it from going away. Should be used for
1082  *      all allocations for pages that will be mapped into
1083  *      user space. Returns NULL when no page can be allocated.
1084  *
1085  *      Should be called with the mm_sem of the vma hold.
1086  */
1087 struct page *
1088 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1089 {
1090         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1091
1092         cpuset_update_task_memory_state();
1093
1094         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1095                 unsigned nid;
1096
1097                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1098                 return alloc_page_interleave(gfp, 0, nid);
1099         }
1100         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1101 }
1102
1103 /**
1104  *      alloc_pages_current - Allocate pages.
1105  *
1106  *      @gfp:
1107  *              %GFP_USER   user allocation,
1108  *              %GFP_KERNEL kernel allocation,
1109  *              %GFP_HIGHMEM highmem allocation,
1110  *              %GFP_FS     don't call back into a file system.
1111  *              %GFP_ATOMIC don't sleep.
1112  *      @order: Power of two of allocation size in pages. 0 is a single page.
1113  *
1114  *      Allocate a page from the kernel page pool.  When not in
1115  *      interrupt context and apply the current process NUMA policy.
1116  *      Returns NULL when no page can be allocated.
1117  *
1118  *      Don't call cpuset_update_task_memory_state() unless
1119  *      1) it's ok to take cpuset_sem (can WAIT), and
1120  *      2) allocating for current task (not interrupt).
1121  */
1122 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1123 {
1124         struct mempolicy *pol = current->mempolicy;
1125
1126         if ((gfp & __GFP_WAIT) && !in_interrupt())
1127                 cpuset_update_task_memory_state();
1128         if (!pol || in_interrupt())
1129                 pol = &default_policy;
1130         if (pol->policy == MPOL_INTERLEAVE)
1131                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1132         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1133 }
1134 EXPORT_SYMBOL(alloc_pages_current);
1135
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * also update_nodemask() in kernel/cpuset.c.
 */
1143 void *cpuset_being_rebound;
1144
1145 /* Slow path of a mempolicy copy */
1146 struct mempolicy *__mpol_copy(struct mempolicy *old)
1147 {
1148         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1149
1150         if (!new)
1151                 return ERR_PTR(-ENOMEM);
1152         if (current_cpuset_is_being_rebound()) {
1153                 nodemask_t mems = cpuset_mems_allowed(current);
1154                 mpol_rebind_policy(old, &mems);
1155         }
1156         *new = *old;
1157         atomic_set(&new->refcnt, 1);
1158         if (new->policy == MPOL_BIND) {
1159                 int sz = ksize(old->v.zonelist);
1160                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1161                 if (!new->v.zonelist) {
1162                         kmem_cache_free(policy_cache, new);
1163                         return ERR_PTR(-ENOMEM);
1164                 }
1165                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1166         }
1167         return new;
1168 }
1169
1170 /* Slow path of a mempolicy comparison */
1171 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1172 {
1173         if (!a || !b)
1174                 return 0;
1175         if (a->policy != b->policy)
1176                 return 0;
1177         switch (a->policy) {
1178         case MPOL_DEFAULT:
1179                 return 1;
1180         case MPOL_INTERLEAVE:
1181                 return nodes_equal(a->v.nodes, b->v.nodes);
1182         case MPOL_PREFERRED:
1183                 return a->v.preferred_node == b->v.preferred_node;
1184         case MPOL_BIND: {
1185                 int i;
1186                 for (i = 0; a->v.zonelist->zones[i]; i++)
1187                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1188                                 return 0;
1189                 return b->v.zonelist->zones[i] == NULL;
1190         }
1191         default:
1192                 BUG();
1193                 return 0;
1194         }
1195 }
1196
1197 /* Slow path of a mpol destructor. */
1198 void __mpol_free(struct mempolicy *p)
1199 {
1200         if (!atomic_dec_and_test(&p->refcnt))
1201                 return;
1202         if (p->policy == MPOL_BIND)
1203                 kfree(p->v.zonelist);
1204         p->policy = MPOL_DEFAULT;
1205         kmem_cache_free(policy_cache, p);
1206 }
1207
1208 /*
1209  * Shared memory backing store policy support.
1210  *
1211  * Remember policies even when nobody has shared memory mapped.
1212  * The policies are kept in Red-Black tree linked from the inode.
1213  * They are protected by the sp->lock spinlock, which should be held
1214  * for any accesses to the tree.
1215  */
1216
1217 /* lookup first element intersecting start-end */
1218 /* Caller holds sp->lock */
1219 static struct sp_node *
1220 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1221 {
1222         struct rb_node *n = sp->root.rb_node;
1223
1224         while (n) {
1225                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1226
1227                 if (start >= p->end)
1228                         n = n->rb_right;
1229                 else if (end <= p->start)
1230                         n = n->rb_left;
1231                 else
1232                         break;
1233         }
1234         if (!n)
1235                 return NULL;
1236         for (;;) {
1237                 struct sp_node *w = NULL;
1238                 struct rb_node *prev = rb_prev(n);
1239                 if (!prev)
1240                         break;
1241                 w = rb_entry(prev, struct sp_node, nd);
1242                 if (w->end <= start)
1243                         break;
1244                 n = prev;
1245         }
1246         return rb_entry(n, struct sp_node, nd);
1247 }
1248
1249 /* Insert a new shared policy into the list. */
1250 /* Caller holds sp->lock */
1251 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1252 {
1253         struct rb_node **p = &sp->root.rb_node;
1254         struct rb_node *parent = NULL;
1255         struct sp_node *nd;
1256
1257         while (*p) {
1258                 parent = *p;
1259                 nd = rb_entry(parent, struct sp_node, nd);
1260                 if (new->start < nd->start)
1261                         p = &(*p)->rb_left;
1262                 else if (new->end > nd->end)
1263                         p = &(*p)->rb_right;
1264                 else
1265                         BUG();
1266         }
1267         rb_link_node(&new->nd, parent, p);
1268         rb_insert_color(&new->nd, &sp->root);
1269         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1270                  new->policy ? new->policy->policy : 0);
1271 }
1272
1273 /* Find shared policy intersecting idx */
1274 struct mempolicy *
1275 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1276 {
1277         struct mempolicy *pol = NULL;
1278         struct sp_node *sn;
1279
1280         if (!sp->root.rb_node)
1281                 return NULL;
1282         spin_lock(&sp->lock);
1283         sn = sp_lookup(sp, idx, idx+1);
1284         if (sn) {
1285                 mpol_get(sn->policy);
1286                 pol = sn->policy;
1287         }
1288         spin_unlock(&sp->lock);
1289         return pol;
1290 }
1291
1292 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1293 {
1294         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1295         rb_erase(&n->nd, &sp->root);
1296         mpol_free(n->policy);
1297         kmem_cache_free(sn_cache, n);
1298 }
1299
1300 struct sp_node *
1301 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1302 {
1303         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1304
1305         if (!n)
1306                 return NULL;
1307         n->start = start;
1308         n->end = end;
1309         mpol_get(pol);
1310         n->policy = pol;
1311         return n;
1312 }
1313
1314 /* Replace a policy range. */
1315 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1316                                  unsigned long end, struct sp_node *new)
1317 {
1318         struct sp_node *n, *new2 = NULL;
1319
1320 restart:
1321         spin_lock(&sp->lock);
1322         n = sp_lookup(sp, start, end);
1323         /* Take care of old policies in the same range. */
1324         while (n && n->start < end) {
1325                 struct rb_node *next = rb_next(&n->nd);
1326                 if (n->start >= start) {
1327                         if (n->end <= end)
1328                                 sp_delete(sp, n);
1329                         else
1330                                 n->start = end;
1331                 } else {
1332                         /* Old policy spanning whole new range. */
1333                         if (n->end > end) {
1334                                 if (!new2) {
1335                                         spin_unlock(&sp->lock);
1336                                         new2 = sp_alloc(end, n->end, n->policy);
1337                                         if (!new2)
1338                                                 return -ENOMEM;
1339                                         goto restart;
1340                                 }
1341                                 n->end = start;
1342                                 sp_insert(sp, new2);
1343                                 new2 = NULL;
1344                                 break;
1345                         } else
1346                                 n->end = start;
1347                 }
1348                 if (!next)
1349                         break;
1350                 n = rb_entry(next, struct sp_node, nd);
1351         }
1352         if (new)
1353                 sp_insert(sp, new);
1354         spin_unlock(&sp->lock);
1355         if (new2) {
1356                 mpol_free(new2->policy);
1357                 kmem_cache_free(sn_cache, new2);
1358         }
1359         return 0;
1360 }
1361
1362 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1363                                 nodemask_t *policy_nodes)
1364 {
1365         info->root = RB_ROOT;
1366         spin_lock_init(&info->lock);
1367
1368         if (policy != MPOL_DEFAULT) {
1369                 struct mempolicy *newpol;
1370
1371                 /* Falls back to MPOL_DEFAULT on any error */
1372                 newpol = mpol_new(policy, policy_nodes);
1373                 if (!IS_ERR(newpol)) {
1374                         /* Create pseudo-vma that contains just the policy */
1375                         struct vm_area_struct pvma;
1376
1377                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1378                         /* Policy covers entire file */
1379                         pvma.vm_end = TASK_SIZE;
1380                         mpol_set_shared_policy(info, &pvma, newpol);
1381                         mpol_free(newpol);
1382                 }
1383         }
1384 }
1385
1386 int mpol_set_shared_policy(struct shared_policy *info,
1387                         struct vm_area_struct *vma, struct mempolicy *npol)
1388 {
1389         int err;
1390         struct sp_node *new = NULL;
1391         unsigned long sz = vma_pages(vma);
1392
1393         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1394                  vma->vm_pgoff,
1395                  sz, npol? npol->policy : -1,
1396                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1397
1398         if (npol) {
1399                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1400                 if (!new)
1401                         return -ENOMEM;
1402         }
1403         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1404         if (err && new)
1405                 kmem_cache_free(sn_cache, new);
1406         return err;
1407 }
1408
1409 /* Free a backing policy store on inode delete. */
1410 void mpol_free_shared_policy(struct shared_policy *p)
1411 {
1412         struct sp_node *n;
1413         struct rb_node *next;
1414
1415         if (!p->root.rb_node)
1416                 return;
1417         spin_lock(&p->lock);
1418         next = rb_first(&p->root);
1419         while (next) {
1420                 n = rb_entry(next, struct sp_node, nd);
1421                 next = rb_next(&n->nd);
1422                 rb_erase(&n->nd, &p->root);
1423                 mpol_free(n->policy);
1424                 kmem_cache_free(sn_cache, n);
1425         }
1426         spin_unlock(&p->lock);
1427 }
1428
1429 /* assumes fs == KERNEL_DS */
1430 void __init numa_policy_init(void)
1431 {
1432         policy_cache = kmem_cache_create("numa_policy",
1433                                          sizeof(struct mempolicy),
1434                                          0, SLAB_PANIC, NULL, NULL);
1435
1436         sn_cache = kmem_cache_create("shared_policy_node",
1437                                      sizeof(struct sp_node),
1438                                      0, SLAB_PANIC, NULL, NULL);
1439
1440         /* Set interleaving policy for system init. This way not all
1441            the data structures allocated at system boot end up in node zero. */
1442
1443         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1444                 printk("numa_policy_init: interleaving failed\n");
1445 }
1446
1447 /* Reset policy of current process to default */
1448 void numa_default_policy(void)
1449 {
1450         do_set_mempolicy(MPOL_DEFAULT, NULL);
1451 }
1452
1453 /* Migrate a policy to a different set of nodes */
1454 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1455 {
1456         nodemask_t *mpolmask;
1457         nodemask_t tmp;
1458
1459         if (!pol)
1460                 return;
1461         mpolmask = &pol->cpuset_mems_allowed;
1462         if (nodes_equal(*mpolmask, *newmask))
1463                 return;
1464
1465         switch (pol->policy) {
1466         case MPOL_DEFAULT:
1467                 break;
1468         case MPOL_INTERLEAVE:
1469                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1470                 pol->v.nodes = tmp;
1471                 *mpolmask = *newmask;
1472                 current->il_next = node_remap(current->il_next,
1473                                                 *mpolmask, *newmask);
1474                 break;
1475         case MPOL_PREFERRED:
1476                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1477                                                 *mpolmask, *newmask);
1478                 *mpolmask = *newmask;
1479                 break;
1480         case MPOL_BIND: {
1481                 nodemask_t nodes;
1482                 struct zone **z;
1483                 struct zonelist *zonelist;
1484
1485                 nodes_clear(nodes);
1486                 for (z = pol->v.zonelist->zones; *z; z++)
1487                         node_set((*z)->zone_pgdat->node_id, nodes);
1488                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1489                 nodes = tmp;
1490
1491                 zonelist = bind_zonelist(&nodes);
1492
1493                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1494                  * If that old zonelist has no remaining mems_allowed nodes,
1495                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1496                  */
1497
1498                 if (zonelist) {
1499                         /* Good - got mem - substitute new zonelist */
1500                         kfree(pol->v.zonelist);
1501                         pol->v.zonelist = zonelist;
1502                 }
1503                 *mpolmask = *newmask;
1504                 break;
1505         }
1506         default:
1507                 BUG();
1508                 break;
1509         }
1510 }
1511
1512 /*
1513  * Wrapper for mpol_rebind_policy() that just requires task
1514  * pointer, and updates task mempolicy.
1515  */
1516
1517 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1518 {
1519         mpol_rebind_policy(tsk->mempolicy, new);
1520 }
1521
1522 /*
1523  * Rebind each vma in mm to new nodemask.
1524  *
1525  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1526  */
1527
1528 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1529 {
1530         struct vm_area_struct *vma;
1531
1532         down_write(&mm->mmap_sem);
1533         for (vma = mm->mmap; vma; vma = vma->vm_next)
1534                 mpol_rebind_policy(vma->vm_policy, new);
1535         up_write(&mm->mmap_sem);
1536 }
1537
1538 /*
1539  * Display pages allocated per node and memory policy via /proc.
1540  */
1541
1542 static const char *policy_types[] = { "default", "prefer", "bind",
1543                                       "interleave" };
1544
1545 /*
1546  * Convert a mempolicy into a string.
1547  * Returns the number of characters in buffer (if positive)
1548  * or an error (negative)
1549  */
1550 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1551 {
1552         char *p = buffer;
1553         int l;
1554         nodemask_t nodes;
1555         int mode = pol ? pol->policy : MPOL_DEFAULT;
1556
1557         switch (mode) {
1558         case MPOL_DEFAULT:
1559                 nodes_clear(nodes);
1560                 break;
1561
1562         case MPOL_PREFERRED:
1563                 nodes_clear(nodes);
1564                 node_set(pol->v.preferred_node, nodes);
1565                 break;
1566
1567         case MPOL_BIND:
1568                 get_zonemask(pol, &nodes);
1569                 break;
1570
1571         case MPOL_INTERLEAVE:
1572                 nodes = pol->v.nodes;
1573                 break;
1574
1575         default:
1576                 BUG();
1577                 return -EFAULT;
1578         }
1579
1580         l = strlen(policy_types[mode]);
1581         if (buffer + maxlen < p + l + 1)
1582                 return -ENOSPC;
1583
1584         strcpy(p, policy_types[mode]);
1585         p += l;
1586
1587         if (!nodes_empty(nodes)) {
1588                 if (buffer + maxlen < p + 2)
1589                         return -ENOSPC;
1590                 *p++ = '=';
1591                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1592         }
1593         return p - buffer;
1594 }
1595
/* Per-vma page statistics accumulated by gather_stats(). */
struct numa_maps {
        /* number of pages visited */
        unsigned long pages;
        /* pages for which PageAnon() is true */
        unsigned long anon;
        /* pages with a non-zero page_mapcount() */
        unsigned long mapped;
        /* largest page_mapcount() seen on any page */
        unsigned long mapcount_max;
        /* page count per node, indexed by page_to_nid() */
        unsigned long node[MAX_NUMNODES];
};
1603
1604 static void gather_stats(struct page *page, void *private)
1605 {
1606         struct numa_maps *md = private;
1607         int count = page_mapcount(page);
1608
1609         if (count)
1610                 md->mapped++;
1611
1612         if (count > md->mapcount_max)
1613                 md->mapcount_max = count;
1614
1615         md->pages++;
1616
1617         if (PageAnon(page))
1618                 md->anon++;
1619
1620         md->node[page_to_nid(page)]++;
1621         cond_resched();
1622 }
1623
1624 int show_numa_map(struct seq_file *m, void *v)
1625 {
1626         struct task_struct *task = m->private;
1627         struct vm_area_struct *vma = v;
1628         struct numa_maps *md;
1629         int n;
1630         char buffer[50];
1631
1632         if (!vma->vm_mm)
1633                 return 0;
1634
1635         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1636         if (!md)
1637                 return 0;
1638
1639         check_pgd_range(vma, vma->vm_start, vma->vm_end,
1640                     &node_online_map, MPOL_MF_STATS, md);
1641
1642         if (md->pages) {
1643                 mpol_to_str(buffer, sizeof(buffer),
1644                             get_vma_policy(task, vma, vma->vm_start));
1645
1646                 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1647                            vma->vm_start, buffer, md->pages,
1648                            md->mapped, md->mapcount_max);
1649
1650                 if (md->anon)
1651                         seq_printf(m," anon=%lu",md->anon);
1652
1653                 for_each_online_node(n)
1654                         if (md->node[n])
1655                                 seq_printf(m, " N%d=%lu", n, md->node[n]);
1656
1657                 seq_putc(m, '\n');
1658         }
1659         kfree(md);
1660
1661         if (m->count < m->size)
1662                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1663         return 0;
1664 }
1665