/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non-default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
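/*
 * Example (userspace, illustrative only -- not part of this file):
 * requesting an interleave policy over nodes 0 and 1 for a mapping
 * through the mbind(2) syscall implemented below. Assumes a 64-bit
 * node mask and the MPOL_* constants from <numaif.h>.
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (mbind(p, 1 << 20, MPOL_INTERLEAVE, &mask,
 *		  sizeof(mask) * 8, 0) < 0)
 *		perror("mbind");
 */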
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
       first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
       kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_STATS        (MPOL_MF_INTERNAL << 2)	/* Gather statistics */

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num = 0, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	for_each_node_mask(nd, *nodes)
		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
	zl->zones[num] = NULL;
	return zl;
}
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		if (nodes_weight(*nodes) == 0) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-EINVAL);
		}
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}
static void gather_stats(struct page *, void *);
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags);
/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;
		if (flags & MPOL_MF_STATS)
			gather_stats(page, private);
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(vma, page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}
static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}
/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
		return 0;
	return 1;
}
/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
					      flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}
static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				 *nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}
/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
		      unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_current_mems_allowed();
	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
			   pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
				    struct address_space *mapping)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int rc = 1;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
out:
	spin_unlock(&mapping->i_mmap_lock);
	return rc;
}
/*
 * Add a page to be migrated to the pagelist
 */
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared by others and not writable.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
	    mapping_writably_mapped(page->mapping) ||
	    single_mm_mapping(vma->vm_mm, page->mapping)) {
		int rc = isolate_lru_page(page);

		if (rc == 1)
			list_add(&page->lru, pagelist);
		/*
		 * If the isolate attempt was not successful then we just
		 * encountered an unswappable page. Something must be wrong.
		 */
		WARN_ON(rc == 0);
	}
}
static int swap_pages(struct list_head *pagelist)
{
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int n;

	n = migrate_pages(pagelist, NULL, &moved, &failed);
	putback_lru_pages(&failed);
	putback_lru_pages(&moved);
	return n;
}
/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int count = 0;
	nodemask_t nodes;

	nodes_andnot(nodes, *from_nodes, *to_nodes);

	down_read(&mm->mmap_sem);
	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
		    flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		count = swap_pages(&pagelist);
		putback_lru_pages(&pagelist);
	}

	up_read(&mm->mmap_sem);
	return count;
}
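/*
 * Example (userspace, illustrative only): moving a task's pages from
 * node 0 to node 1 via the migrate_pages(2) syscall defined further
 * below. Assumes a 64-bit node mask and <numaif.h>.
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	if (migrate_pages(pid, sizeof(from) * 8, &from, &to) < 0)
 *		perror("migrate_pages");
 */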
long do_mbind(unsigned long start, unsigned long len,
	      unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
	    || mode > MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	if (mpol_check_policy(mode, nmask))
		return -EINVAL;

	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);
		if (!list_empty(&pagelist))
			nr_failed = swap_pages(&pagelist);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}
	if (!list_empty(&pagelist))
		putback_lru_pages(&pagelist);

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
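/*
 * Worked example (assuming BITS_PER_LONG == 64): maxnode = 65 leaves
 * 64 usable bits after the decrement, so nlongs = 1 and endmask = ~0UL;
 * maxnode = 71 leaves 70 bits, so nlongs = 2 and
 * endmask = (1UL << 6) - 1 = 0x3f, masking off bits 70..127 of the
 * user-supplied mask.
 */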
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
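/*
 * For instance, maxnode = 65 gives copy = ALIGN(64, 64) / 8 = 8 bytes:
 * the user buffer is written in whole 64-bit words, and any words
 * beyond the kernel's own mask size are cleared rather than copied.
 */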
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}
/* Macro needed until Paul implements this function in kernel/cpusets.c */
#define cpuset_mems_allowed(task) node_online_map

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_pid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);
	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif
/* Return effective policy for a VMA */
static struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
			 vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		off = vma->vm_pgoff;
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}
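/*
 * Worked example: with pol->v.nodes = {0,2,3}, vm_pgoff = 4 and a fault
 * two pages into the VMA, off = 4 + 2 = 6 and target = 6 % 3 = 0, so
 * offset_il_node() walks the mask once and returns node 0. The same
 * offset always maps to the same node, which keeps the interleave
 * stable across repeated faults of the same page.
 */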
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	if (pol->policy == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
	}
	return zonelist_policy(GFP_HIGHUSER, pol);
}
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					  unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}
/**
 * 	alloc_page_vma	- Allocate a page for a VMA.
 *
 * 	@gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * 	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * 	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
 *	mm_struct of the VMA to prevent it from going away. Should be used for
 *	all allocations for pages that will be mapped into
 * 	user space. Returns NULL when no page can be allocated.
 *
 *	Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
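/*
 * Typical use (sketch, modelled on an anonymous fault path; not a
 * verbatim caller): the faulting address selects the interleave node,
 * so the new page lands according to the VMA's policy.
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO,
 *					   vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */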
/**
 * 	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *		%GFP_USER    user allocation,
 *		%GFP_KERNEL  kernel allocation,
 *		%GFP_HIGHMEM highmem allocation,
 *		%GFP_FS      don't call back into a file system.
 *		%GFP_ATOMIC  don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool. When not in
 *	interrupt context, apply the current process NUMA policy.
 *	Returns NULL when no page can be allocated.
 *
 *	Don't call cpuset_update_current_mems_allowed() unless
 *	1) it's ok to take cpuset_sem (can WAIT), and
 *	2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	int i;

	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND:
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	default:
		BUG();
		return 0;
	}
}
/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}
int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}
/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
			  const nodemask_t *new)
{
	nodemask_t tmp;

	if (!pol)
		return;

	switch (pol->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		nodes_remap(tmp, pol->v.nodes, *old, *new);
		pol->v.nodes = tmp;
		current->il_next = node_remap(current->il_next, *old, *new);
		break;
	case MPOL_PREFERRED:
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   *old, *new);
		break;
	case MPOL_BIND: {
		nodemask_t nodes;
		struct zone **z;
		struct zonelist *zonelist;

		nodes_clear(nodes);
		for (z = pol->v.zonelist->zones; *z; z++)
			node_set((*z)->zone_pgdat->node_id, nodes);
		nodes_remap(tmp, nodes, *old, *new);
		nodes = tmp;

		zonelist = bind_zonelist(&nodes);

		/* If no mem, then zonelist is NULL and we keep old zonelist.
		 * If that old zonelist has no remaining mems_allowed nodes,
		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
		 */

		if (zonelist) {
			/* Good - got mem - substitute new zonelist */
			kfree(pol->v.zonelist);
			pol->v.zonelist = zonelist;
		}
		break;
	}
	default:
		BUG();
		break;
	}
}
/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
	rebind_policy(current->mempolicy, old, new);
}
/*
 * Display pages allocated per node and memory policy via /proc.
 */

static const char *policy_types[] = { "default", "prefer", "bind",
				      "interleave" };

/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	int l;
	nodemask_t nodes;
	int mode = pol ? pol->policy : MPOL_DEFAULT;

	switch (mode) {
	case MPOL_DEFAULT:
		nodes_clear(nodes);
		break;
	case MPOL_PREFERRED:
		nodes_clear(nodes);
		node_set(pol->v.preferred_node, nodes);
		break;
	case MPOL_BIND:
		get_zonemask(pol, &nodes);
		break;
	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;
	default:
		BUG();
		return -1;
	}

	l = strlen(policy_types[mode]);
	if (buffer + maxlen < p + l + 1)
		return -ENOSPC;

	strcpy(p, policy_types[mode]);
	p += l;

	if (!nodes_empty(nodes)) {
		if (buffer + maxlen < p + 2)
			return -ENOSPC;
		*p++ = '=';
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
	}
	return p - buffer;
}
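/*
 * Example outputs: "default", "prefer=2", "bind=0,2", "interleave=0-3".
 * The node list after '=' is produced by nodelist_scnprintf() and is
 * omitted entirely when the mask is empty.
 */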
struct numa_maps {
	unsigned long pages;
	unsigned long anon;
	unsigned long mapped;
	unsigned long mapcount_max;
	unsigned long node[MAX_NUMNODES];
};

static void gather_stats(struct page *page, void *private)
{
	struct numa_maps *md = private;
	int count = page_mapcount(page);

	if (count)
		md->mapped++;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->pages++;

	if (PageAnon(page))
		md->anon++;

	md->node[page_to_nid(page)]++;
}
int show_numa_map(struct seq_file *m, void *v)
{
	struct task_struct *task = m->private;
	struct vm_area_struct *vma = v;
	struct numa_maps *md;
	int n;
	char buffer[50];

	if (!vma->vm_mm)
		return 0;

	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
	if (!md)
		return 0;

	check_pgd_range(vma, vma->vm_start, vma->vm_end,
			&node_online_map, MPOL_MF_STATS, md);

	if (md->pages) {
		mpol_to_str(buffer, sizeof(buffer),
			    get_vma_policy(task, vma, vma->vm_start));

		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
			   vma->vm_start, buffer, md->pages,
			   md->mapped, md->mapcount_max);

		if (md->anon)
			seq_printf(m, " anon=%lu", md->anon);

		for_each_online_node(n)
			if (md->node[n])
				seq_printf(m, " N%d=%lu", n, md->node[n]);

		seq_putc(m, '\n');
	}
	kfree(md);

	if (m->count < m->size)  /* vma is copied successfully */
		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
	return 0;
}