mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
   20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
   48  * requesting a lower zone just use the default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
/*
 * Internal flags. Callers in this file OR them into the 'flags' argument of
 * check_range(); do_mbind() rejects any user-supplied flag other than
 * MPOL_MF_STRICT, MPOL_MF_MOVE and MPOL_MF_MOVE_ALL.
 */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */

/* The number of pages to migrate per call to migrate_pages() */
#define MIGRATE_CHUNK_SIZE 256

/* Slab cache that mpol_new() allocates struct mempolicy objects from. */
static kmem_cache_t *policy_cache;
/* NOTE(review): not referenced in this part of the file; purpose not visible here. */
static kmem_cache_t *sn_cache;

/* Debug print stub: expands to nothing, so its arguments are never compiled. */
#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

/* Fallback policy; do_get_mempolicy() reports it when no other policy applies. */
struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135 /* Generate a custom zonelist for the BIND policy. */
 136 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 137 {
 138         struct zonelist *zl;
 139         int num, max, nd;
 140
 141         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 142         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 143         if (!zl)
 144                 return NULL;
 145         num = 0;
 146         for_each_node_mask(nd, *nodes)
 147                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 148         zl->zones[num] = NULL;
 149         return zl;
 150 }
 151
 152 /* Create a new policy */
 153 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 154 {
 155         struct mempolicy *policy;
 156
 157         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 158         if (mode == MPOL_DEFAULT)
 159                 return NULL;
 160         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 161         if (!policy)
 162                 return ERR_PTR(-ENOMEM);
 163         atomic_set(&policy->refcnt, 1);
 164         switch (mode) {
 165         case MPOL_INTERLEAVE:
 166                 policy->v.nodes = *nodes;
 167                 if (nodes_weight(*nodes) == 0) {
 168                         kmem_cache_free(policy_cache, policy);
 169                         return ERR_PTR(-EINVAL);
 170                 }
 171                 break;
 172         case MPOL_PREFERRED:
 173                 policy->v.preferred_node = first_node(*nodes);
 174                 if (policy->v.preferred_node >= MAX_NUMNODES)
 175                         policy->v.preferred_node = -1;
 176                 break;
 177         case MPOL_BIND:
 178                 policy->v.zonelist = bind_zonelist(nodes);
 179                 if (policy->v.zonelist == NULL) {
 180                         kmem_cache_free(policy_cache, policy);
 181                         return ERR_PTR(-ENOMEM);
 182                 }
 183                 break;
 184         }
 185         policy->policy = mode;
 186         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 187         return policy;
 188 }
 189
 190 static void gather_stats(struct page *, void *);
 191 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 192                                 unsigned long flags);
 193
 194 /* Scan through pages checking if pages follow certain conditions. */
 195 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 196                 unsigned long addr, unsigned long end,
 197                 const nodemask_t *nodes, unsigned long flags,
 198                 void *private)
 199 {
 200         pte_t *orig_pte;
 201         pte_t *pte;
 202         spinlock_t *ptl;
 203
 204         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 205         do {
 206                 struct page *page;
 207                 unsigned int nid;
 208
 209                 if (!pte_present(*pte))
 210                         continue;
 211                 page = vm_normal_page(vma, addr, *pte);
 212                 if (!page)
 213                         continue;
 214                 /*
 215                  * The check for PageReserved here is important to avoid
 216                  * handling zero pages and other pages that may have been
 217                  * marked special by the system.
 218                  *
 219                  * If the PageReserved would not be checked here then f.e.
 220                  * the location of the zero page could have an influence
 221                  * on MPOL_MF_STRICT, zero pages would be counted for
 222                  * the per node stats, and there would be useless attempts
 223                  * to put zero pages on the migration list.
 224                  */
 225                 if (PageReserved(page))
 226                         continue;
 227                 nid = page_to_nid(page);
 228                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 229                         continue;
 230
 231                 if (flags & MPOL_MF_STATS)
 232                         gather_stats(page, private);
 233                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 234                         migrate_page_add(page, private, flags);
 235                 else
 236                         break;
 237         } while (pte++, addr += PAGE_SIZE, addr != end);
 238         pte_unmap_unlock(orig_pte, ptl);
 239         return addr != end;
 240 }
 241
 242 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 243                 unsigned long addr, unsigned long end,
 244                 const nodemask_t *nodes, unsigned long flags,
 245                 void *private)
 246 {
 247         pmd_t *pmd;
 248         unsigned long next;
 249
 250         pmd = pmd_offset(pud, addr);
 251         do {
 252                 next = pmd_addr_end(addr, end);
 253                 if (pmd_none_or_clear_bad(pmd))
 254                         continue;
 255                 if (check_pte_range(vma, pmd, addr, next, nodes,
 256                                     flags, private))
 257                         return -EIO;
 258         } while (pmd++, addr = next, addr != end);
 259         return 0;
 260 }
 261
 262 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 263                 unsigned long addr, unsigned long end,
 264                 const nodemask_t *nodes, unsigned long flags,
 265                 void *private)
 266 {
 267         pud_t *pud;
 268         unsigned long next;
 269
 270         pud = pud_offset(pgd, addr);
 271         do {
 272                 next = pud_addr_end(addr, end);
 273                 if (pud_none_or_clear_bad(pud))
 274                         continue;
 275                 if (check_pmd_range(vma, pud, addr, next, nodes,
 276                                     flags, private))
 277                         return -EIO;
 278         } while (pud++, addr = next, addr != end);
 279         return 0;
 280 }
 281
 282 static inline int check_pgd_range(struct vm_area_struct *vma,
 283                 unsigned long addr, unsigned long end,
 284                 const nodemask_t *nodes, unsigned long flags,
 285                 void *private)
 286 {
 287         pgd_t *pgd;
 288         unsigned long next;
 289
 290         pgd = pgd_offset(vma->vm_mm, addr);
 291         do {
 292                 next = pgd_addr_end(addr, end);
 293                 if (pgd_none_or_clear_bad(pgd))
 294                         continue;
 295                 if (check_pud_range(vma, pgd, addr, next, nodes,
 296                                     flags, private))
 297                         return -EIO;
 298         } while (pgd++, addr = next, addr != end);
 299         return 0;
 300 }
 301
 302 /* Check if a vma is migratable */
 303 static inline int vma_migratable(struct vm_area_struct *vma)
 304 {
 305         if (vma->vm_flags & (
 306                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 307                 return 0;
 308         return 1;
 309 }
 310
 311 /*
 312  * Check if all pages in a range are on a set of nodes.
 313  * If pagelist != NULL then isolate pages from the LRU and
 314  * put them on the pagelist.
 315  */
 316 static struct vm_area_struct *
 317 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 318                 const nodemask_t *nodes, unsigned long flags, void *private)
 319 {
 320         int err;
 321         struct vm_area_struct *first, *vma, *prev;
 322
 323         /* Clear the LRU lists so pages can be isolated */
 324         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 325                 lru_add_drain_all();
 326
 327         first = find_vma(mm, start);
 328         if (!first)
 329                 return ERR_PTR(-EFAULT);
 330         prev = NULL;
 331         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 332                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 333                         if (!vma->vm_next && vma->vm_end < end)
 334                                 return ERR_PTR(-EFAULT);
 335                         if (prev && prev->vm_end < vma->vm_start)
 336                                 return ERR_PTR(-EFAULT);
 337                 }
 338                 if (!is_vm_hugetlb_page(vma) &&
 339                     ((flags & MPOL_MF_STRICT) ||
 340                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 341                                 vma_migratable(vma)))) {
 342                         unsigned long endvma = vma->vm_end;
 343
 344                         if (endvma > end)
 345                                 endvma = end;
 346                         if (vma->vm_start > start)
 347                                 start = vma->vm_start;
 348                         err = check_pgd_range(vma, start, endvma, nodes,
 349                                                 flags, private);
 350                         if (err) {
 351                                 first = ERR_PTR(err);
 352                                 break;
 353                         }
 354                 }
 355                 prev = vma;
 356         }
 357         return first;
 358 }
 359
 360 /* Apply policy to a single VMA */
 361 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 362 {
 363         int err = 0;
 364         struct mempolicy *old = vma->vm_policy;
 365
 366         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 367                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 368                  vma->vm_ops, vma->vm_file,
 369                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 370
 371         if (vma->vm_ops && vma->vm_ops->set_policy)
 372                 err = vma->vm_ops->set_policy(vma, new);
 373         if (!err) {
 374                 mpol_get(new);
 375                 vma->vm_policy = new;
 376                 mpol_free(old);
 377         }
 378         return err;
 379 }
 380
 381 /* Step 2: apply policy to a range and do splits. */
 382 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 383                        unsigned long end, struct mempolicy *new)
 384 {
 385         struct vm_area_struct *next;
 386         int err;
 387
 388         err = 0;
 389         for (; vma && vma->vm_start < end; vma = next) {
 390                 next = vma->vm_next;
 391                 if (vma->vm_start < start)
 392                         err = split_vma(vma->vm_mm, vma, start, 1);
 393                 if (!err && vma->vm_end > end)
 394                         err = split_vma(vma->vm_mm, vma, end, 0);
 395                 if (!err)
 396                         err = policy_vma(vma, new);
 397                 if (err)
 398                         break;
 399         }
 400         return err;
 401 }
 402
 403 static int contextualize_policy(int mode, nodemask_t *nodes)
 404 {
 405         if (!nodes)
 406                 return 0;
 407
 408         cpuset_update_task_memory_state();
 409         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 410                 return -EINVAL;
 411         return mpol_check_policy(mode, nodes);
 412 }
 413
 414 /* Set the process memory policy */
 415 long do_set_mempolicy(int mode, nodemask_t *nodes)
 416 {
 417         struct mempolicy *new;
 418
 419         if (contextualize_policy(mode, nodes))
 420                 return -EINVAL;
 421         new = mpol_new(mode, nodes);
 422         if (IS_ERR(new))
 423                 return PTR_ERR(new);
 424         mpol_free(current->mempolicy);
 425         current->mempolicy = new;
 426         if (new && new->policy == MPOL_INTERLEAVE)
 427                 current->il_next = first_node(new->v.nodes);
 428         return 0;
 429 }
 430
 431 /* Fill a zone bitmap for a policy */
 432 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 433 {
 434         int i;
 435
 436         nodes_clear(*nodes);
 437         switch (p->policy) {
 438         case MPOL_BIND:
 439                 for (i = 0; p->v.zonelist->zones[i]; i++)
 440                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 441                                 *nodes);
 442                 break;
 443         case MPOL_DEFAULT:
 444                 break;
 445         case MPOL_INTERLEAVE:
 446                 *nodes = p->v.nodes;
 447                 break;
 448         case MPOL_PREFERRED:
 449                 /* or use current node instead of online map? */
 450                 if (p->v.preferred_node < 0)
 451                         *nodes = node_online_map;
 452                 else
 453                         node_set(p->v.preferred_node, *nodes);
 454                 break;
 455         default:
 456                 BUG();
 457         }
 458 }
 459
 460 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 461 {
 462         struct page *p;
 463         int err;
 464
 465         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 466         if (err >= 0) {
 467                 err = page_to_nid(p);
 468                 put_page(p);
 469         }
 470         return err;
 471 }
 472
 473 /* Retrieve NUMA policy */
 474 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 475                         unsigned long addr, unsigned long flags)
 476 {
 477         int err;
 478         struct mm_struct *mm = current->mm;
 479         struct vm_area_struct *vma = NULL;
 480         struct mempolicy *pol = current->mempolicy;
 481
 482         cpuset_update_task_memory_state();
 483         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 484                 return -EINVAL;
 485         if (flags & MPOL_F_ADDR) {
 486                 down_read(&mm->mmap_sem);
 487                 vma = find_vma_intersection(mm, addr, addr+1);
 488                 if (!vma) {
 489                         up_read(&mm->mmap_sem);
 490                         return -EFAULT;
 491                 }
 492                 if (vma->vm_ops && vma->vm_ops->get_policy)
 493                         pol = vma->vm_ops->get_policy(vma, addr);
 494                 else
 495                         pol = vma->vm_policy;
 496         } else if (addr)
 497                 return -EINVAL;
 498
 499         if (!pol)
 500                 pol = &default_policy;
 501
 502         if (flags & MPOL_F_NODE) {
 503                 if (flags & MPOL_F_ADDR) {
 504                         err = lookup_node(mm, addr);
 505                         if (err < 0)
 506                                 goto out;
 507                         *policy = err;
 508                 } else if (pol == current->mempolicy &&
 509                                 pol->policy == MPOL_INTERLEAVE) {
 510                         *policy = current->il_next;
 511                 } else {
 512                         err = -EINVAL;
 513                         goto out;
 514                 }
 515         } else
 516                 *policy = pol->policy;
 517
 518         if (vma) {
 519                 up_read(&current->mm->mmap_sem);
 520                 vma = NULL;
 521         }
 522
 523         err = 0;
 524         if (nmask)
 525                 get_zonemask(pol, nmask);
 526
 527  out:
 528         if (vma)
 529                 up_read(&current->mm->mmap_sem);
 530         return err;
 531 }
 532
 533 /*
 534  * page migration
 535  */
 536
 537 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 538                                 unsigned long flags)
 539 {
 540         /*
 541          * Avoid migrating a page that is shared with others.
 542          */
 543         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 544                 if (isolate_lru_page(page))
 545                         list_add(&page->lru, pagelist);
 546         }
 547 }
 548
 549 /*
 550  * Migrate the list 'pagelist' of pages to a certain destination.
 551  *
 552  * Specify destination with either non-NULL vma or dest_node >= 0
 553  * Return the number of pages not migrated or error code
 554  */
 555 static int migrate_pages_to(struct list_head *pagelist,
 556                         struct vm_area_struct *vma, int dest)
 557 {
 558         LIST_HEAD(newlist);
 559         LIST_HEAD(moved);
 560         LIST_HEAD(failed);
 561         int err = 0;
 562         int nr_pages;
 563         struct page *page;
 564         struct list_head *p;
 565
 566 redo:
 567         nr_pages = 0;
 568         list_for_each(p, pagelist) {
 569                 if (vma)
 570                         page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
 571                 else
 572                         page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 573
 574                 if (!page) {
 575                         err = -ENOMEM;
 576                         goto out;
 577                 }
 578                 list_add(&page->lru, &newlist);
 579                 nr_pages++;
 580                 if (nr_pages > MIGRATE_CHUNK_SIZE);
 581                         break;
 582         }
 583         err = migrate_pages(pagelist, &newlist, &moved, &failed);
 584
 585         putback_lru_pages(&moved);      /* Call release pages instead ?? */
 586
 587         if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
 588                 goto redo;
 589 out:
 590         /* Return leftover allocated pages */
 591         while (!list_empty(&newlist)) {
 592                 page = list_entry(newlist.next, struct page, lru);
 593                 list_del(&page->lru);
 594                 __free_page(page);
 595         }
 596         list_splice(&failed, pagelist);
 597         if (err < 0)
 598                 return err;
 599
 600         /* Calculate number of leftover pages */
 601         nr_pages = 0;
 602         list_for_each(p, pagelist)
 603                 nr_pages++;
 604         return nr_pages;
 605 }
 606
 607 /*
 608  * Migrate pages from one node to a target node.
 609  * Returns error or the number of pages not migrated.
 610  */
 611 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 612 {
 613         nodemask_t nmask;
 614         LIST_HEAD(pagelist);
 615         int err = 0;
 616
 617         nodes_clear(nmask);
 618         node_set(source, nmask);
 619
 620         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 621                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 622
 623         if (!list_empty(&pagelist)) {
 624                 err = migrate_pages_to(&pagelist, NULL, dest);
 625                 if (!list_empty(&pagelist))
 626                         putback_lru_pages(&pagelist);
 627         }
 628         return err;
 629 }
 630
 631 /*
 632  * Move pages between the two nodesets so as to preserve the physical
 633  * layout as much as possible.
 634  *
 635  * Returns the number of page that could not be moved.
 636  */
 637 int do_migrate_pages(struct mm_struct *mm,
 638         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 639 {
 640         LIST_HEAD(pagelist);
 641         int busy = 0;
 642         int err = 0;
 643         nodemask_t tmp;
 644
 645         down_read(&mm->mmap_sem);
 646
 647 /*
 648  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 649  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 650  * bit in 'tmp', and return that <source, dest> pair for migration.
 651  * The pair of nodemasks 'to' and 'from' define the map.
 652  *
 653  * If no pair of bits is found that way, fallback to picking some
 654  * pair of 'source' and 'dest' bits that are not the same.  If the
 655  * 'source' and 'dest' bits are the same, this represents a node
 656  * that will be migrating to itself, so no pages need move.
 657  *
 658  * If no bits are left in 'tmp', or if all remaining bits left
 659  * in 'tmp' correspond to the same bit in 'to', return false
 660  * (nothing left to migrate).
 661  *
 662  * This lets us pick a pair of nodes to migrate between, such that
 663  * if possible the dest node is not already occupied by some other
 664  * source node, minimizing the risk of overloading the memory on a
 665  * node that would happen if we migrated incoming memory to a node
 666  * before migrating outgoing memory source that same node.
 667  *
 668  * A single scan of tmp is sufficient.  As we go, we remember the
 669  * most recent <s, d> pair that moved (s != d).  If we find a pair
 670  * that not only moved, but what's better, moved to an empty slot
 671  * (d is not set in tmp), then we break out then, with that pair.
 672  * Otherwise when we finish scannng from_tmp, we at least have the
 673  * most recent <s, d> pair that moved.  If we get all the way through
 674  * the scan of tmp without finding any node that moved, much less
 675  * moved to an empty node, then there is nothing left worth migrating.
 676  */
 677
 678         tmp = *from_nodes;
 679         while (!nodes_empty(tmp)) {
 680                 int s,d;
 681                 int source = -1;
 682                 int dest = 0;
 683
 684                 for_each_node_mask(s, tmp) {
 685                         d = node_remap(s, *from_nodes, *to_nodes);
 686                         if (s == d)
 687                                 continue;
 688
 689                         source = s;     /* Node moved. Memorize */
 690                         dest = d;
 691
 692                         /* dest not in remaining from nodes? */
 693                         if (!node_isset(dest, tmp))
 694                                 break;
 695                 }
 696                 if (source == -1)
 697                         break;
 698
 699                 node_clear(source, tmp);
 700                 err = migrate_to_node(mm, source, dest, flags);
 701                 if (err > 0)
 702                         busy += err;
 703                 if (err < 0)
 704                         break;
 705         }
 706
 707         up_read(&mm->mmap_sem);
 708         if (err < 0)
 709                 return err;
 710         return busy;
 711 }
 712
 713 long do_mbind(unsigned long start, unsigned long len,
 714                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 715 {
 716         struct vm_area_struct *vma;
 717         struct mm_struct *mm = current->mm;
 718         struct mempolicy *new;
 719         unsigned long end;
 720         int err;
 721         LIST_HEAD(pagelist);
 722
 723         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 724                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 725             || mode > MPOL_MAX)
 726                 return -EINVAL;
 727         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 728                 return -EPERM;
 729
 730         if (start & ~PAGE_MASK)
 731                 return -EINVAL;
 732
 733         if (mode == MPOL_DEFAULT)
 734                 flags &= ~MPOL_MF_STRICT;
 735
 736         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 737         end = start + len;
 738
 739         if (end < start)
 740                 return -EINVAL;
 741         if (end == start)
 742                 return 0;
 743
 744         if (mpol_check_policy(mode, nmask))
 745                 return -EINVAL;
 746
 747         new = mpol_new(mode, nmask);
 748         if (IS_ERR(new))
 749                 return PTR_ERR(new);
 750
 751         /*
 752          * If we are using the default policy then operation
 753          * on discontinuous address spaces is okay after all
 754          */
 755         if (!new)
 756                 flags |= MPOL_MF_DISCONTIG_OK;
 757
 758         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 759                         mode,nodes_addr(nodes)[0]);
 760
 761         down_write(&mm->mmap_sem);
 762         vma = check_range(mm, start, end, nmask,
 763                           flags | MPOL_MF_INVERT, &pagelist);
 764
 765         err = PTR_ERR(vma);
 766         if (!IS_ERR(vma)) {
 767                 int nr_failed = 0;
 768
 769                 err = mbind_range(vma, start, end, new);
 770
 771                 if (!list_empty(&pagelist))
 772                         nr_failed = migrate_pages_to(&pagelist, vma, -1);
 773
 774                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 775                         err = -EIO;
 776         }
 777         if (!list_empty(&pagelist))
 778                 putback_lru_pages(&pagelist);
 779
 780         up_write(&mm->mmap_sem);
 781         mpol_free(new);
 782         return err;
 783 }
 784
 785 /*
 786  * User space interface with variable sized bitmaps for nodelists.
 787  */
 788
 789 /* Copy a node mask from user space. */
 790 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 791                      unsigned long maxnode)
 792 {
 793         unsigned long k;
 794         unsigned long nlongs;
 795         unsigned long endmask;
 796
 797         --maxnode;
 798         nodes_clear(*nodes);
 799         if (maxnode == 0 || !nmask)
 800                 return 0;
 801
 802         nlongs = BITS_TO_LONGS(maxnode);
 803         if ((maxnode % BITS_PER_LONG) == 0)
 804                 endmask = ~0UL;
 805         else
 806                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 807
 808         /* When the user specified more nodes than supported just check
 809            if the non supported part is all zero. */
 810         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 811                 if (nlongs > PAGE_SIZE/sizeof(long))
 812                         return -EINVAL;
 813                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 814                         unsigned long t;
 815                         if (get_user(t, nmask + k))
 816                                 return -EFAULT;
 817                         if (k == nlongs - 1) {
 818                                 if (t & endmask)
 819                                         return -EINVAL;
 820                         } else if (t)
 821                                 return -EINVAL;
 822                 }
 823                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 824                 endmask = ~0UL;
 825         }
 826
 827         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 828                 return -EFAULT;
 829         nodes_addr(*nodes)[nlongs-1] &= endmask;
 830         return 0;
 831 }
 832
 833 /* Copy a kernel node mask to user space */
 834 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 835                               nodemask_t *nodes)
 836 {
 837         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 838         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 839
 840         if (copy > nbytes) {
 841                 if (copy > PAGE_SIZE)
 842                         return -EINVAL;
 843                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 844                         return -EFAULT;
 845                 copy = nbytes;
 846         }
 847         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 848 }
 849
 850 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 851                         unsigned long mode,
 852                         unsigned long __user *nmask, unsigned long maxnode,
 853                         unsigned flags)
 854 {
 855         nodemask_t nodes;
 856         int err;
 857
 858         err = get_nodes(&nodes, nmask, maxnode);
 859         if (err)
 860                 return err;
 861         return do_mbind(start, len, mode, &nodes, flags);
 862 }
 863
 864 /* Set the process memory policy */
 865 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 866                 unsigned long maxnode)
 867 {
 868         int err;
 869         nodemask_t nodes;
 870
 871         if (mode < 0 || mode > MPOL_MAX)
 872                 return -EINVAL;
 873         err = get_nodes(&nodes, nmask, maxnode);
 874         if (err)
 875                 return err;
 876         return do_set_mempolicy(mode, &nodes);
 877 }
 878
 879 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 880                 const unsigned long __user *old_nodes,
 881                 const unsigned long __user *new_nodes)
 882 {
 883         struct mm_struct *mm;
 884         struct task_struct *task;
 885         nodemask_t old;
 886         nodemask_t new;
 887         nodemask_t task_nodes;
 888         int err;
 889
 890         err = get_nodes(&old, old_nodes, maxnode);
 891         if (err)
 892                 return err;
 893
 894         err = get_nodes(&new, new_nodes, maxnode);
 895         if (err)
 896                 return err;
 897
 898         /* Find the mm_struct */
 899         read_lock(&tasklist_lock);
 900         task = pid ? find_task_by_pid(pid) : current;
 901         if (!task) {
 902                 read_unlock(&tasklist_lock);
 903                 return -ESRCH;
 904         }
 905         mm = get_task_mm(task);
 906         read_unlock(&tasklist_lock);
 907
 908         if (!mm)
 909                 return -EINVAL;
 910
 911         /*
 912          * Check if this process has the right to modify the specified
 913          * process. The right exists if the process has administrative
 914          * capabilities, superuser priviledges or the same
 915          * userid as the target process.
 916          */
 917         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 918             (current->uid != task->suid) && (current->uid != task->uid) &&
 919             !capable(CAP_SYS_ADMIN)) {
 920                 err = -EPERM;
 921                 goto out;
 922         }
 923
 924         task_nodes = cpuset_mems_allowed(task);
 925         /* Is the user allowed to access the target nodes? */
 926         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 927                 err = -EPERM;
 928                 goto out;
 929         }
 930
 931         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 932 out:
 933         mmput(mm);
 934         return err;
 935 }
 936
 937
 938 /* Retrieve NUMA policy */
 939 asmlinkage long sys_get_mempolicy(int __user *policy,
 940                                 unsigned long __user *nmask,
 941                                 unsigned long maxnode,
 942                                 unsigned long addr, unsigned long flags)
 943 {
 944         int err, pval;
 945         nodemask_t nodes;
 946
 947         if (nmask != NULL && maxnode < MAX_NUMNODES)
 948                 return -EINVAL;
 949
 950         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 951
 952         if (err)
 953                 return err;
 954
 955         if (policy && put_user(pval, policy))
 956                 return -EFAULT;
 957
 958         if (nmask)
 959                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 960
 961         return err;
 962 }
 963
 964 #ifdef CONFIG_COMPAT
 965
 966 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 967                                      compat_ulong_t __user *nmask,
 968                                      compat_ulong_t maxnode,
 969                                      compat_ulong_t addr, compat_ulong_t flags)
 970 {
 971         long err;
 972         unsigned long __user *nm = NULL;
 973         unsigned long nr_bits, alloc_size;
 974         DECLARE_BITMAP(bm, MAX_NUMNODES);
 975
 976         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 977         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 978
 979         if (nmask)
 980                 nm = compat_alloc_user_space(alloc_size);
 981
 982         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 983
 984         if (!err && nmask) {
 985                 err = copy_from_user(bm, nm, alloc_size);
 986                 /* ensure entire bitmap is zeroed */
 987                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 988                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 989         }
 990
 991         return err;
 992 }
 993
 994 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 995                                      compat_ulong_t maxnode)
 996 {
 997         long err = 0;
 998         unsigned long __user *nm = NULL;
 999         unsigned long nr_bits, alloc_size;
1000         DECLARE_BITMAP(bm, MAX_NUMNODES);
1001
1002         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1003         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1004
1005         if (nmask) {
1006                 err = compat_get_bitmap(bm, nmask, nr_bits);
1007                 nm = compat_alloc_user_space(alloc_size);
1008                 err |= copy_to_user(nm, bm, alloc_size);
1009         }
1010
1011         if (err)
1012                 return -EFAULT;
1013
1014         return sys_set_mempolicy(mode, nm, nr_bits+1);
1015 }
1016
1017 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1018                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1019                              compat_ulong_t maxnode, compat_ulong_t flags)
1020 {
1021         long err = 0;
1022         unsigned long __user *nm = NULL;
1023         unsigned long nr_bits, alloc_size;
1024         nodemask_t bm;
1025
1026         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1027         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1028
1029         if (nmask) {
1030                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1031                 nm = compat_alloc_user_space(alloc_size);
1032                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1033         }
1034
1035         if (err)
1036                 return -EFAULT;
1037
1038         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1039 }
1040
1041 #endif
1042
1043 /* Return effective policy for a VMA */
1044 static struct mempolicy * get_vma_policy(struct task_struct *task,
1045                 struct vm_area_struct *vma, unsigned long addr)
1046 {
1047         struct mempolicy *pol = task->mempolicy;
1048
1049         if (vma) {
1050                 if (vma->vm_ops && vma->vm_ops->get_policy)
1051                         pol = vma->vm_ops->get_policy(vma, addr);
1052                 else if (vma->vm_policy &&
1053                                 vma->vm_policy->policy != MPOL_DEFAULT)
1054                         pol = vma->vm_policy;
1055         }
1056         if (!pol)
1057                 pol = &default_policy;
1058         return pol;
1059 }
1060
1061 /* Return a zonelist representing a mempolicy */
1062 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1063 {
1064         int nd;
1065
1066         switch (policy->policy) {
1067         case MPOL_PREFERRED:
1068                 nd = policy->v.preferred_node;
1069                 if (nd < 0)
1070                         nd = numa_node_id();
1071                 break;
1072         case MPOL_BIND:
1073                 /* Lower zones don't get a policy applied */
1074                 /* Careful: current->mems_allowed might have moved */
1075                 if (gfp_zone(gfp) >= policy_zone)
1076                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1077                                 return policy->v.zonelist;
1078                 /*FALL THROUGH*/
1079         case MPOL_INTERLEAVE: /* should not happen */
1080         case MPOL_DEFAULT:
1081                 nd = numa_node_id();
1082                 break;
1083         default:
1084                 nd = 0;
1085                 BUG();
1086         }
1087         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1088 }
1089
1090 /* Do dynamic interleaving for a process */
1091 static unsigned interleave_nodes(struct mempolicy *policy)
1092 {
1093         unsigned nid, next;
1094         struct task_struct *me = current;
1095
1096         nid = me->il_next;
1097         next = next_node(nid, policy->v.nodes);
1098         if (next >= MAX_NUMNODES)
1099                 next = first_node(policy->v.nodes);
1100         me->il_next = next;
1101         return nid;
1102 }
1103
1104 /*
1105  * Depending on the memory policy provide a node from which to allocate the
1106  * next slab entry.
1107  */
1108 unsigned slab_node(struct mempolicy *policy)
1109 {
1110         switch (policy->policy) {
1111         case MPOL_INTERLEAVE:
1112                 return interleave_nodes(policy);
1113
1114         case MPOL_BIND:
1115                 /*
1116                  * Follow bind policy behavior and start allocation at the
1117                  * first node.
1118                  */
1119                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1120
1121         case MPOL_PREFERRED:
1122                 if (policy->v.preferred_node >= 0)
1123                         return policy->v.preferred_node;
1124                 /* Fall through */
1125
1126         default:
1127                 return numa_node_id();
1128         }
1129 }
1130
1131 /* Do static interleaving for a VMA with known offset. */
1132 static unsigned offset_il_node(struct mempolicy *pol,
1133                 struct vm_area_struct *vma, unsigned long off)
1134 {
1135         unsigned nnodes = nodes_weight(pol->v.nodes);
1136         unsigned target = (unsigned)off % nnodes;
1137         int c;
1138         int nid = -1;
1139
1140         c = 0;
1141         do {
1142                 nid = next_node(nid, pol->v.nodes);
1143                 c++;
1144         } while (c <= target);
1145         return nid;
1146 }
1147
1148 /* Determine a node number for interleave */
1149 static inline unsigned interleave_nid(struct mempolicy *pol,
1150                  struct vm_area_struct *vma, unsigned long addr, int shift)
1151 {
1152         if (vma) {
1153                 unsigned long off;
1154
1155                 off = vma->vm_pgoff;
1156                 off += (addr - vma->vm_start) >> shift;
1157                 return offset_il_node(pol, vma, off);
1158         } else
1159                 return interleave_nodes(pol);
1160 }
1161
1162 /* Return a zonelist suitable for a huge page allocation. */
1163 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1164 {
1165         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1166
1167         if (pol->policy == MPOL_INTERLEAVE) {
1168                 unsigned nid;
1169
1170                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1171                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1172         }
1173         return zonelist_policy(GFP_HIGHUSER, pol);
1174 }
1175
1176 /* Allocate a page in interleaved policy.
1177    Own path because it needs to do special accounting. */
1178 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1179                                         unsigned nid)
1180 {
1181         struct zonelist *zl;
1182         struct page *page;
1183
1184         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1185         page = __alloc_pages(gfp, order, zl);
1186         if (page && page_zone(page) == zl->zones[0]) {
1187                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1188                 put_cpu();
1189         }
1190         return page;
1191 }
1192
1193 /**
1194  *      alloc_page_vma  - Allocate a page for a VMA.
1195  *
1196  *      @gfp:
1197  *      %GFP_USER    user allocation.
1198  *      %GFP_KERNEL  kernel allocations,
1199  *      %GFP_HIGHMEM highmem/user allocations,
1200  *      %GFP_FS      allocation should not call back into a file system.
1201  *      %GFP_ATOMIC  don't sleep.
1202  *
1203  *      @vma:  Pointer to VMA or NULL if not available.
1204  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1205  *
1206  *      This function allocates a page from the kernel page pool and applies
1207  *      a NUMA policy associated with the VMA or the current process.
1208  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1209  *      mm_struct of the VMA to prevent it from going away. Should be used for
1210  *      all allocations for pages that will be mapped into
1211  *      user space. Returns NULL when no page can be allocated.
1212  *
1213  *      Should be called with the mm_sem of the vma hold.
1214  */
1215 struct page *
1216 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1217 {
1218         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1219
1220         cpuset_update_task_memory_state();
1221
1222         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1223                 unsigned nid;
1224
1225                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1226                 return alloc_page_interleave(gfp, 0, nid);
1227         }
1228         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1229 }
1230
1231 /**
1232  *      alloc_pages_current - Allocate pages.
1233  *
1234  *      @gfp:
1235  *              %GFP_USER   user allocation,
1236  *              %GFP_KERNEL kernel allocation,
1237  *              %GFP_HIGHMEM highmem allocation,
1238  *              %GFP_FS     don't call back into a file system.
1239  *              %GFP_ATOMIC don't sleep.
1240  *      @order: Power of two of allocation size in pages. 0 is a single page.
1241  *
1242  *      Allocate a page from the kernel page pool.  When not in
1243  *      interrupt context and apply the current process NUMA policy.
1244  *      Returns NULL when no page can be allocated.
1245  *
1246  *      Don't call cpuset_update_task_memory_state() unless
1247  *      1) it's ok to take cpuset_sem (can WAIT), and
1248  *      2) allocating for current task (not interrupt).
1249  */
1250 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1251 {
1252         struct mempolicy *pol = current->mempolicy;
1253
1254         if ((gfp & __GFP_WAIT) && !in_interrupt())
1255                 cpuset_update_task_memory_state();
1256         if (!pol || in_interrupt())
1257                 pol = &default_policy;
1258         if (pol->policy == MPOL_INTERLEAVE)
1259                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1260         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1261 }
1262 EXPORT_SYMBOL(alloc_pages_current);
1263
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
void *cpuset_being_rebound;
1272
1273 /* Slow path of a mempolicy copy */
1274 struct mempolicy *__mpol_copy(struct mempolicy *old)
1275 {
1276         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1277
1278         if (!new)
1279                 return ERR_PTR(-ENOMEM);
1280         if (current_cpuset_is_being_rebound()) {
1281                 nodemask_t mems = cpuset_mems_allowed(current);
1282                 mpol_rebind_policy(old, &mems);
1283         }
1284         *new = *old;
1285         atomic_set(&new->refcnt, 1);
1286         if (new->policy == MPOL_BIND) {
1287                 int sz = ksize(old->v.zonelist);
1288                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1289                 if (!new->v.zonelist) {
1290                         kmem_cache_free(policy_cache, new);
1291                         return ERR_PTR(-ENOMEM);
1292                 }
1293                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1294         }
1295         return new;
1296 }
1297
1298 /* Slow path of a mempolicy comparison */
1299 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1300 {
1301         if (!a || !b)
1302                 return 0;
1303         if (a->policy != b->policy)
1304                 return 0;
1305         switch (a->policy) {
1306         case MPOL_DEFAULT:
1307                 return 1;
1308         case MPOL_INTERLEAVE:
1309                 return nodes_equal(a->v.nodes, b->v.nodes);
1310         case MPOL_PREFERRED:
1311                 return a->v.preferred_node == b->v.preferred_node;
1312         case MPOL_BIND: {
1313                 int i;
1314                 for (i = 0; a->v.zonelist->zones[i]; i++)
1315                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1316                                 return 0;
1317                 return b->v.zonelist->zones[i] == NULL;
1318         }
1319         default:
1320                 BUG();
1321                 return 0;
1322         }
1323 }
1324
1325 /* Slow path of a mpol destructor. */
1326 void __mpol_free(struct mempolicy *p)
1327 {
1328         if (!atomic_dec_and_test(&p->refcnt))
1329                 return;
1330         if (p->policy == MPOL_BIND)
1331                 kfree(p->v.zonelist);
1332         p->policy = MPOL_DEFAULT;
1333         kmem_cache_free(policy_cache, p);
1334 }
1335
1336 /*
1337  * Shared memory backing store policy support.
1338  *
1339  * Remember policies even when nobody has shared memory mapped.
1340  * The policies are kept in Red-Black tree linked from the inode.
1341  * They are protected by the sp->lock spinlock, which should be held
1342  * for any accesses to the tree.
1343  */
1344
1345 /* lookup first element intersecting start-end */
1346 /* Caller holds sp->lock */
1347 static struct sp_node *
1348 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1349 {
1350         struct rb_node *n = sp->root.rb_node;
1351
1352         while (n) {
1353                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1354
1355                 if (start >= p->end)
1356                         n = n->rb_right;
1357                 else if (end <= p->start)
1358                         n = n->rb_left;
1359                 else
1360                         break;
1361         }
1362         if (!n)
1363                 return NULL;
1364         for (;;) {
1365                 struct sp_node *w = NULL;
1366                 struct rb_node *prev = rb_prev(n);
1367                 if (!prev)
1368                         break;
1369                 w = rb_entry(prev, struct sp_node, nd);
1370                 if (w->end <= start)
1371                         break;
1372                 n = prev;
1373         }
1374         return rb_entry(n, struct sp_node, nd);
1375 }
1376
1377 /* Insert a new shared policy into the list. */
1378 /* Caller holds sp->lock */
1379 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1380 {
1381         struct rb_node **p = &sp->root.rb_node;
1382         struct rb_node *parent = NULL;
1383         struct sp_node *nd;
1384
1385         while (*p) {
1386                 parent = *p;
1387                 nd = rb_entry(parent, struct sp_node, nd);
1388                 if (new->start < nd->start)
1389                         p = &(*p)->rb_left;
1390                 else if (new->end > nd->end)
1391                         p = &(*p)->rb_right;
1392                 else
1393                         BUG();
1394         }
1395         rb_link_node(&new->nd, parent, p);
1396         rb_insert_color(&new->nd, &sp->root);
1397         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1398                  new->policy ? new->policy->policy : 0);
1399 }
1400
1401 /* Find shared policy intersecting idx */
1402 struct mempolicy *
1403 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1404 {
1405         struct mempolicy *pol = NULL;
1406         struct sp_node *sn;
1407
1408         if (!sp->root.rb_node)
1409                 return NULL;
1410         spin_lock(&sp->lock);
1411         sn = sp_lookup(sp, idx, idx+1);
1412         if (sn) {
1413                 mpol_get(sn->policy);
1414                 pol = sn->policy;
1415         }
1416         spin_unlock(&sp->lock);
1417         return pol;
1418 }
1419
1420 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1421 {
1422         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1423         rb_erase(&n->nd, &sp->root);
1424         mpol_free(n->policy);
1425         kmem_cache_free(sn_cache, n);
1426 }
1427
1428 struct sp_node *
1429 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1430 {
1431         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1432
1433         if (!n)
1434                 return NULL;
1435         n->start = start;
1436         n->end = end;
1437         mpol_get(pol);
1438         n->policy = pol;
1439         return n;
1440 }
1441
1442 /* Replace a policy range. */
1443 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1444                                  unsigned long end, struct sp_node *new)
1445 {
1446         struct sp_node *n, *new2 = NULL;
1447
1448 restart:
1449         spin_lock(&sp->lock);
1450         n = sp_lookup(sp, start, end);
1451         /* Take care of old policies in the same range. */
1452         while (n && n->start < end) {
1453                 struct rb_node *next = rb_next(&n->nd);
1454                 if (n->start >= start) {
1455                         if (n->end <= end)
1456                                 sp_delete(sp, n);
1457                         else
1458                                 n->start = end;
1459                 } else {
1460                         /* Old policy spanning whole new range. */
1461                         if (n->end > end) {
1462                                 if (!new2) {
1463                                         spin_unlock(&sp->lock);
1464                                         new2 = sp_alloc(end, n->end, n->policy);
1465                                         if (!new2)
1466                                                 return -ENOMEM;
1467                                         goto restart;
1468                                 }
1469                                 n->end = start;
1470                                 sp_insert(sp, new2);
1471                                 new2 = NULL;
1472                                 break;
1473                         } else
1474                                 n->end = start;
1475                 }
1476                 if (!next)
1477                         break;
1478                 n = rb_entry(next, struct sp_node, nd);
1479         }
1480         if (new)
1481                 sp_insert(sp, new);
1482         spin_unlock(&sp->lock);
1483         if (new2) {
1484                 mpol_free(new2->policy);
1485                 kmem_cache_free(sn_cache, new2);
1486         }
1487         return 0;
1488 }
1489
1490 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1491                                 nodemask_t *policy_nodes)
1492 {
1493         info->root = RB_ROOT;
1494         spin_lock_init(&info->lock);
1495
1496         if (policy != MPOL_DEFAULT) {
1497                 struct mempolicy *newpol;
1498
1499                 /* Falls back to MPOL_DEFAULT on any error */
1500                 newpol = mpol_new(policy, policy_nodes);
1501                 if (!IS_ERR(newpol)) {
1502                         /* Create pseudo-vma that contains just the policy */
1503                         struct vm_area_struct pvma;
1504
1505                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1506                         /* Policy covers entire file */
1507                         pvma.vm_end = TASK_SIZE;
1508                         mpol_set_shared_policy(info, &pvma, newpol);
1509                         mpol_free(newpol);
1510                 }
1511         }
1512 }
1513
1514 int mpol_set_shared_policy(struct shared_policy *info,
1515                         struct vm_area_struct *vma, struct mempolicy *npol)
1516 {
1517         int err;
1518         struct sp_node *new = NULL;
1519         unsigned long sz = vma_pages(vma);
1520
1521         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1522                  vma->vm_pgoff,
1523                  sz, npol? npol->policy : -1,
1524                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1525
1526         if (npol) {
1527                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1528                 if (!new)
1529                         return -ENOMEM;
1530         }
1531         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1532         if (err && new)
1533                 kmem_cache_free(sn_cache, new);
1534         return err;
1535 }
1536
1537 /* Free a backing policy store on inode delete. */
1538 void mpol_free_shared_policy(struct shared_policy *p)
1539 {
1540         struct sp_node *n;
1541         struct rb_node *next;
1542
1543         if (!p->root.rb_node)
1544                 return;
1545         spin_lock(&p->lock);
1546         next = rb_first(&p->root);
1547         while (next) {
1548                 n = rb_entry(next, struct sp_node, nd);
1549                 next = rb_next(&n->nd);
1550                 rb_erase(&n->nd, &p->root);
1551                 mpol_free(n->policy);
1552                 kmem_cache_free(sn_cache, n);
1553         }
1554         spin_unlock(&p->lock);
1555 }
1556
1557 /* assumes fs == KERNEL_DS */
1558 void __init numa_policy_init(void)
1559 {
1560         policy_cache = kmem_cache_create("numa_policy",
1561                                          sizeof(struct mempolicy),
1562                                          0, SLAB_PANIC, NULL, NULL);
1563
1564         sn_cache = kmem_cache_create("shared_policy_node",
1565                                      sizeof(struct sp_node),
1566                                      0, SLAB_PANIC, NULL, NULL);
1567
1568         /* Set interleaving policy for system init. This way not all
1569            the data structures allocated at system boot end up in node zero. */
1570
1571         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1572                 printk("numa_policy_init: interleaving failed\n");
1573 }
1574
1575 /* Reset policy of current process to default */
1576 void numa_default_policy(void)
1577 {
1578         do_set_mempolicy(MPOL_DEFAULT, NULL);
1579 }
1580
1581 /* Migrate a policy to a different set of nodes */
1582 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1583 {
1584         nodemask_t *mpolmask;
1585         nodemask_t tmp;
1586
1587         if (!pol)
1588                 return;
1589         mpolmask = &pol->cpuset_mems_allowed;
1590         if (nodes_equal(*mpolmask, *newmask))
1591                 return;
1592
1593         switch (pol->policy) {
1594         case MPOL_DEFAULT:
1595                 break;
1596         case MPOL_INTERLEAVE:
1597                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1598                 pol->v.nodes = tmp;
1599                 *mpolmask = *newmask;
1600                 current->il_next = node_remap(current->il_next,
1601                                                 *mpolmask, *newmask);
1602                 break;
1603         case MPOL_PREFERRED:
1604                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1605                                                 *mpolmask, *newmask);
1606                 *mpolmask = *newmask;
1607                 break;
1608         case MPOL_BIND: {
1609                 nodemask_t nodes;
1610                 struct zone **z;
1611                 struct zonelist *zonelist;
1612
1613                 nodes_clear(nodes);
1614                 for (z = pol->v.zonelist->zones; *z; z++)
1615                         node_set((*z)->zone_pgdat->node_id, nodes);
1616                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1617                 nodes = tmp;
1618
1619                 zonelist = bind_zonelist(&nodes);
1620
1621                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1622                  * If that old zonelist has no remaining mems_allowed nodes,
1623                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1624                  */
1625
1626                 if (zonelist) {
1627                         /* Good - got mem - substitute new zonelist */
1628                         kfree(pol->v.zonelist);
1629                         pol->v.zonelist = zonelist;
1630                 }
1631                 *mpolmask = *newmask;
1632                 break;
1633         }
1634         default:
1635                 BUG();
1636                 break;
1637         }
1638 }
1639
1640 /*
1641  * Wrapper for mpol_rebind_policy() that just requires task
1642  * pointer, and updates task mempolicy.
1643  */
1644
1645 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1646 {
1647         mpol_rebind_policy(tsk->mempolicy, new);
1648 }
1649
1650 /*
1651  * Rebind each vma in mm to new nodemask.
1652  *
1653  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1654  */
1655
1656 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1657 {
1658         struct vm_area_struct *vma;
1659
1660         down_write(&mm->mmap_sem);
1661         for (vma = mm->mmap; vma; vma = vma->vm_next)
1662                 mpol_rebind_policy(vma->vm_policy, new);
1663         up_write(&mm->mmap_sem);
1664 }
1665
1666 /*
1667  * Display pages allocated per node and memory policy via /proc.
1668  */
1669
/* Policy names for mpol_to_str(), indexed by the policy mode value. */
static const char *policy_types[] = { "default", "prefer", "bind",
                                      "interleave" };
1672
1673 /*
1674  * Convert a mempolicy into a string.
1675  * Returns the number of characters in buffer (if positive)
1676  * or an error (negative)
1677  */
1678 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1679 {
1680         char *p = buffer;
1681         int l;
1682         nodemask_t nodes;
1683         int mode = pol ? pol->policy : MPOL_DEFAULT;
1684
1685         switch (mode) {
1686         case MPOL_DEFAULT:
1687                 nodes_clear(nodes);
1688                 break;
1689
1690         case MPOL_PREFERRED:
1691                 nodes_clear(nodes);
1692                 node_set(pol->v.preferred_node, nodes);
1693                 break;
1694
1695         case MPOL_BIND:
1696                 get_zonemask(pol, &nodes);
1697                 break;
1698
1699         case MPOL_INTERLEAVE:
1700                 nodes = pol->v.nodes;
1701                 break;
1702
1703         default:
1704                 BUG();
1705                 return -EFAULT;
1706         }
1707
1708         l = strlen(policy_types[mode]);
1709         if (buffer + maxlen < p + l + 1)
1710                 return -ENOSPC;
1711
1712         strcpy(p, policy_types[mode]);
1713         p += l;
1714
1715         if (!nodes_empty(nodes)) {
1716                 if (buffer + maxlen < p + 2)
1717                         return -ENOSPC;
1718                 *p++ = '=';
1719                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1720         }
1721         return p - buffer;
1722 }
1723
1724 struct numa_maps {
1725         unsigned long pages;
1726         unsigned long anon;
1727         unsigned long mapped;
1728         unsigned long mapcount_max;
1729         unsigned long node[MAX_NUMNODES];
1730 };
1731
1732 static void gather_stats(struct page *page, void *private)
1733 {
1734         struct numa_maps *md = private;
1735         int count = page_mapcount(page);
1736
1737         if (count)
1738                 md->mapped++;
1739
1740         if (count > md->mapcount_max)
1741                 md->mapcount_max = count;
1742
1743         md->pages++;
1744
1745         if (PageAnon(page))
1746                 md->anon++;
1747
1748         md->node[page_to_nid(page)]++;
1749         cond_resched();
1750 }
1751
1752 int show_numa_map(struct seq_file *m, void *v)
1753 {
1754         struct task_struct *task = m->private;
1755         struct vm_area_struct *vma = v;
1756         struct numa_maps *md;
1757         int n;
1758         char buffer[50];
1759
1760         if (!vma->vm_mm)
1761                 return 0;
1762
1763         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1764         if (!md)
1765                 return 0;
1766
1767         check_pgd_range(vma, vma->vm_start, vma->vm_end,
1768                     &node_online_map, MPOL_MF_STATS, md);
1769
1770         if (md->pages) {
1771                 mpol_to_str(buffer, sizeof(buffer),
1772                             get_vma_policy(task, vma, vma->vm_start));
1773
1774                 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1775                            vma->vm_start, buffer, md->pages,
1776                            md->mapped, md->mapcount_max);
1777
1778                 if (md->anon)
1779                         seq_printf(m," anon=%lu",md->anon);
1780
1781                 for_each_online_node(n)
1782                         if (md->node[n])
1783                                 seq_printf(m, " N%d=%lu", n, md->node[n]);
1784
1785                 seq_putc(m, '\n');
1786         }
1787         kfree(md);
1788
1789         if (m->count < m->size)
1790                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1791         return 0;
1792 }
1793