mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/slab.h>
  77 #include <linux/string.h>
  78 #include <linux/export.h>
  79 #include <linux/nsproxy.h>
  80 #include <linux/interrupt.h>
  81 #include <linux/init.h>
  82 #include <linux/compat.h>
  83 #include <linux/swap.h>
  84 #include <linux/seq_file.h>
  85 #include <linux/proc_fs.h>
  86 #include <linux/migrate.h>
  87 #include <linux/ksm.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91 #include <linux/ctype.h>
  92 #include <linux/mm_inline.h>
  93
  94 #include <asm/tlbflush.h>
  95 #include <asm/uaccess.h>
  96 #include <linux/random.h>
  97
  98 #include "internal.h"
  99
 100 /* Internal flags */
 101 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 102 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 103
 104 static struct kmem_cache *policy_cache;
 105 static struct kmem_cache *sn_cache;
 106
 107 /* Highest zone. An specific allocation for a zone below that is not
 108    policied. */
 109 enum zone_type policy_zone = 0;
 110
 111 /*
 112  * run-time system-wide default policy => local allocation
 113  */
 114 static struct mempolicy default_policy = {
 115         .refcnt = ATOMIC_INIT(1), /* never free it */
 116         .mode = MPOL_PREFERRED,
 117         .flags = MPOL_F_LOCAL,
 118 };
 119
 120 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 121
 122 static struct mempolicy *get_task_policy(struct task_struct *p)
 123 {
 124         struct mempolicy *pol = p->mempolicy;
 125         int node;
 126
 127         if (!pol) {
 128                 node = tsk_home_node(p);
 129                 if (node != -1)
 130                         pol = &preferred_node_policy[node];
 131         }
 132
 133         return pol;
 134 }
 135
 136 static const struct mempolicy_operations {
 137         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 138         /*
 139          * If read-side task has no lock to protect task->mempolicy, write-side
 140          * task will rebind the task->mempolicy by two step. The first step is
 141          * setting all the newly nodes, and the second step is cleaning all the
 142          * disallowed nodes. In this way, we can avoid finding no node to alloc
 143          * page.
 144          * If we have a lock to protect task->mempolicy in read-side, we do
 145          * rebind directly.
 146          *
 147          * step:
 148          *      MPOL_REBIND_ONCE - do rebind work at once
 149          *      MPOL_REBIND_STEP1 - set all the newly nodes
 150          *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 151          */
 152         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 153                         enum mpol_rebind_step step);
 154 } mpol_ops[MPOL_MAX];
 155
 156 /* Check that the nodemask contains at least one populated zone */
 157 static int is_valid_nodemask(const nodemask_t *nodemask)
 158 {
 159         int nd, k;
 160
 161         for_each_node_mask(nd, *nodemask) {
 162                 struct zone *z;
 163
 164                 for (k = 0; k <= policy_zone; k++) {
 165                         z = &NODE_DATA(nd)->node_zones[k];
 166                         if (z->present_pages > 0)
 167                                 return 1;
 168                 }
 169         }
 170
 171         return 0;
 172 }
 173
 174 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 175 {
 176         return pol->flags & MPOL_MODE_FLAGS;
 177 }
 178
 179 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 180                                    const nodemask_t *rel)
 181 {
 182         nodemask_t tmp;
 183         nodes_fold(tmp, *orig, nodes_weight(*rel));
 184         nodes_onto(*ret, tmp, *rel);
 185 }
 186
 187 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 188 {
 189         if (nodes_empty(*nodes))
 190                 return -EINVAL;
 191         pol->v.nodes = *nodes;
 192         return 0;
 193 }
 194
 195 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 196 {
 197         if (!nodes)
 198                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 199         else if (nodes_empty(*nodes))
 200                 return -EINVAL;                 /*  no allowed nodes */
 201         else
 202                 pol->v.preferred_node = first_node(*nodes);
 203         return 0;
 204 }
 205
 206 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 207 {
 208         if (!is_valid_nodemask(nodes))
 209                 return -EINVAL;
 210         pol->v.nodes = *nodes;
 211         return 0;
 212 }
 213
 214 /*
 215  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 216  * any, for the new policy.  mpol_new() has already validated the nodes
 217  * parameter with respect to the policy mode and flags.  But, we need to
 218  * handle an empty nodemask with MPOL_PREFERRED here.
 219  *
 220  * Must be called holding task's alloc_lock to protect task's mems_allowed
 221  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 222  */
 223 static int mpol_set_nodemask(struct mempolicy *pol,
 224                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 225 {
 226         int ret;
 227
 228         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 229         if (pol == NULL)
 230                 return 0;
 231         /* Check N_HIGH_MEMORY */
 232         nodes_and(nsc->mask1,
 233                   cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
 234
 235         VM_BUG_ON(!nodes);
 236         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 237                 nodes = NULL;   /* explicit local allocation */
 238         else {
 239                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 240                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 241                 else
 242                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 243
 244                 if (mpol_store_user_nodemask(pol))
 245                         pol->w.user_nodemask = *nodes;
 246                 else
 247                         pol->w.cpuset_mems_allowed =
 248                                                 cpuset_current_mems_allowed;
 249         }
 250
 251         if (nodes)
 252                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 253         else
 254                 ret = mpol_ops[pol->mode].create(pol, NULL);
 255         return ret;
 256 }
 257
 258 /*
 259  * This function just creates a new policy, does some check and simple
 260  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 261  */
 262 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 263                                   nodemask_t *nodes)
 264 {
 265         struct mempolicy *policy;
 266
 267         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 268                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 269
 270         if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) {
 271                 if (nodes && !nodes_empty(*nodes))
 272                         return ERR_PTR(-EINVAL);
 273                 return NULL;
 274         }
 275         VM_BUG_ON(!nodes);
 276
 277         /*
 278          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 279          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 280          * All other modes require a valid pointer to a non-empty nodemask.
 281          */
 282         if (mode == MPOL_PREFERRED) {
 283                 if (nodes_empty(*nodes)) {
 284                         if (((flags & MPOL_F_STATIC_NODES) ||
 285                              (flags & MPOL_F_RELATIVE_NODES)))
 286                                 return ERR_PTR(-EINVAL);
 287                 }
 288         } else if (mode == MPOL_LOCAL) {
 289                 if (!nodes_empty(*nodes))
 290                         return ERR_PTR(-EINVAL);
 291                 mode = MPOL_PREFERRED;
 292         } else if (nodes_empty(*nodes))
 293                 return ERR_PTR(-EINVAL);
 294         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 295         if (!policy)
 296                 return ERR_PTR(-ENOMEM);
 297         atomic_set(&policy->refcnt, 1);
 298         policy->mode = mode;
 299         policy->flags = flags;
 300
 301         return policy;
 302 }
 303
 304 /* Slow path of a mpol destructor. */
 305 void __mpol_put(struct mempolicy *p)
 306 {
 307         if (!atomic_dec_and_test(&p->refcnt))
 308                 return;
 309         kmem_cache_free(policy_cache, p);
 310 }
 311
 312 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 313                                 enum mpol_rebind_step step)
 314 {
 315 }
 316
 317 /*
 318  * step:
 319  *      MPOL_REBIND_ONCE  - do rebind work at once
 320  *      MPOL_REBIND_STEP1 - set all the newly nodes
 321  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 322  */
 323 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 324                                  enum mpol_rebind_step step)
 325 {
 326         nodemask_t tmp;
 327
 328         if (pol->flags & MPOL_F_STATIC_NODES)
 329                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 330         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 331                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 332         else {
 333                 /*
 334                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 335                  * result
 336                  */
 337                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 338                         nodes_remap(tmp, pol->v.nodes,
 339                                         pol->w.cpuset_mems_allowed, *nodes);
 340                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 341                 } else if (step == MPOL_REBIND_STEP2) {
 342                         tmp = pol->w.cpuset_mems_allowed;
 343                         pol->w.cpuset_mems_allowed = *nodes;
 344                 } else
 345                         BUG();
 346         }
 347
 348         if (nodes_empty(tmp))
 349                 tmp = *nodes;
 350
 351         if (step == MPOL_REBIND_STEP1)
 352                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 353         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 354                 pol->v.nodes = tmp;
 355         else
 356                 BUG();
 357
 358         if (!node_isset(current->il_next, tmp)) {
 359                 current->il_next = next_node(current->il_next, tmp);
 360                 if (current->il_next >= MAX_NUMNODES)
 361                         current->il_next = first_node(tmp);
 362                 if (current->il_next >= MAX_NUMNODES)
 363                         current->il_next = numa_node_id();
 364         }
 365 }
 366
 367 static void mpol_rebind_preferred(struct mempolicy *pol,
 368                                   const nodemask_t *nodes,
 369                                   enum mpol_rebind_step step)
 370 {
 371         nodemask_t tmp;
 372
 373         if (pol->flags & MPOL_F_STATIC_NODES) {
 374                 int node = first_node(pol->w.user_nodemask);
 375
 376                 if (node_isset(node, *nodes)) {
 377                         pol->v.preferred_node = node;
 378                         pol->flags &= ~MPOL_F_LOCAL;
 379                 } else
 380                         pol->flags |= MPOL_F_LOCAL;
 381         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 382                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 383                 pol->v.preferred_node = first_node(tmp);
 384         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 385                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 386                                                    pol->w.cpuset_mems_allowed,
 387                                                    *nodes);
 388                 pol->w.cpuset_mems_allowed = *nodes;
 389         }
 390 }
 391
 392 /*
 393  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 394  *
 395  * If read-side task has no lock to protect task->mempolicy, write-side
 396  * task will rebind the task->mempolicy by two step. The first step is
 397  * setting all the newly nodes, and the second step is cleaning all the
 398  * disallowed nodes. In this way, we can avoid finding no node to alloc
 399  * page.
 400  * If we have a lock to protect task->mempolicy in read-side, we do
 401  * rebind directly.
 402  *
 403  * step:
 404  *      MPOL_REBIND_ONCE  - do rebind work at once
 405  *      MPOL_REBIND_STEP1 - set all the newly nodes
 406  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 407  */
 408 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 409                                 enum mpol_rebind_step step)
 410 {
 411         if (!pol)
 412                 return;
 413         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 414             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 415                 return;
 416
 417         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 418                 return;
 419
 420         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 421                 BUG();
 422
 423         if (step == MPOL_REBIND_STEP1)
 424                 pol->flags |= MPOL_F_REBINDING;
 425         else if (step == MPOL_REBIND_STEP2)
 426                 pol->flags &= ~MPOL_F_REBINDING;
 427         else if (step >= MPOL_REBIND_NSTEP)
 428                 BUG();
 429
 430         mpol_ops[pol->mode].rebind(pol, newmask, step);
 431 }
 432
 433 /*
 434  * Wrapper for mpol_rebind_policy() that just requires task
 435  * pointer, and updates task mempolicy.
 436  *
 437  * Called with task's alloc_lock held.
 438  */
 439
 440 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 441                         enum mpol_rebind_step step)
 442 {
 443         mpol_rebind_policy(tsk->mempolicy, new, step);
 444 }
 445
 446 /*
 447  * Rebind each vma in mm to new nodemask.
 448  *
 449  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 450  */
 451
 452 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 453 {
 454         struct vm_area_struct *vma;
 455
 456         down_write(&mm->mmap_sem);
 457         for (vma = mm->mmap; vma; vma = vma->vm_next)
 458                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 459         up_write(&mm->mmap_sem);
 460 }
 461
 462 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 463         [MPOL_DEFAULT] = {
 464                 .rebind = mpol_rebind_default,
 465         },
 466         [MPOL_INTERLEAVE] = {
 467                 .create = mpol_new_interleave,
 468                 .rebind = mpol_rebind_nodemask,
 469         },
 470         [MPOL_PREFERRED] = {
 471                 .create = mpol_new_preferred,
 472                 .rebind = mpol_rebind_preferred,
 473         },
 474         [MPOL_BIND] = {
 475                 .create = mpol_new_bind,
 476                 .rebind = mpol_rebind_nodemask,
 477         },
 478 };
 479
 480 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 481                                 unsigned long flags);
 482
 483 /* Scan through pages checking if pages follow certain conditions. */
 484 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 485                 unsigned long addr, unsigned long end,
 486                 const nodemask_t *nodes, unsigned long flags,
 487                 void *private)
 488 {
 489         pte_t *orig_pte;
 490         pte_t *pte;
 491         spinlock_t *ptl;
 492
 493         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 494         do {
 495                 struct page *page;
 496                 int nid;
 497
 498                 if (!pte_present(*pte))
 499                         continue;
 500                 page = vm_normal_page(vma, addr, *pte);
 501                 if (!page)
 502                         continue;
 503                 /*
 504                  * vm_normal_page() filters out zero pages, but there might
 505                  * still be PageReserved pages to skip, perhaps in a VDSO.
 506                  * And we cannot move PageKsm pages sensibly or safely yet.
 507                  */
 508                 if (PageReserved(page) || PageKsm(page))
 509                         continue;
 510                 nid = page_to_nid(page);
 511                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 512                         continue;
 513
 514                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 515                         migrate_page_add(page, private, flags);
 516                 else
 517                         break;
 518         } while (pte++, addr += PAGE_SIZE, addr != end);
 519         pte_unmap_unlock(orig_pte, ptl);
 520         return addr != end;
 521 }
 522
 523 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 524                 unsigned long addr, unsigned long end,
 525                 const nodemask_t *nodes, unsigned long flags,
 526                 void *private)
 527 {
 528         pmd_t *pmd;
 529         unsigned long next;
 530
 531         pmd = pmd_offset(pud, addr);
 532         do {
 533                 next = pmd_addr_end(addr, end);
 534                 split_huge_page_pmd(vma->vm_mm, pmd);
 535                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 536                         continue;
 537                 if (check_pte_range(vma, pmd, addr, next, nodes,
 538                                     flags, private))
 539                         return -EIO;
 540         } while (pmd++, addr = next, addr != end);
 541         return 0;
 542 }
 543
 544 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 545                 unsigned long addr, unsigned long end,
 546                 const nodemask_t *nodes, unsigned long flags,
 547                 void *private)
 548 {
 549         pud_t *pud;
 550         unsigned long next;
 551
 552         pud = pud_offset(pgd, addr);
 553         do {
 554                 next = pud_addr_end(addr, end);
 555                 if (pud_none_or_clear_bad(pud))
 556                         continue;
 557                 if (check_pmd_range(vma, pud, addr, next, nodes,
 558                                     flags, private))
 559                         return -EIO;
 560         } while (pud++, addr = next, addr != end);
 561         return 0;
 562 }
 563
 564 static inline int check_pgd_range(struct vm_area_struct *vma,
 565                 unsigned long addr, unsigned long end,
 566                 const nodemask_t *nodes, unsigned long flags,
 567                 void *private)
 568 {
 569         pgd_t *pgd;
 570         unsigned long next;
 571
 572         pgd = pgd_offset(vma->vm_mm, addr);
 573         do {
 574                 next = pgd_addr_end(addr, end);
 575                 if (pgd_none_or_clear_bad(pgd))
 576                         continue;
 577                 if (check_pud_range(vma, pgd, addr, next, nodes,
 578                                     flags, private))
 579                         return -EIO;
 580         } while (pgd++, addr = next, addr != end);
 581         return 0;
 582 }
 583
 584 static void
 585 change_prot_none(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 586 {
 587         change_protection(vma, start, end, vma_prot_none(vma), 0);
 588 }
 589
 590 /*
 591  * Check if all pages in a range are on a set of nodes.
 592  * If pagelist != NULL then isolate pages from the LRU and
 593  * put them on the pagelist.
 594  */
 595 static struct vm_area_struct *
 596 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 597                 const nodemask_t *nodes, unsigned long flags, void *private)
 598 {
 599         int err;
 600         struct vm_area_struct *first, *vma, *prev;
 601
 602
 603         first = find_vma(mm, start);
 604         if (!first)
 605                 return ERR_PTR(-EFAULT);
 606         prev = NULL;
 607         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 608                 unsigned long endvma = vma->vm_end;
 609
 610                 if (endvma > end)
 611                         endvma = end;
 612                 if (vma->vm_start > start)
 613                         start = vma->vm_start;
 614
 615                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 616                         if (!vma->vm_next && vma->vm_end < end)
 617                                 return ERR_PTR(-EFAULT);
 618                         if (prev && prev->vm_end < vma->vm_start)
 619                                 return ERR_PTR(-EFAULT);
 620                 }
 621
 622                 if (is_vm_hugetlb_page(vma))
 623                         goto next;
 624
 625                 if (flags & MPOL_MF_LAZY) {
 626                         change_prot_none(vma, start, endvma);
 627                         goto next;
 628                 }
 629
 630                 if ((flags & MPOL_MF_STRICT) ||
 631                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 632                       vma_migratable(vma))) {
 633
 634                         err = check_pgd_range(vma, start, endvma, nodes,
 635                                                 flags, private);
 636                         if (err) {
 637                                 first = ERR_PTR(err);
 638                                 break;
 639                         }
 640                 }
 641 next:
 642                 prev = vma;
 643         }
 644         return first;
 645 }
 646
 647 /*
 648  * Apply policy to a single VMA
 649  * This must be called with the mmap_sem held for writing.
 650  */
 651 static int vma_replace_policy(struct vm_area_struct *vma,
 652                                                 struct mempolicy *pol)
 653 {
 654         int err;
 655         struct mempolicy *old;
 656         struct mempolicy *new;
 657
 658         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 659                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 660                  vma->vm_ops, vma->vm_file,
 661                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 662
 663         new = mpol_dup(pol);
 664         if (IS_ERR(new))
 665                 return PTR_ERR(new);
 666
 667         if (vma->vm_ops && vma->vm_ops->set_policy) {
 668                 err = vma->vm_ops->set_policy(vma, new);
 669                 if (err)
 670                         goto err_out;
 671         }
 672
 673         old = vma->vm_policy;
 674         vma->vm_policy = new; /* protected by mmap_sem */
 675         mpol_put(old);
 676
 677         return 0;
 678  err_out:
 679         mpol_put(new);
 680         return err;
 681 }
 682
 683 /* Step 2: apply policy to a range and do splits. */
 684 static int mbind_range(struct mm_struct *mm, unsigned long start,
 685                        unsigned long end, struct mempolicy *new_pol)
 686 {
 687         struct vm_area_struct *next;
 688         struct vm_area_struct *prev;
 689         struct vm_area_struct *vma;
 690         int err = 0;
 691         pgoff_t pgoff;
 692         unsigned long vmstart;
 693         unsigned long vmend;
 694
 695         vma = find_vma(mm, start);
 696         if (!vma || vma->vm_start > start)
 697                 return -EFAULT;
 698
 699         prev = vma->vm_prev;
 700         if (start > vma->vm_start)
 701                 prev = vma;
 702
 703         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 704                 next = vma->vm_next;
 705                 vmstart = max(start, vma->vm_start);
 706                 vmend   = min(end, vma->vm_end);
 707
 708                 if (mpol_equal(vma_policy(vma), new_pol))
 709                         continue;
 710
 711                 pgoff = vma->vm_pgoff +
 712                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 713                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 714                                   vma->anon_vma, vma->vm_file, pgoff,
 715                                   new_pol);
 716                 if (prev) {
 717                         vma = prev;
 718                         next = vma->vm_next;
 719                         continue;
 720                 }
 721                 if (vma->vm_start != vmstart) {
 722                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 723                         if (err)
 724                                 goto out;
 725                 }
 726                 if (vma->vm_end != vmend) {
 727                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 728                         if (err)
 729                                 goto out;
 730                 }
 731                 err = vma_replace_policy(vma, new_pol);
 732                 if (err)
 733                         goto out;
 734         }
 735
 736  out:
 737         return err;
 738 }
 739
 740 /*
 741  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 742  * mempolicy.  Allows more rapid checking of this (combined perhaps
 743  * with other PF_* flag bits) on memory allocation hot code paths.
 744  *
 745  * If called from outside this file, the task 'p' should -only- be
 746  * a newly forked child not yet visible on the task list, because
 747  * manipulating the task flags of a visible task is not safe.
 748  *
 749  * The above limitation is why this routine has the funny name
 750  * mpol_fix_fork_child_flag().
 751  *
 752  * It is also safe to call this with a task pointer of current,
 753  * which the static wrapper mpol_set_task_struct_flag() does,
 754  * for use within this file.
 755  */
 756
 757 void mpol_fix_fork_child_flag(struct task_struct *p)
 758 {
 759         if (p->mempolicy)
 760                 p->flags |= PF_MEMPOLICY;
 761         else
 762                 p->flags &= ~PF_MEMPOLICY;
 763 }
 764
 765 static void mpol_set_task_struct_flag(void)
 766 {
 767         mpol_fix_fork_child_flag(current);
 768 }
 769
 770 /* Set the process memory policy */
 771 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 772                              nodemask_t *nodes)
 773 {
 774         struct mempolicy *new, *old;
 775         struct mm_struct *mm = current->mm;
 776         NODEMASK_SCRATCH(scratch);
 777         int ret;
 778
 779         if (!scratch)
 780                 return -ENOMEM;
 781
 782         new = mpol_new(mode, flags, nodes);
 783         if (IS_ERR(new)) {
 784                 ret = PTR_ERR(new);
 785                 goto out;
 786         }
 787         /*
 788          * prevent changing our mempolicy while show_numa_maps()
 789          * is using it.
 790          * Note:  do_set_mempolicy() can be called at init time
 791          * with no 'mm'.
 792          */
 793         if (mm)
 794                 down_write(&mm->mmap_sem);
 795         task_lock(current);
 796         ret = mpol_set_nodemask(new, nodes, scratch);
 797         if (ret) {
 798                 task_unlock(current);
 799                 if (mm)
 800                         up_write(&mm->mmap_sem);
 801                 mpol_put(new);
 802                 goto out;
 803         }
 804         old = current->mempolicy;
 805         current->mempolicy = new;
 806         mpol_set_task_struct_flag();
 807         if (new && new->mode == MPOL_INTERLEAVE &&
 808             nodes_weight(new->v.nodes))
 809                 current->il_next = first_node(new->v.nodes);
 810         task_unlock(current);
 811         if (mm)
 812                 up_write(&mm->mmap_sem);
 813
 814         mpol_put(old);
 815         ret = 0;
 816 out:
 817         NODEMASK_SCRATCH_FREE(scratch);
 818         return ret;
 819 }
 820
 821 /*
 822  * Return nodemask for policy for get_mempolicy() query
 823  *
 824  * Called with task's alloc_lock held
 825  */
 826 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 827 {
 828         nodes_clear(*nodes);
 829         if (p == &default_policy)
 830                 return;
 831
 832         switch (p->mode) {
 833         case MPOL_BIND:
 834                 /* Fall through */
 835         case MPOL_INTERLEAVE:
 836                 *nodes = p->v.nodes;
 837                 break;
 838         case MPOL_PREFERRED:
 839                 if (!(p->flags & MPOL_F_LOCAL))
 840                         node_set(p->v.preferred_node, *nodes);
 841                 /* else return empty node mask for local allocation */
 842                 break;
 843         default:
 844                 BUG();
 845         }
 846 }
 847
 848 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 849 {
 850         struct page *p;
 851         int err;
 852
 853         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 854         if (err >= 0) {
 855                 err = page_to_nid(p);
 856                 put_page(p);
 857         }
 858         return err;
 859 }
 860
 861 /* Retrieve NUMA policy */
 862 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 863                              unsigned long addr, unsigned long flags)
 864 {
 865         int err;
 866         struct mm_struct *mm = current->mm;
 867         struct vm_area_struct *vma = NULL;
 868         struct mempolicy *pol = current->mempolicy;
 869
 870         if (flags &
 871                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 872                 return -EINVAL;
 873
 874         if (flags & MPOL_F_MEMS_ALLOWED) {
 875                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 876                         return -EINVAL;
 877                 *policy = 0;    /* just so it's initialized */
 878                 task_lock(current);
 879                 *nmask  = cpuset_current_mems_allowed;
 880                 task_unlock(current);
 881                 return 0;
 882         }
 883
 884         if (flags & MPOL_F_ADDR) {
 885                 /*
 886                  * Do NOT fall back to task policy if the
 887                  * vma/shared policy at addr is NULL.  We
 888                  * want to return MPOL_DEFAULT in this case.
 889                  */
 890                 down_read(&mm->mmap_sem);
 891                 vma = find_vma_intersection(mm, addr, addr+1);
 892                 if (!vma) {
 893                         up_read(&mm->mmap_sem);
 894                         return -EFAULT;
 895                 }
 896                 if (vma->vm_ops && vma->vm_ops->get_policy)
 897                         pol = vma->vm_ops->get_policy(vma, addr);
 898                 else
 899                         pol = vma->vm_policy;
 900         } else if (addr)
 901                 return -EINVAL;
 902
 903         if (!pol)
 904                 pol = &default_policy;  /* indicates default behavior */
 905
 906         if (flags & MPOL_F_NODE) {
 907                 if (flags & MPOL_F_ADDR) {
 908                         err = lookup_node(mm, addr);
 909                         if (err < 0)
 910                                 goto out;
 911                         *policy = err;
 912                 } else if (pol == current->mempolicy &&
 913                                 pol->mode == MPOL_INTERLEAVE) {
 914                         *policy = current->il_next;
 915                 } else {
 916                         err = -EINVAL;
 917                         goto out;
 918                 }
 919         } else {
 920                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 921                                                 pol->mode;
 922                 /*
 923                  * Internal mempolicy flags must be masked off before exposing
 924                  * the policy to userspace.
 925                  */
 926                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 927         }
 928
 929         if (vma) {
 930                 up_read(&current->mm->mmap_sem);
 931                 vma = NULL;
 932         }
 933
 934         err = 0;
 935         if (nmask) {
 936                 if (mpol_store_user_nodemask(pol)) {
 937                         *nmask = pol->w.user_nodemask;
 938                 } else {
 939                         task_lock(current);
 940                         get_policy_nodemask(pol, nmask);
 941                         task_unlock(current);
 942                 }
 943         }
 944
 945  out:
 946         mpol_cond_put(pol);
 947         if (vma)
 948                 up_read(&current->mm->mmap_sem);
 949         return err;
 950 }
 951
 952 #ifdef CONFIG_MIGRATION
 953 /*
 954  * page migration
 955  */
 956 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 957                                 unsigned long flags)
 958 {
 959         /*
 960          * Avoid migrating a page that is shared with others.
 961          */
 962         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 963                 if (!isolate_lru_page(page)) {
 964                         list_add_tail(&page->lru, pagelist);
 965                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 966                                             page_is_file_cache(page));
 967                 }
 968         }
 969 }
 970
 971 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 972 {
 973         return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 974 }
 975
 976 /*
 977  * Migrate pages from one node to a target node.
 978  * Returns error or the number of pages not migrated.
 979  */
 980 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 981                            int flags)
 982 {
 983         nodemask_t nmask;
 984         LIST_HEAD(pagelist);
 985         int err = 0;
 986
 987         nodes_clear(nmask);
 988         node_set(source, nmask);
 989
 990         /*
 991          * This does not "check" the range but isolates all pages that
 992          * need migration.  Between passing in the full user address
 993          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
 994          */
 995         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
 996         check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
 997                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 998
 999         if (!list_empty(&pagelist)) {
1000                 err = migrate_pages(&pagelist, new_node_page, dest,
1001                                                         false, MIGRATE_SYNC);
1002                 if (err)
1003                         putback_lru_pages(&pagelist);
1004         }
1005
1006         return err;
1007 }
1008
1009 /*
1010  * Move pages between the two nodesets so as to preserve the physical
1011  * layout as much as possible.
1012  *
1013  * Returns the number of page that could not be moved.
1014  */
1015 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1016                      const nodemask_t *to, int flags)
1017 {
1018         int busy = 0;
1019         int err;
1020         nodemask_t tmp;
1021
1022         err = migrate_prep();
1023         if (err)
1024                 return err;
1025
1026         down_read(&mm->mmap_sem);
1027
1028         err = migrate_vmas(mm, from, to, flags);
1029         if (err)
1030                 goto out;
1031
1032         /*
1033          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1034          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1035          * bit in 'tmp', and return that <source, dest> pair for migration.
1036          * The pair of nodemasks 'to' and 'from' define the map.
1037          *
1038          * If no pair of bits is found that way, fallback to picking some
1039          * pair of 'source' and 'dest' bits that are not the same.  If the
1040          * 'source' and 'dest' bits are the same, this represents a node
1041          * that will be migrating to itself, so no pages need move.
1042          *
1043          * If no bits are left in 'tmp', or if all remaining bits left
1044          * in 'tmp' correspond to the same bit in 'to', return false
1045          * (nothing left to migrate).
1046          *
1047          * This lets us pick a pair of nodes to migrate between, such that
1048          * if possible the dest node is not already occupied by some other
1049          * source node, minimizing the risk of overloading the memory on a
1050          * node that would happen if we migrated incoming memory to a node
1051          * before migrating outgoing memory source that same node.
1052          *
1053          * A single scan of tmp is sufficient.  As we go, we remember the
1054          * most recent <s, d> pair that moved (s != d).  If we find a pair
1055          * that not only moved, but what's better, moved to an empty slot
1056          * (d is not set in tmp), then we break out then, with that pair.
1057          * Otherwise when we finish scanning from_tmp, we at least have the
1058          * most recent <s, d> pair that moved.  If we get all the way through
1059          * the scan of tmp without finding any node that moved, much less
1060          * moved to an empty node, then there is nothing left worth migrating.
1061          */
1062
1063         tmp = *from;
1064         while (!nodes_empty(tmp)) {
1065                 int s,d;
1066                 int source = -1;
1067                 int dest = 0;
1068
1069                 for_each_node_mask(s, tmp) {
1070
1071                         /*
1072                          * do_migrate_pages() tries to maintain the relative
1073                          * node relationship of the pages established between
1074                          * threads and memory areas.
1075                          *
1076                          * However if the number of source nodes is not equal to
1077                          * the number of destination nodes we can not preserve
1078                          * this node relative relationship.  In that case, skip
1079                          * copying memory from a node that is in the destination
1080                          * mask.
1081                          *
1082                          * Example: [2,3,4] -> [3,4,5] moves everything.
1083                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1084                          */
1085
1086                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1087                                                 (node_isset(s, *to)))
1088                                 continue;
1089
1090                         d = node_remap(s, *from, *to);
1091                         if (s == d)
1092                                 continue;
1093
1094                         source = s;     /* Node moved. Memorize */
1095                         dest = d;
1096
1097                         /* dest not in remaining from nodes? */
1098                         if (!node_isset(dest, tmp))
1099                                 break;
1100                 }
1101                 if (source == -1)
1102                         break;
1103
1104                 node_clear(source, tmp);
1105                 err = migrate_to_node(mm, source, dest, flags);
1106                 if (err > 0)
1107                         busy += err;
1108                 if (err < 0)
1109                         break;
1110         }
1111 out:
1112         up_read(&mm->mmap_sem);
1113         if (err < 0)
1114                 return err;
1115         return busy;
1116
1117 }
1118
1119 /*
1120  * Allocate a new page for page migration based on vma policy.
1121  * Start assuming that page is mapped by vma pointed to by @private.
1122  * Search forward from there, if not.  N.B., this assumes that the
1123  * list of pages handed to migrate_pages()--which is how we get here--
1124  * is in virtual address order.
1125  */
1126 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1127 {
1128         struct vm_area_struct *vma = (struct vm_area_struct *)private;
1129         unsigned long uninitialized_var(address);
1130
1131         while (vma) {
1132                 address = page_address_in_vma(page, vma);
1133                 if (address != -EFAULT)
1134                         break;
1135                 vma = vma->vm_next;
1136         }
1137
1138         /*
1139          * if !vma, alloc_page_vma() will use task or system default policy
1140          */
1141         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1142 }
1143 #else
1144
/* Stub used when CONFIG_MIGRATION is not set: no page is ever queued. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
        /* intentionally empty */
}
1149
1150 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1151                      const nodemask_t *to, int flags)
1152 {
1153         return -ENOSYS;
1154 }
1155
1156 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1157 {
1158         return NULL;
1159 }
1160 #endif
1161
1162 static long do_mbind(unsigned long start, unsigned long len,
1163                      unsigned short mode, unsigned short mode_flags,
1164                      nodemask_t *nmask, unsigned long flags)
1165 {
1166         struct vm_area_struct *vma;
1167         struct mm_struct *mm = current->mm;
1168         struct mempolicy *new;
1169         unsigned long end;
1170         int err;
1171         LIST_HEAD(pagelist);
1172
1173         if (flags & ~(unsigned long)MPOL_MF_VALID)
1174                 return -EINVAL;
1175         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1176                 return -EPERM;
1177
1178         if (start & ~PAGE_MASK)
1179                 return -EINVAL;
1180
1181         if (mode == MPOL_DEFAULT || mode == MPOL_NOOP)
1182                 flags &= ~MPOL_MF_STRICT;
1183
1184         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1185         end = start + len;
1186
1187         if (end < start)
1188                 return -EINVAL;
1189         if (end == start)
1190                 return 0;
1191
1192         new = mpol_new(mode, mode_flags, nmask);
1193         if (IS_ERR(new))
1194                 return PTR_ERR(new);
1195
1196         if (flags & MPOL_MF_LAZY)
1197                 new->flags |= MPOL_F_MOF;
1198
1199         /*
1200          * If we are using the default policy then operation
1201          * on discontinuous address spaces is okay after all
1202          */
1203         if (!new)
1204                 flags |= MPOL_MF_DISCONTIG_OK;
1205
1206         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1207                  start, start + len, mode, mode_flags,
1208                  nmask ? nodes_addr(*nmask)[0] : -1);
1209
1210         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1211
1212                 err = migrate_prep();
1213                 if (err)
1214                         goto mpol_out;
1215         }
1216         {
1217                 NODEMASK_SCRATCH(scratch);
1218                 if (scratch) {
1219                         down_write(&mm->mmap_sem);
1220                         task_lock(current);
1221                         err = mpol_set_nodemask(new, nmask, scratch);
1222                         task_unlock(current);
1223                         if (err)
1224                                 up_write(&mm->mmap_sem);
1225                 } else
1226                         err = -ENOMEM;
1227                 NODEMASK_SCRATCH_FREE(scratch);
1228         }
1229         if (err)
1230                 goto mpol_out;
1231
1232         vma = check_range(mm, start, end, nmask,
1233                           flags | MPOL_MF_INVERT, &pagelist);
1234
1235         err = PTR_ERR(vma);     /* maybe ... */
1236         if (!IS_ERR(vma) && mode != MPOL_NOOP)
1237                 err = mbind_range(mm, start, end, new);
1238
1239         if (!err) {
1240                 int nr_failed = 0;
1241
1242                 if (!list_empty(&pagelist)) {
1243                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1244                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1245                                                   (unsigned long)vma,
1246                                                   false, MIGRATE_SYNC);
1247                         if (nr_failed)
1248                                 putback_lru_pages(&pagelist);
1249                 }
1250
1251                 if (nr_failed && (flags & MPOL_MF_STRICT))
1252                         err = -EIO;
1253         } else
1254                 putback_lru_pages(&pagelist);
1255
1256         up_write(&mm->mmap_sem);
1257  mpol_out:
1258         mpol_put(new);
1259         return err;
1260 }
1261
1262 static void lazy_migrate_vma(struct vm_area_struct *vma)
1263 {
1264         if (!vma_migratable(vma))
1265                 return;
1266
1267         change_prot_none(vma, vma->vm_start, vma->vm_end);
1268 }
1269
1270 void lazy_migrate_process(struct mm_struct *mm)
1271 {
1272         struct vm_area_struct *vma;
1273
1274         down_read(&mm->mmap_sem);
1275         for (vma = mm->mmap; vma; vma = vma->vm_next)
1276                 lazy_migrate_vma(vma);
1277         up_read(&mm->mmap_sem);
1278 }
1279
1280 /*
1281  * User space interface with variable sized bitmaps for nodelists.
1282  */
1283
1284 /* Copy a node mask from user space. */
1285 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1286                      unsigned long maxnode)
1287 {
1288         unsigned long k;
1289         unsigned long nlongs;
1290         unsigned long endmask;
1291
1292         --maxnode;
1293         nodes_clear(*nodes);
1294         if (maxnode == 0 || !nmask)
1295                 return 0;
1296         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1297                 return -EINVAL;
1298
1299         nlongs = BITS_TO_LONGS(maxnode);
1300         if ((maxnode % BITS_PER_LONG) == 0)
1301                 endmask = ~0UL;
1302         else
1303                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1304
1305         /* When the user specified more nodes than supported just check
1306            if the non supported part is all zero. */
1307         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1308                 if (nlongs > PAGE_SIZE/sizeof(long))
1309                         return -EINVAL;
1310                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1311                         unsigned long t;
1312                         if (get_user(t, nmask + k))
1313                                 return -EFAULT;
1314                         if (k == nlongs - 1) {
1315                                 if (t & endmask)
1316                                         return -EINVAL;
1317                         } else if (t)
1318                                 return -EINVAL;
1319                 }
1320                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1321                 endmask = ~0UL;
1322         }
1323
1324         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1325                 return -EFAULT;
1326         nodes_addr(*nodes)[nlongs-1] &= endmask;
1327         return 0;
1328 }
1329
1330 /* Copy a kernel node mask to user space */
1331 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1332                               nodemask_t *nodes)
1333 {
1334         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1335         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1336
1337         if (copy > nbytes) {
1338                 if (copy > PAGE_SIZE)
1339                         return -EINVAL;
1340                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1341                         return -EFAULT;
1342                 copy = nbytes;
1343         }
1344         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1345 }
1346
1347 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1348                 unsigned long, mode, unsigned long __user *, nmask,
1349                 unsigned long, maxnode, unsigned, flags)
1350 {
1351         nodemask_t nodes;
1352         int err;
1353         unsigned short mode_flags;
1354
1355         mode_flags = mode & MPOL_MODE_FLAGS;
1356         mode &= ~MPOL_MODE_FLAGS;
1357         if (mode >= MPOL_MAX)
1358                 return -EINVAL;
1359         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1360             (mode_flags & MPOL_F_RELATIVE_NODES))
1361                 return -EINVAL;
1362         err = get_nodes(&nodes, nmask, maxnode);
1363         if (err)
1364                 return err;
1365         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1366 }
1367
1368 /* Set the process memory policy */
1369 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1370                 unsigned long, maxnode)
1371 {
1372         int err;
1373         nodemask_t nodes;
1374         unsigned short flags;
1375
1376         flags = mode & MPOL_MODE_FLAGS;
1377         mode &= ~MPOL_MODE_FLAGS;
1378         if ((unsigned int)mode >= MPOL_MAX)
1379                 return -EINVAL;
1380         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1381                 return -EINVAL;
1382         err = get_nodes(&nodes, nmask, maxnode);
1383         if (err)
1384                 return err;
1385         return do_set_mempolicy(mode, flags, &nodes);
1386 }
1387
1388 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1389                 const unsigned long __user *, old_nodes,
1390                 const unsigned long __user *, new_nodes)
1391 {
1392         const struct cred *cred = current_cred(), *tcred;
1393         struct mm_struct *mm = NULL;
1394         struct task_struct *task;
1395         nodemask_t task_nodes;
1396         int err;
1397         nodemask_t *old;
1398         nodemask_t *new;
1399         NODEMASK_SCRATCH(scratch);
1400
1401         if (!scratch)
1402                 return -ENOMEM;
1403
1404         old = &scratch->mask1;
1405         new = &scratch->mask2;
1406
1407         err = get_nodes(old, old_nodes, maxnode);
1408         if (err)
1409                 goto out;
1410
1411         err = get_nodes(new, new_nodes, maxnode);
1412         if (err)
1413                 goto out;
1414
1415         /* Find the mm_struct */
1416         rcu_read_lock();
1417         task = pid ? find_task_by_vpid(pid) : current;
1418         if (!task) {
1419                 rcu_read_unlock();
1420                 err = -ESRCH;
1421                 goto out;
1422         }
1423         get_task_struct(task);
1424
1425         err = -EINVAL;
1426
1427         /*
1428          * Check if this process has the right to modify the specified
1429          * process. The right exists if the process has administrative
1430          * capabilities, superuser privileges or the same
1431          * userid as the target process.
1432          */
1433         tcred = __task_cred(task);
1434         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1435             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1436             !capable(CAP_SYS_NICE)) {
1437                 rcu_read_unlock();
1438                 err = -EPERM;
1439                 goto out_put;
1440         }
1441         rcu_read_unlock();
1442
1443         task_nodes = cpuset_mems_allowed(task);
1444         /* Is the user allowed to access the target nodes? */
1445         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1446                 err = -EPERM;
1447                 goto out_put;
1448         }
1449
1450         if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1451                 err = -EINVAL;
1452                 goto out_put;
1453         }
1454
1455         err = security_task_movememory(task);
1456         if (err)
1457                 goto out_put;
1458
1459         mm = get_task_mm(task);
1460         put_task_struct(task);
1461
1462         if (!mm) {
1463                 err = -EINVAL;
1464                 goto out;
1465         }
1466
1467         err = do_migrate_pages(mm, old, new,
1468                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1469
1470         mmput(mm);
1471 out:
1472         NODEMASK_SCRATCH_FREE(scratch);
1473
1474         return err;
1475
1476 out_put:
1477         put_task_struct(task);
1478         goto out;
1479
1480 }
1481
1482
1483 /* Retrieve NUMA policy */
1484 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1485                 unsigned long __user *, nmask, unsigned long, maxnode,
1486                 unsigned long, addr, unsigned long, flags)
1487 {
1488         int err;
1489         int uninitialized_var(pval);
1490         nodemask_t nodes;
1491
1492         if (nmask != NULL && maxnode < MAX_NUMNODES)
1493                 return -EINVAL;
1494
1495         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1496
1497         if (err)
1498                 return err;
1499
1500         if (policy && put_user(pval, policy))
1501                 return -EFAULT;
1502
1503         if (nmask)
1504                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1505
1506         return err;
1507 }
1508
1509 #ifdef CONFIG_COMPAT
1510
1511 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1512                                      compat_ulong_t __user *nmask,
1513                                      compat_ulong_t maxnode,
1514                                      compat_ulong_t addr, compat_ulong_t flags)
1515 {
1516         long err;
1517         unsigned long __user *nm = NULL;
1518         unsigned long nr_bits, alloc_size;
1519         DECLARE_BITMAP(bm, MAX_NUMNODES);
1520
1521         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1522         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1523
1524         if (nmask)
1525                 nm = compat_alloc_user_space(alloc_size);
1526
1527         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1528
1529         if (!err && nmask) {
1530                 unsigned long copy_size;
1531                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1532                 err = copy_from_user(bm, nm, copy_size);
1533                 /* ensure entire bitmap is zeroed */
1534                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1535                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1536         }
1537
1538         return err;
1539 }
1540
1541 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1542                                      compat_ulong_t maxnode)
1543 {
1544         long err = 0;
1545         unsigned long __user *nm = NULL;
1546         unsigned long nr_bits, alloc_size;
1547         DECLARE_BITMAP(bm, MAX_NUMNODES);
1548
1549         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1550         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1551
1552         if (nmask) {
1553                 err = compat_get_bitmap(bm, nmask, nr_bits);
1554                 nm = compat_alloc_user_space(alloc_size);
1555                 err |= copy_to_user(nm, bm, alloc_size);
1556         }
1557
1558         if (err)
1559                 return -EFAULT;
1560
1561         return sys_set_mempolicy(mode, nm, nr_bits+1);
1562 }
1563
1564 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1565                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1566                              compat_ulong_t maxnode, compat_ulong_t flags)
1567 {
1568         long err = 0;
1569         unsigned long __user *nm = NULL;
1570         unsigned long nr_bits, alloc_size;
1571         nodemask_t bm;
1572
1573         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1574         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1575
1576         if (nmask) {
1577                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1578                 nm = compat_alloc_user_space(alloc_size);
1579                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1580         }
1581
1582         if (err)
1583                 return -EFAULT;
1584
1585         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1586 }
1587
1588 #endif
1589
1590 /*
1591  * get_vma_policy(@task, @vma, @addr)
1592  * @task - task for fallback if vma policy == default
1593  * @vma   - virtual memory area whose policy is sought
1594  * @addr  - address in @vma for shared policy lookup
1595  *
1596  * Returns effective policy for a VMA at specified address.
1597  * Falls back to @task or system default policy, as necessary.
1598  * Current or other task's task mempolicy and non-shared vma policies must be
1599  * protected by task_lock(task) by the caller.
1600  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1601  * count--added by the get_policy() vm_op, as appropriate--to protect against
1602  * freeing by another task.  It is the caller's responsibility to free the
1603  * extra reference for shared policies.
1604  */
1605 struct mempolicy *get_vma_policy(struct task_struct *task,
1606                 struct vm_area_struct *vma, unsigned long addr)
1607 {
1608         struct mempolicy *pol = get_task_policy(task);
1609
1610         if (vma) {
1611                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1612                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1613                                                                         addr);
1614                         if (vpol)
1615                                 pol = vpol;
1616                 } else if (vma->vm_policy) {
1617                         pol = vma->vm_policy;
1618
1619                         /*
1620                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1621                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1622                          * count on these policies which will be dropped by
1623                          * mpol_cond_put() later
1624                          */
1625                         if (mpol_needs_cond_ref(pol))
1626                                 mpol_get(pol);
1627                 }
1628         }
1629         if (!pol)
1630                 pol = &default_policy;
1631         return pol;
1632 }
1633
1634 /*
1635  * Return a nodemask representing a mempolicy for filtering nodes for
1636  * page allocation
1637  */
1638 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1639 {
1640         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1641         if (unlikely(policy->mode == MPOL_BIND) &&
1642                         gfp_zone(gfp) >= policy_zone &&
1643                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1644                 return &policy->v.nodes;
1645
1646         return NULL;
1647 }
1648
1649 /* Do dynamic interleaving for a process */
1650 static unsigned interleave_nodes(struct mempolicy *policy)
1651 {
1652         unsigned nid, next;
1653         struct task_struct *me = current;
1654
1655         nid = me->il_next;
1656         next = next_node(nid, policy->v.nodes);
1657         if (next >= MAX_NUMNODES)
1658                 next = first_node(policy->v.nodes);
1659         if (next < MAX_NUMNODES)
1660                 me->il_next = next;
1661         return nid;
1662 }
1663
1664 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1665 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1666         int nd)
1667 {
1668         switch (policy->mode) {
1669         case MPOL_INTERLEAVE:
1670                 nd = interleave_nodes(policy);
1671                 break;
1672         case MPOL_PREFERRED:
1673                 if (!(policy->flags & MPOL_F_LOCAL))
1674                         nd = policy->v.preferred_node;
1675                 break;
1676         case MPOL_BIND:
1677                 /*
1678                  * Normally, MPOL_BIND allocations are node-local within the
1679                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1680                  * current node isn't part of the mask, we use the zonelist for
1681                  * the first node in the mask instead.
1682                  */
1683                 if (unlikely(gfp & __GFP_THISNODE) &&
1684                                 unlikely(!node_isset(nd, policy->v.nodes)))
1685                         nd = first_node(policy->v.nodes);
1686                 break;
1687         default:
1688                 BUG();
1689         }
1690         return node_zonelist(nd, gfp);
1691 }
1692
1693 /*
1694  * Depending on the memory policy provide a node from which to allocate the
1695  * next slab entry.
1696  * @policy must be protected by freeing by the caller.  If @policy is
1697  * the current task's mempolicy, this protection is implicit, as only the
1698  * task can change it's policy.  The system default policy requires no
1699  * such protection.
1700  */
1701 unsigned slab_node(void)
1702 {
1703         struct mempolicy *policy;
1704
1705         if (in_interrupt())
1706                 return numa_node_id();
1707
1708         policy = current->mempolicy;
1709         if (!policy || policy->flags & MPOL_F_LOCAL)
1710                 return numa_node_id();
1711
1712         switch (policy->mode) {
1713         case MPOL_PREFERRED:
1714                 /*
1715                  * handled MPOL_F_LOCAL above
1716                  */
1717                 return policy->v.preferred_node;
1718
1719         case MPOL_INTERLEAVE:
1720                 return interleave_nodes(policy);
1721
1722         case MPOL_BIND: {
1723                 /*
1724                  * Follow bind policy behavior and start allocation at the
1725                  * first node.
1726                  */
1727                 struct zonelist *zonelist;
1728                 struct zone *zone;
1729                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1730                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1731                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1732                                                         &policy->v.nodes,
1733                                                         &zone);
1734                 return zone ? zone->node : numa_node_id();
1735         }
1736
1737         default:
1738                 BUG();
1739         }
1740 }
1741
1742 /* Do static interleaving for a VMA with known offset. */
1743 static unsigned offset_il_node(struct mempolicy *pol,
1744                 struct vm_area_struct *vma, unsigned long off)
1745 {
1746         unsigned nnodes = nodes_weight(pol->v.nodes);
1747         unsigned target;
1748         int c;
1749         int nid = -1;
1750
1751         if (!nnodes)
1752                 return numa_node_id();
1753         target = (unsigned int)off % nnodes;
1754         c = 0;
1755         do {
1756                 nid = next_node(nid, pol->v.nodes);
1757                 c++;
1758         } while (c <= target);
1759         return nid;
1760 }
1761
1762 /* Determine a node number for interleave */
1763 static inline unsigned interleave_nid(struct mempolicy *pol,
1764                  struct vm_area_struct *vma, unsigned long addr, int shift)
1765 {
1766         if (vma) {
1767                 unsigned long off;
1768
1769                 /*
1770                  * for small pages, there is no difference between
1771                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1772                  * for huge pages, since vm_pgoff is in units of small
1773                  * pages, we need to shift off the always 0 bits to get
1774                  * a useful offset.
1775                  */
1776                 BUG_ON(shift < PAGE_SHIFT);
1777                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1778                 off += (addr - vma->vm_start) >> shift;
1779                 return offset_il_node(pol, vma, off);
1780         } else
1781                 return interleave_nodes(pol);
1782 }
1783
1784 /*
1785  * Return the bit number of a random bit set in the nodemask.
1786  * (returns -1 if nodemask is empty)
1787  */
1788 int node_random(const nodemask_t *maskp)
1789 {
1790         int w, bit = -1;
1791
1792         w = nodes_weight(*maskp);
1793         if (w)
1794                 bit = bitmap_ord_to_pos(maskp->bits,
1795                         get_random_int() % w, MAX_NUMNODES);
1796         return bit;
1797 }
1798
1799 #ifdef CONFIG_HUGETLBFS
1800 /*
1801  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1802  * @vma = virtual memory area whose policy is sought
1803  * @addr = address in @vma for shared policy lookup and interleave policy
1804  * @gfp_flags = for requested zone
1805  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1806  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1807  *
1808  * Returns a zonelist suitable for a huge page allocation and a pointer
1809  * to the struct mempolicy for conditional unref after allocation.
1810  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1811  * @nodemask for filtering the zonelist.
1812  *
1813  * Must be protected by get_mems_allowed()
1814  */
1815 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1816                                 gfp_t gfp_flags, struct mempolicy **mpol,
1817                                 nodemask_t **nodemask)
1818 {
1819         struct zonelist *zl;
1820
1821         *mpol = get_vma_policy(current, vma, addr);
1822         *nodemask = NULL;       /* assume !MPOL_BIND */
1823
1824         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1825                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1826                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1827         } else {
1828                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1829                 if ((*mpol)->mode == MPOL_BIND)
1830                         *nodemask = &(*mpol)->v.nodes;
1831         }
1832         return zl;
1833 }
1834
1835 /*
1836  * init_nodemask_of_mempolicy
1837  *
1838  * If the current task's mempolicy is "default" [NULL], return 'false'
1839  * to indicate default policy.  Otherwise, extract the policy nodemask
1840  * for 'bind' or 'interleave' policy into the argument nodemask, or
1841  * initialize the argument nodemask to contain the single node for
1842  * 'preferred' or 'local' policy and return 'true' to indicate presence
1843  * of non-default mempolicy.
1844  *
1845  * We don't bother with reference counting the mempolicy [mpol_get/put]
1846  * because the current task is examining it's own mempolicy and a task's
1847  * mempolicy is only ever changed by the task itself.
1848  *
1849  * N.B., it is the caller's responsibility to free a returned nodemask.
1850  */
1851 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1852 {
1853         struct mempolicy *mempolicy;
1854         int nid;
1855
1856         if (!(mask && current->mempolicy))
1857                 return false;
1858
1859         task_lock(current);
1860         mempolicy = current->mempolicy;
1861         switch (mempolicy->mode) {
1862         case MPOL_PREFERRED:
1863                 if (mempolicy->flags & MPOL_F_LOCAL)
1864                         nid = numa_node_id();
1865                 else
1866                         nid = mempolicy->v.preferred_node;
1867                 init_nodemask_of_node(mask, nid);
1868                 break;
1869
1870         case MPOL_BIND:
1871                 /* Fall through */
1872         case MPOL_INTERLEAVE:
1873                 *mask =  mempolicy->v.nodes;
1874                 break;
1875
1876         default:
1877                 BUG();
1878         }
1879         task_unlock(current);
1880
1881         return true;
1882 }
1883 #endif
1884
1885 /*
1886  * mempolicy_nodemask_intersects
1887  *
1888  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1889  * policy.  Otherwise, check for intersection between mask and the policy
1890  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1891  * policy, always return true since it may allocate elsewhere on fallback.
1892  *
1893  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1894  */
1895 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1896                                         const nodemask_t *mask)
1897 {
1898         struct mempolicy *mempolicy;
1899         bool ret = true;
1900
1901         if (!mask)
1902                 return ret;
1903         task_lock(tsk);
1904         mempolicy = tsk->mempolicy;
1905         if (!mempolicy)
1906                 goto out;
1907
1908         switch (mempolicy->mode) {
1909         case MPOL_PREFERRED:
1910                 /*
1911                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1912                  * allocate from, they may fallback to other nodes when oom.
1913                  * Thus, it's possible for tsk to have allocated memory from
1914                  * nodes in mask.
1915                  */
1916                 break;
1917         case MPOL_BIND:
1918         case MPOL_INTERLEAVE:
1919                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1920                 break;
1921         default:
1922                 BUG();
1923         }
1924 out:
1925         task_unlock(tsk);
1926         return ret;
1927 }
1928
1929 /**
1930  *      alloc_pages_vma - Allocate a page for a VMA.
1931  *
1932  *      @gfp:
1933  *      %GFP_USER    user allocation.
1934  *      %GFP_KERNEL  kernel allocations,
1935  *      %GFP_HIGHMEM highmem/user allocations,
1936  *      %GFP_FS      allocation should not call back into a file system.
1937  *      %GFP_ATOMIC  don't sleep.
1938  *
1939  *      @order:Order of the GFP allocation.
1940  *      @vma:  Pointer to VMA or NULL if not available.
1941  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1942  *
1943  *      This function allocates a page from the kernel page pool and applies
1944  *      a NUMA policy associated with the VMA or the current process.
1945  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1946  *      mm_struct of the VMA to prevent it from going away. Should be used for
1947  *      all allocations for pages that will be mapped into
1948  *      user space. Returns NULL when no page can be allocated.
1949  *
1950  *      Should be called with the mm_sem of the vma hold.
1951  */
1952 struct page *
1953 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1954                 unsigned long addr, int node)
1955 {
1956         struct mempolicy *pol;
1957         struct zonelist *zl;
1958         struct page *page;
1959         unsigned int cpuset_mems_cookie;
1960
1961 retry_cpuset:
1962         pol = get_vma_policy(current, vma, addr);
1963         cpuset_mems_cookie = get_mems_allowed();
1964
1965         zl = policy_zonelist(gfp, pol, node);
1966         if (unlikely(mpol_needs_cond_ref(pol))) {
1967                 /*
1968                  * slow path: ref counted shared policy
1969                  */
1970                 struct page *page =  __alloc_pages_nodemask(gfp, order,
1971                                                 zl, policy_nodemask(gfp, pol));
1972                 __mpol_put(pol);
1973                 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1974                         goto retry_cpuset;
1975                 return page;
1976         }
1977         /*
1978          * fast path:  default or task policy
1979          */
1980         page = __alloc_pages_nodemask(gfp, order, zl,
1981                                       policy_nodemask(gfp, pol));
1982         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1983                 goto retry_cpuset;
1984         return page;
1985 }
1986
1987 /**
1988  *      alloc_pages_current - Allocate pages.
1989  *
1990  *      @gfp:
1991  *              %GFP_USER   user allocation,
1992  *              %GFP_KERNEL kernel allocation,
1993  *              %GFP_HIGHMEM highmem allocation,
1994  *              %GFP_FS     don't call back into a file system.
1995  *              %GFP_ATOMIC don't sleep.
1996  *      @order: Power of two of allocation size in pages. 0 is a single page.
1997  *
1998  *      Allocate a page from the kernel page pool.  When not in
1999  *      interrupt context and apply the current process NUMA policy.
2000  *      Returns NULL when no page can be allocated.
2001  *
2002  *      Don't call cpuset_update_task_memory_state() unless
2003  *      1) it's ok to take cpuset_sem (can WAIT), and
2004  *      2) allocating for current task (not interrupt).
2005  */
2006 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2007 {
2008         struct mempolicy *pol = get_task_policy(current);
2009         struct page *page;
2010         unsigned int cpuset_mems_cookie;
2011
2012         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2013                 pol = &default_policy;
2014
2015 retry_cpuset:
2016         cpuset_mems_cookie = get_mems_allowed();
2017
2018         /*
2019          * No reference counting needed for current->mempolicy
2020          * nor system default_policy
2021          */
2022         page = __alloc_pages_nodemask(gfp, order,
2023                         policy_zonelist(gfp, pol, numa_node_id()),
2024                         policy_nodemask(gfp, pol));
2025
2026         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2027                 goto retry_cpuset;
2028
2029         return page;
2030 }
2031 EXPORT_SYMBOL(alloc_pages_current);
2032
2033 /*
2034  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2035  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2036  * with the mems_allowed returned by cpuset_mems_allowed().  This
2037  * keeps mempolicies cpuset relative after its cpuset moves.  See
2038  * further kernel/cpuset.c update_nodemask().
2039  *
2040  * current's mempolicy may be rebinded by the other task(the task that changes
2041  * cpuset's mems), so we needn't do rebind work for current task.
2042  */
2043
2044 /* Slow path of a mempolicy duplicate */
2045 struct mempolicy *__mpol_dup(struct mempolicy *old)
2046 {
2047         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2048
2049         if (!new)
2050                 return ERR_PTR(-ENOMEM);
2051
2052         /* task's mempolicy is protected by alloc_lock */
2053         if (old == current->mempolicy) {
2054                 task_lock(current);
2055                 *new = *old;
2056                 task_unlock(current);
2057         } else
2058                 *new = *old;
2059
2060         rcu_read_lock();
2061         if (current_cpuset_is_being_rebound()) {
2062                 nodemask_t mems = cpuset_mems_allowed(current);
2063                 if (new->flags & MPOL_F_REBINDING)
2064                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2065                 else
2066                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2067         }
2068         rcu_read_unlock();
2069         atomic_set(&new->refcnt, 1);
2070         return new;
2071 }
2072
2073 /*
2074  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
2075  * eliminate the * MPOL_F_* flags that require conditional ref and
2076  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
2077  * after return.  Use the returned value.
2078  *
2079  * Allows use of a mempolicy for, e.g., multiple allocations with a single
2080  * policy lookup, even if the policy needs/has extra ref on lookup.
2081  * shmem_readahead needs this.
2082  */
2083 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
2084                                                 struct mempolicy *frompol)
2085 {
2086         if (!mpol_needs_cond_ref(frompol))
2087                 return frompol;
2088
2089         *tompol = *frompol;
2090         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
2091         __mpol_put(frompol);
2092         return tompol;
2093 }
2094
2095 /* Slow path of a mempolicy comparison */
2096 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2097 {
2098         if (!a || !b)
2099                 return false;
2100         if (a->mode != b->mode)
2101                 return false;
2102         if (a->flags != b->flags)
2103                 return false;
2104         if (mpol_store_user_nodemask(a))
2105                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2106                         return false;
2107
2108         switch (a->mode) {
2109         case MPOL_BIND:
2110                 /* Fall through */
2111         case MPOL_INTERLEAVE:
2112                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2113         case MPOL_PREFERRED:
2114                 return a->v.preferred_node == b->v.preferred_node;
2115         default:
2116                 BUG();
2117                 return false;
2118         }
2119 }
2120
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->mutex mutex, which should be held
 * for any accesses to the tree.
 */
2129
2130 /* lookup first element intersecting start-end */
2131 /* Caller holds sp->mutex */
2132 static struct sp_node *
2133 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2134 {
2135         struct rb_node *n = sp->root.rb_node;
2136
2137         while (n) {
2138                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2139
2140                 if (start >= p->end)
2141                         n = n->rb_right;
2142                 else if (end <= p->start)
2143                         n = n->rb_left;
2144                 else
2145                         break;
2146         }
2147         if (!n)
2148                 return NULL;
2149         for (;;) {
2150                 struct sp_node *w = NULL;
2151                 struct rb_node *prev = rb_prev(n);
2152                 if (!prev)
2153                         break;
2154                 w = rb_entry(prev, struct sp_node, nd);
2155                 if (w->end <= start)
2156                         break;
2157                 n = prev;
2158         }
2159         return rb_entry(n, struct sp_node, nd);
2160 }
2161
2162 /* Insert a new shared policy into the list. */
2163 /* Caller holds sp->lock */
2164 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2165 {
2166         struct rb_node **p = &sp->root.rb_node;
2167         struct rb_node *parent = NULL;
2168         struct sp_node *nd;
2169
2170         while (*p) {
2171                 parent = *p;
2172                 nd = rb_entry(parent, struct sp_node, nd);
2173                 if (new->start < nd->start)
2174                         p = &(*p)->rb_left;
2175                 else if (new->end > nd->end)
2176                         p = &(*p)->rb_right;
2177                 else
2178                         BUG();
2179         }
2180         rb_link_node(&new->nd, parent, p);
2181         rb_insert_color(&new->nd, &sp->root);
2182         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2183                  new->policy ? new->policy->mode : 0);
2184 }
2185
2186 /* Find shared policy intersecting idx */
2187 struct mempolicy *
2188 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2189 {
2190         struct mempolicy *pol = NULL;
2191         struct sp_node *sn;
2192
2193         if (!sp->root.rb_node)
2194                 return NULL;
2195         mutex_lock(&sp->mutex);
2196         sn = sp_lookup(sp, idx, idx+1);
2197         if (sn) {
2198                 mpol_get(sn->policy);
2199                 pol = sn->policy;
2200         }
2201         mutex_unlock(&sp->mutex);
2202         return pol;
2203 }
2204
2205 static void sp_free(struct sp_node *n)
2206 {
2207         mpol_put(n->policy);
2208         kmem_cache_free(sn_cache, n);
2209 }
2210
2211 /**
2212  * mpol_misplaced - check whether current page node is valid in policy
2213  *
2214  * @page   - page to be checked
2215  * @vma    - vm area where page mapped
2216  * @addr   - virtual address where page mapped
2217  * @multi  - use multi-stage node binding
2218  *
2219  * Lookup current policy node id for vma,addr and "compare to" page's
2220  * node id.
2221  *
2222  * Returns:
2223  *      -1      - not misplaced, page is in the right node
2224  *      node    - node id where the page should be
2225  *
2226  * Policy determination "mimics" alloc_page_vma().
2227  * Called from fault path where we know the vma and faulting address.
2228  */
2229 int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
2230                    unsigned long addr)
2231 {
2232         struct mempolicy *pol;
2233         struct zone *zone;
2234         int curnid = page_to_nid(page);
2235         unsigned long pgoff;
2236         int polnid = -1;
2237         int ret = -1;
2238
2239         BUG_ON(!vma);
2240
2241         pol = get_vma_policy(current, vma, addr);
2242         if (!(pol->flags & MPOL_F_MOF))
2243                 goto out;
2244
2245         switch (pol->mode) {
2246         case MPOL_INTERLEAVE:
2247                 BUG_ON(addr >= vma->vm_end);
2248                 BUG_ON(addr < vma->vm_start);
2249
2250                 pgoff = vma->vm_pgoff;
2251                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2252                 polnid = offset_il_node(pol, vma, pgoff);
2253                 break;
2254
2255         case MPOL_PREFERRED:
2256                 if (pol->flags & MPOL_F_LOCAL)
2257                         polnid = numa_node_id();
2258                 else
2259                         polnid = pol->v.preferred_node;
2260                 break;
2261
2262         case MPOL_BIND:
2263                 /*
2264                  * allows binding to multiple nodes.
2265                  * use current page if in policy nodemask,
2266                  * else select nearest allowed node, if any.
2267                  * If no allowed nodes, use current [!misplaced].
2268                  */
2269                 if (node_isset(curnid, pol->v.nodes))
2270                         goto out;
2271                 (void)first_zones_zonelist(
2272                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2273                                 gfp_zone(GFP_HIGHUSER),
2274                                 &pol->v.nodes, &zone);
2275                 polnid = zone->node;
2276                 break;
2277
2278         default:
2279                 BUG();
2280         }
2281
2282         /*
2283          * Multi-stage node selection is used in conjunction with a periodic
2284          * migration fault to build a temporal task<->page relation. By
2285          * using a two-stage filter we remove short/unlikely relations.
2286          *
2287          * Using P(p) ~ n_p / n_t as per frequentist probability, we can
2288          * equate a task's usage of a particular page (n_p) per total usage
2289          * of this page (n_t) (in a given time-span) to a probability.
2290          *
2291          * Our periodic faults will then sample this probability and getting
2292          * the same result twice in a row, given these samples are fully
2293          * independent, is then given by P(n)^2, provided our sample period
2294          * is sufficiently short compared to the usage pattern.
2295          *
2296          * This quadric squishes small probabilities, making it less likely
2297          * we act on an unlikely task<->page relation.
2298          */
2299         if (pol->flags & MPOL_F_HOME) {
2300                 int last_nid;
2301
2302                 /*
2303                  * Migrate towards the current node, depends on
2304                  * task_numa_placement() details.
2305                  */
2306                 polnid = numa_node_id();
2307                 last_nid = page_xchg_last_nid(page, polnid);
2308                 if (last_nid != polnid)
2309                         goto out;
2310         }
2311
2312         if (curnid != polnid)
2313                 ret = polnid;
2314 out:
2315         mpol_cond_put(pol);
2316
2317         return ret;
2318 }
2319
2320 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2321 {
2322         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2323         rb_erase(&n->nd, &sp->root);
2324         sp_free(n);
2325 }
2326
2327 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2328                                 struct mempolicy *pol)
2329 {
2330         struct sp_node *n;
2331         struct mempolicy *newpol;
2332
2333         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2334         if (!n)
2335                 return NULL;
2336
2337         newpol = mpol_dup(pol);
2338         if (IS_ERR(newpol)) {
2339                 kmem_cache_free(sn_cache, n);
2340                 return NULL;
2341         }
2342         newpol->flags |= MPOL_F_SHARED;
2343
2344         n->start = start;
2345         n->end = end;
2346         n->policy = newpol;
2347
2348         return n;
2349 }
2350
2351 /* Replace a policy range. */
2352 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2353                                  unsigned long end, struct sp_node *new)
2354 {
2355         struct sp_node *n;
2356         int ret = 0;
2357
2358         mutex_lock(&sp->mutex);
2359         n = sp_lookup(sp, start, end);
2360         /* Take care of old policies in the same range. */
2361         while (n && n->start < end) {
2362                 struct rb_node *next = rb_next(&n->nd);
2363                 if (n->start >= start) {
2364                         if (n->end <= end)
2365                                 sp_delete(sp, n);
2366                         else
2367                                 n->start = end;
2368                 } else {
2369                         /* Old policy spanning whole new range. */
2370                         if (n->end > end) {
2371                                 struct sp_node *new2;
2372                                 new2 = sp_alloc(end, n->end, n->policy);
2373                                 if (!new2) {
2374                                         ret = -ENOMEM;
2375                                         goto out;
2376                                 }
2377                                 n->end = start;
2378                                 sp_insert(sp, new2);
2379                                 break;
2380                         } else
2381                                 n->end = start;
2382                 }
2383                 if (!next)
2384                         break;
2385                 n = rb_entry(next, struct sp_node, nd);
2386         }
2387         if (new)
2388                 sp_insert(sp, new);
2389 out:
2390         mutex_unlock(&sp->mutex);
2391         return ret;
2392 }
2393
2394 /**
2395  * mpol_shared_policy_init - initialize shared policy for inode
2396  * @sp: pointer to inode shared policy
2397  * @mpol:  struct mempolicy to install
2398  *
2399  * Install non-NULL @mpol in inode's shared policy rb-tree.
2400  * On entry, the current task has a reference on a non-NULL @mpol.
2401  * This must be released on exit.
2402  * This is called at get_inode() calls and we can use GFP_KERNEL.
2403  */
2404 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2405 {
2406         int ret;
2407
2408         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2409         mutex_init(&sp->mutex);
2410
2411         if (mpol) {
2412                 struct vm_area_struct pvma;
2413                 struct mempolicy *new;
2414                 NODEMASK_SCRATCH(scratch);
2415
2416                 if (!scratch)
2417                         goto put_mpol;
2418                 /* contextualize the tmpfs mount point mempolicy */
2419                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2420                 if (IS_ERR(new))
2421                         goto free_scratch; /* no valid nodemask intersection */
2422
2423                 task_lock(current);
2424                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2425                 task_unlock(current);
2426                 if (ret)
2427                         goto put_new;
2428
2429                 /* Create pseudo-vma that contains just the policy */
2430                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2431                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2432                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2433
2434 put_new:
2435                 mpol_put(new);                  /* drop initial ref */
2436 free_scratch:
2437                 NODEMASK_SCRATCH_FREE(scratch);
2438 put_mpol:
2439                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2440         }
2441 }
2442
2443 int mpol_set_shared_policy(struct shared_policy *info,
2444                         struct vm_area_struct *vma, struct mempolicy *npol)
2445 {
2446         int err;
2447         struct sp_node *new = NULL;
2448         unsigned long sz = vma_pages(vma);
2449
2450         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2451                  vma->vm_pgoff,
2452                  sz, npol ? npol->mode : -1,
2453                  npol ? npol->flags : -1,
2454                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
2455
2456         if (npol) {
2457                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2458                 if (!new)
2459                         return -ENOMEM;
2460         }
2461         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2462         if (err && new)
2463                 sp_free(new);
2464         return err;
2465 }
2466
2467 /* Free a backing policy store on inode delete. */
2468 void mpol_free_shared_policy(struct shared_policy *p)
2469 {
2470         struct sp_node *n;
2471         struct rb_node *next;
2472
2473         if (!p->root.rb_node)
2474                 return;
2475         mutex_lock(&p->mutex);
2476         next = rb_first(&p->root);
2477         while (next) {
2478                 n = rb_entry(next, struct sp_node, nd);
2479                 next = rb_next(&n->nd);
2480                 sp_delete(p, n);
2481         }
2482         mutex_unlock(&p->mutex);
2483 }
2484
2485 /* assumes fs == KERNEL_DS */
2486 void __init numa_policy_init(void)
2487 {
2488         nodemask_t interleave_nodes;
2489         unsigned long largest = 0;
2490         int nid, prefer = 0;
2491
2492         policy_cache = kmem_cache_create("numa_policy",
2493                                          sizeof(struct mempolicy),
2494                                          0, SLAB_PANIC, NULL);
2495
2496         sn_cache = kmem_cache_create("shared_policy_node",
2497                                      sizeof(struct sp_node),
2498                                      0, SLAB_PANIC, NULL);
2499
2500         for_each_node(nid) {
2501                 preferred_node_policy[nid] = (struct mempolicy) {
2502                         .refcnt = ATOMIC_INIT(1),
2503                         .mode = MPOL_PREFERRED,
2504                         .flags = MPOL_F_MOF | MPOL_F_HOME,
2505                         .v = { .preferred_node = nid, },
2506                 };
2507         }
2508
2509         /*
2510          * Set interleaving policy for system init. Interleaving is only
2511          * enabled across suitably sized nodes (default is >= 16MB), or
2512          * fall back to the largest node if they're all smaller.
2513          */
2514         nodes_clear(interleave_nodes);
2515         for_each_node_state(nid, N_HIGH_MEMORY) {
2516                 unsigned long total_pages = node_present_pages(nid);
2517
2518                 /* Preserve the largest node */
2519                 if (largest < total_pages) {
2520                         largest = total_pages;
2521                         prefer = nid;
2522                 }
2523
2524                 /* Interleave this node? */
2525                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2526                         node_set(nid, interleave_nodes);
2527         }
2528
2529         /* All too small, use the largest */
2530         if (unlikely(nodes_empty(interleave_nodes)))
2531                 node_set(prefer, interleave_nodes);
2532
2533         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2534                 printk("numa_policy_init: interleaving failed\n");
2535 }
2536
/*
 * Reset policy of current process to default.
 *
 * Calls do_set_mempolicy() with MPOL_DEFAULT, a zero second argument
 * (presumably the mode flags - confirm against do_set_mempolicy()) and a
 * NULL nodemask.  The return value of do_set_mempolicy() is ignored.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
2542
2543 /*
2544  * Parse and format mempolicy from/to strings
2545  */
2546
2547 /*
2548  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2549  * Used only for mpol_parse_str() and mpol_to_str()
2550  */
2551 static const char * const policy_modes[] =
2552 {
2553         [MPOL_DEFAULT]    = "default",
2554         [MPOL_PREFERRED]  = "prefer",
2555         [MPOL_BIND]       = "bind",
2556         [MPOL_INTERLEAVE] = "interleave",
2557         [MPOL_LOCAL]      = "local",
2558         [MPOL_NOOP]       = "noop",     /* should not actually be used */
2559 };
2560
2561
2562 #ifdef CONFIG_TMPFS
2563 /**
2564  * mpol_parse_str - parse string to mempolicy
2565  * @str:  string containing mempolicy to parse
2566  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2567  * @no_context:  flag whether to "contextualize" the mempolicy
2568  *
2569  * Format of input:
2570  *      <mode>[=<flags>][:<nodelist>]
2571  *
2572  * if @no_context is true, save the input nodemask in w.user_nodemask in
2573  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2574  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2575  * mount option.  Note that if 'static' or 'relative' mode flags were
2576  * specified, the input nodemask will already have been saved.  Saving
2577  * it again is redundant, but safe.
2578  *
2579  * On success, returns 0, else 1
2580  */
2581 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2582 {
2583         struct mempolicy *new = NULL;
2584         unsigned short mode;
2585         unsigned short uninitialized_var(mode_flags);
2586         nodemask_t nodes;
2587         char *nodelist = strchr(str, ':');
2588         char *flags = strchr(str, '=');
2589         int err = 1;
2590
2591         if (nodelist) {
2592                 /* NUL-terminate mode or flags string */
2593                 *nodelist++ = '\0';
2594                 if (nodelist_parse(nodelist, nodes))
2595                         goto out;
2596                 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2597                         goto out;
2598         } else
2599                 nodes_clear(nodes);
2600
2601         if (flags)
2602                 *flags++ = '\0';        /* terminate mode string */
2603
2604         for (mode = 0; mode < MPOL_MAX; mode++) {
2605                 if (!strcmp(str, policy_modes[mode])) {
2606                         break;
2607                 }
2608         }
2609         if (mode >= MPOL_MAX || mode == MPOL_NOOP)
2610                 goto out;
2611
2612         switch (mode) {
2613         case MPOL_PREFERRED:
2614                 /*
2615                  * Insist on a nodelist of one node only
2616                  */
2617                 if (nodelist) {
2618                         char *rest = nodelist;
2619                         while (isdigit(*rest))
2620                                 rest++;
2621                         if (*rest)
2622                                 goto out;
2623                 }
2624                 break;
2625         case MPOL_INTERLEAVE:
2626                 /*
2627                  * Default to online nodes with memory if no nodelist
2628                  */
2629                 if (!nodelist)
2630                         nodes = node_states[N_HIGH_MEMORY];
2631                 break;
2632         case MPOL_LOCAL:
2633                 /*
2634                  * Don't allow a nodelist;  mpol_new() checks flags
2635                  */
2636                 if (nodelist)
2637                         goto out;
2638                 mode = MPOL_PREFERRED;
2639                 break;
2640         case MPOL_DEFAULT:
2641                 /*
2642                  * Insist on a empty nodelist
2643                  */
2644                 if (!nodelist)
2645                         err = 0;
2646                 goto out;
2647         case MPOL_BIND:
2648                 /*
2649                  * Insist on a nodelist
2650                  */
2651                 if (!nodelist)
2652                         goto out;
2653         }
2654
2655         mode_flags = 0;
2656         if (flags) {
2657                 /*
2658                  * Currently, we only support two mutually exclusive
2659                  * mode flags.
2660                  */
2661                 if (!strcmp(flags, "static"))
2662                         mode_flags |= MPOL_F_STATIC_NODES;
2663                 else if (!strcmp(flags, "relative"))
2664                         mode_flags |= MPOL_F_RELATIVE_NODES;
2665                 else
2666                         goto out;
2667         }
2668
2669         new = mpol_new(mode, mode_flags, &nodes);
2670         if (IS_ERR(new))
2671                 goto out;
2672
2673         if (no_context) {
2674                 /* save for contextualization */
2675                 new->w.user_nodemask = nodes;
2676         } else {
2677                 int ret;
2678                 NODEMASK_SCRATCH(scratch);
2679                 if (scratch) {
2680                         task_lock(current);
2681                         ret = mpol_set_nodemask(new, &nodes, scratch);
2682                         task_unlock(current);
2683                 } else
2684                         ret = -ENOMEM;
2685                 NODEMASK_SCRATCH_FREE(scratch);
2686                 if (ret) {
2687                         mpol_put(new);
2688                         goto out;
2689                 }
2690         }
2691         err = 0;
2692
2693 out:
2694         /* Restore string for error message */
2695         if (nodelist)
2696                 *--nodelist = ':';
2697         if (flags)
2698                 *--flags = '=';
2699         if (!err)
2700                 *mpol = new;
2701         return err;
2702 }
2703 #endif /* CONFIG_TMPFS */
2704
2705 /**
2706  * mpol_to_str - format a mempolicy structure for printing
2707  * @buffer:  to contain formatted mempolicy string
2708  * @maxlen:  length of @buffer
2709  * @pol:  pointer to mempolicy to be formatted
2710  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2711  *
2712  * Convert a mempolicy into a string.
2713  * Returns the number of characters in buffer (if positive)
2714  * or an error (negative)
2715  */
2716 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2717 {
2718         char *p = buffer;
2719         int l;
2720         nodemask_t nodes;
2721         unsigned short mode;
2722         unsigned short flags = pol ? pol->flags : 0;
2723
2724         /*
2725          * Sanity check:  room for longest mode, flag and some nodes
2726          */
2727         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2728
2729         if (!pol || pol == &default_policy)
2730                 mode = MPOL_DEFAULT;
2731         else
2732                 mode = pol->mode;
2733
2734         switch (mode) {
2735         case MPOL_DEFAULT:
2736                 nodes_clear(nodes);
2737                 break;
2738
2739         case MPOL_PREFERRED:
2740                 nodes_clear(nodes);
2741                 if (flags & MPOL_F_LOCAL)
2742                         mode = MPOL_LOCAL;      /* pseudo-policy */
2743                 else
2744                         node_set(pol->v.preferred_node, nodes);
2745                 break;
2746
2747         case MPOL_BIND:
2748                 /* Fall through */
2749         case MPOL_INTERLEAVE:
2750                 if (no_context)
2751                         nodes = pol->w.user_nodemask;
2752                 else
2753                         nodes = pol->v.nodes;
2754                 break;
2755
2756         default:
2757                 return -EINVAL;
2758         }
2759
2760         l = strlen(policy_modes[mode]);
2761         if (buffer + maxlen < p + l + 1)
2762                 return -ENOSPC;
2763
2764         strcpy(p, policy_modes[mode]);
2765         p += l;
2766
2767         if (flags & MPOL_MODE_FLAGS) {
2768                 if (buffer + maxlen < p + 2)
2769                         return -ENOSPC;
2770                 *p++ = '=';
2771
2772                 /*
2773                  * Currently, the only defined flags are mutually exclusive
2774                  */
2775                 if (flags & MPOL_F_STATIC_NODES)
2776                         p += snprintf(p, buffer + maxlen - p, "static");
2777                 else if (flags & MPOL_F_RELATIVE_NODES)
2778                         p += snprintf(p, buffer + maxlen - p, "relative");
2779         }
2780
2781         if (!nodes_empty(nodes)) {
2782                 if (buffer + maxlen < p + 2)
2783                         return -ENOSPC;
2784                 *p++ = ':';
2785                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2786         }
2787         return p - buffer;
2788 }