mm/hugetlb.c

   1 /*
   2  * Generic hugetlb support.
   3  * (C) Nadia Yvette Chambers, April 2004
   4  */
   5 #include <linux/list.h>
   6 #include <linux/init.h>
   7 #include <linux/module.h>
   8 #include <linux/mm.h>
   9 #include <linux/seq_file.h>
  10 #include <linux/sysctl.h>
  11 #include <linux/highmem.h>
  12 #include <linux/mmu_notifier.h>
  13 #include <linux/nodemask.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/mempolicy.h>
  16 #include <linux/compiler.h>
  17 #include <linux/cpuset.h>
  18 #include <linux/mutex.h>
  19 #include <linux/bootmem.h>
  20 #include <linux/sysfs.h>
  21 #include <linux/slab.h>
  22 #include <linux/rmap.h>
  23 #include <linux/swap.h>
  24 #include <linux/swapops.h>
  25 #include <linux/page-isolation.h>
  26 #include <linux/jhash.h>
  27
  28 #include <asm/page.h>
  29 #include <asm/pgtable.h>
  30 #include <asm/tlb.h>
  31
  32 #include <linux/io.h>
  33 #include <linux/hugetlb.h>
  34 #include <linux/hugetlb_cgroup.h>
  35 #include <linux/node.h>
  36 #include "internal.h"
  37
  38 int hugepages_treat_as_movable;
  39
  40 int hugetlb_max_hstate __read_mostly;
  41 unsigned int default_hstate_idx;
  42 struct hstate hstates[HUGE_MAX_HSTATE];
  43
  44 __initdata LIST_HEAD(huge_boot_pages);
  45
  46 /* for command line parsing */
  47 static struct hstate * __initdata parsed_hstate;
  48 static unsigned long __initdata default_hstate_max_huge_pages;
  49 static unsigned long __initdata default_hstate_size;
  50
  51 /*
  52  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
  53  * free_huge_pages, and surplus_huge_pages.
  54  */
  55 DEFINE_SPINLOCK(hugetlb_lock);
  56
  57 /*
  58  * Serializes faults on the same logical page.  This is used to
  59  * prevent spurious OOMs when the hugepage pool is fully utilized.
  60  */
  61 static int num_fault_mutexes;
  62 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
  63
  64 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
  65 {
  66         bool free = (spool->count == 0) && (spool->used_hpages == 0);
  67
  68         spin_unlock(&spool->lock);
  69
  70         /* If no pages are used, and no other handles to the subpool
  71          * remain, free the subpool the subpool remain */
  72         if (free)
  73                 kfree(spool);
  74 }
  75
  76 struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
  77 {
  78         struct hugepage_subpool *spool;
  79
  80         spool = kzalloc(sizeof(*spool), GFP_KERNEL);
  81         if (!spool)
  82                 return NULL;
  83
  84         spin_lock_init(&spool->lock);
  85         spool->count = 1;
  86         spool->max_hpages = nr_blocks;
  87
  88         return spool;
  89 }
  90
  91 void hugepage_put_subpool(struct hugepage_subpool *spool)
  92 {
  93         spin_lock(&spool->lock);
  94         BUG_ON(!spool->count);
  95         spool->count--;
  96         unlock_or_release_subpool(spool);
  97 }
  98
  99 static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
 100                                       long delta)
 101 {
 102         int ret = 0;
 103
 104         if (!spool)
 105                 return 0;
 106
 107         spin_lock(&spool->lock);
 108         if ((spool->used_hpages + delta) <= spool->max_hpages) {
 109                 spool->used_hpages += delta;
 110         } else {
 111                 ret = -ENOMEM;
 112         }
 113         spin_unlock(&spool->lock);
 114
 115         return ret;
 116 }
 117
 118 static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
 119                                        long delta)
 120 {
 121         if (!spool)
 122                 return;
 123
 124         spin_lock(&spool->lock);
 125         spool->used_hpages -= delta;
 126         /* If hugetlbfs_put_super couldn't free spool due to
 127         * an outstanding quota reference, free it now. */
 128         unlock_or_release_subpool(spool);
 129 }
 130
 131 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
 132 {
 133         return HUGETLBFS_SB(inode->i_sb)->spool;
 134 }
 135
 136 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
 137 {
 138         return subpool_inode(file_inode(vma->vm_file));
 139 }
 140
 141 /*
 142  * Region tracking -- allows tracking of reservations and instantiated pages
 143  *                    across the pages in a mapping.
 144  *
 145  * The region data structures are embedded into a resv_map and
 146  * protected by a resv_map's lock
 147  */
 148 struct file_region {
 149         struct list_head link;
 150         long from;
 151         long to;
 152 };
 153
 154 static long region_add(struct resv_map *resv, long f, long t)
 155 {
 156         struct list_head *head = &resv->regions;
 157         struct file_region *rg, *nrg, *trg;
 158
 159         spin_lock(&resv->lock);
 160         /* Locate the region we are either in or before. */
 161         list_for_each_entry(rg, head, link)
 162                 if (f <= rg->to)
 163                         break;
 164
 165         /* Round our left edge to the current segment if it encloses us. */
 166         if (f > rg->from)
 167                 f = rg->from;
 168
 169         /* Check for and consume any regions we now overlap with. */
 170         nrg = rg;
 171         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
 172                 if (&rg->link == head)
 173                         break;
 174                 if (rg->from > t)
 175                         break;
 176
 177                 /* If this area reaches higher then extend our area to
 178                  * include it completely.  If this is not the first area
 179                  * which we intend to reuse, free it. */
 180                 if (rg->to > t)
 181                         t = rg->to;
 182                 if (rg != nrg) {
 183                         list_del(&rg->link);
 184                         kfree(rg);
 185                 }
 186         }
 187         nrg->from = f;
 188         nrg->to = t;
 189         spin_unlock(&resv->lock);
 190         return 0;
 191 }
 192
 193 static long region_chg(struct resv_map *resv, long f, long t)
 194 {
 195         struct list_head *head = &resv->regions;
 196         struct file_region *rg, *nrg = NULL;
 197         long chg = 0;
 198
 199 retry:
 200         spin_lock(&resv->lock);
 201         /* Locate the region we are before or in. */
 202         list_for_each_entry(rg, head, link)
 203                 if (f <= rg->to)
 204                         break;
 205
 206         /* If we are below the current region then a new region is required.
 207          * Subtle, allocate a new region at the position but make it zero
 208          * size such that we can guarantee to record the reservation. */
 209         if (&rg->link == head || t < rg->from) {
 210                 if (!nrg) {
 211                         spin_unlock(&resv->lock);
 212                         nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
 213                         if (!nrg)
 214                                 return -ENOMEM;
 215
 216                         nrg->from = f;
 217                         nrg->to   = f;
 218                         INIT_LIST_HEAD(&nrg->link);
 219                         goto retry;
 220                 }
 221
 222                 list_add(&nrg->link, rg->link.prev);
 223                 chg = t - f;
 224                 goto out_nrg;
 225         }
 226
 227         /* Round our left edge to the current segment if it encloses us. */
 228         if (f > rg->from)
 229                 f = rg->from;
 230         chg = t - f;
 231
 232         /* Check for and consume any regions we now overlap with. */
 233         list_for_each_entry(rg, rg->link.prev, link) {
 234                 if (&rg->link == head)
 235                         break;
 236                 if (rg->from > t)
 237                         goto out;
 238
 239                 /* We overlap with this area, if it extends further than
 240                  * us then we must extend ourselves.  Account for its
 241                  * existing reservation. */
 242                 if (rg->to > t) {
 243                         chg += rg->to - t;
 244                         t = rg->to;
 245                 }
 246                 chg -= rg->to - rg->from;
 247         }
 248
 249 out:
 250         spin_unlock(&resv->lock);
 251         /*  We already know we raced and no longer need the new region */
 252         kfree(nrg);
 253         return chg;
 254 out_nrg:
 255         spin_unlock(&resv->lock);
 256         return chg;
 257 }
 258
 259 static long region_truncate(struct resv_map *resv, long end)
 260 {
 261         struct list_head *head = &resv->regions;
 262         struct file_region *rg, *trg;
 263         long chg = 0;
 264
 265         spin_lock(&resv->lock);
 266         /* Locate the region we are either in or before. */
 267         list_for_each_entry(rg, head, link)
 268                 if (end <= rg->to)
 269                         break;
 270         if (&rg->link == head)
 271                 goto out;
 272
 273         /* If we are in the middle of a region then adjust it. */
 274         if (end > rg->from) {
 275                 chg = rg->to - end;
 276                 rg->to = end;
 277                 rg = list_entry(rg->link.next, typeof(*rg), link);
 278         }
 279
 280         /* Drop any remaining regions. */
 281         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
 282                 if (&rg->link == head)
 283                         break;
 284                 chg += rg->to - rg->from;
 285                 list_del(&rg->link);
 286                 kfree(rg);
 287         }
 288
 289 out:
 290         spin_unlock(&resv->lock);
 291         return chg;
 292 }
 293
 294 static long region_count(struct resv_map *resv, long f, long t)
 295 {
 296         struct list_head *head = &resv->regions;
 297         struct file_region *rg;
 298         long chg = 0;
 299
 300         spin_lock(&resv->lock);
 301         /* Locate each segment we overlap with, and count that overlap. */
 302         list_for_each_entry(rg, head, link) {
 303                 long seg_from;
 304                 long seg_to;
 305
 306                 if (rg->to <= f)
 307                         continue;
 308                 if (rg->from >= t)
 309                         break;
 310
 311                 seg_from = max(rg->from, f);
 312                 seg_to = min(rg->to, t);
 313
 314                 chg += seg_to - seg_from;
 315         }
 316         spin_unlock(&resv->lock);
 317
 318         return chg;
 319 }
 320
 321 /*
 322  * Convert the address within this vma to the page offset within
 323  * the mapping, in pagecache page units; huge pages here.
 324  */
 325 static pgoff_t vma_hugecache_offset(struct hstate *h,
 326                         struct vm_area_struct *vma, unsigned long address)
 327 {
 328         return ((address - vma->vm_start) >> huge_page_shift(h)) +
 329                         (vma->vm_pgoff >> huge_page_order(h));
 330 }
 331
 332 pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
 333                                      unsigned long address)
 334 {
 335         return vma_hugecache_offset(hstate_vma(vma), vma, address);
 336 }
 337
 338 /*
 339  * Return the size of the pages allocated when backing a VMA. In the majority
 340  * cases this will be same size as used by the page table entries.
 341  */
 342 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
 343 {
 344         struct hstate *hstate;
 345
 346         if (!is_vm_hugetlb_page(vma))
 347                 return PAGE_SIZE;
 348
 349         hstate = hstate_vma(vma);
 350
 351         return 1UL << huge_page_shift(hstate);
 352 }
 353 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
 354
 355 /*
 356  * Return the page size being used by the MMU to back a VMA. In the majority
 357  * of cases, the page size used by the kernel matches the MMU size. On
 358  * architectures where it differs, an architecture-specific version of this
 359  * function is required.
 360  */
 361 #ifndef vma_mmu_pagesize
 362 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 363 {
 364         return vma_kernel_pagesize(vma);
 365 }
 366 #endif
 367
 368 /*
 369  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 370  * bits of the reservation map pointer, which are always clear due to
 371  * alignment.
 372  */
 373 #define HPAGE_RESV_OWNER    (1UL << 0)
 374 #define HPAGE_RESV_UNMAPPED (1UL << 1)
 375 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
 376
 377 /*
 378  * These helpers are used to track how many pages are reserved for
 379  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 380  * is guaranteed to have their future faults succeed.
 381  *
 382  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 383  * the reserve counters are updated with the hugetlb_lock held. It is safe
 384  * to reset the VMA at fork() time as it is not in use yet and there is no
 385  * chance of the global counters getting corrupted as a result of the values.
 386  *
 387  * The private mapping reservation is represented in a subtly different
 388  * manner to a shared mapping.  A shared mapping has a region map associated
 389  * with the underlying file, this region map represents the backing file
 390  * pages which have ever had a reservation assigned which this persists even
 391  * after the page is instantiated.  A private mapping has a region map
 392  * associated with the original mmap which is attached to all VMAs which
 393  * reference it, this region map represents those offsets which have consumed
 394  * reservation ie. where pages have been instantiated.
 395  */
 396 static unsigned long get_vma_private_data(struct vm_area_struct *vma)
 397 {
 398         return (unsigned long)vma->vm_private_data;
 399 }
 400
 401 static void set_vma_private_data(struct vm_area_struct *vma,
 402                                                         unsigned long value)
 403 {
 404         vma->vm_private_data = (void *)value;
 405 }
 406
 407 struct resv_map *resv_map_alloc(void)
 408 {
 409         struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
 410         if (!resv_map)
 411                 return NULL;
 412
 413         kref_init(&resv_map->refs);
 414         spin_lock_init(&resv_map->lock);
 415         INIT_LIST_HEAD(&resv_map->regions);
 416
 417         return resv_map;
 418 }
 419
 420 void resv_map_release(struct kref *ref)
 421 {
 422         struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
 423
 424         /* Clear out any active regions before we release the map. */
 425         region_truncate(resv_map, 0);
 426         kfree(resv_map);
 427 }
 428
 429 static inline struct resv_map *inode_resv_map(struct inode *inode)
 430 {
 431         return inode->i_mapping->private_data;
 432 }
 433
 434 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 435 {
 436         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
 437         if (vma->vm_flags & VM_MAYSHARE) {
 438                 struct address_space *mapping = vma->vm_file->f_mapping;
 439                 struct inode *inode = mapping->host;
 440
 441                 return inode_resv_map(inode);
 442
 443         } else {
 444                 return (struct resv_map *)(get_vma_private_data(vma) &
 445                                                         ~HPAGE_RESV_MASK);
 446         }
 447 }
 448
 449 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
 450 {
 451         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
 452         VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
 453
 454         set_vma_private_data(vma, (get_vma_private_data(vma) &
 455                                 HPAGE_RESV_MASK) | (unsigned long)map);
 456 }
 457
 458 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
 459 {
 460         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
 461         VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
 462
 463         set_vma_private_data(vma, get_vma_private_data(vma) | flags);
 464 }
 465
 466 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
 467 {
 468         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
 469
 470         return (get_vma_private_data(vma) & flag) != 0;
 471 }
 472
 473 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
 474 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 475 {
 476         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
 477         if (!(vma->vm_flags & VM_MAYSHARE))
 478                 vma->vm_private_data = (void *)0;
 479 }
 480
 481 /* Returns true if the VMA has associated reserve pages */
 482 static int vma_has_reserves(struct vm_area_struct *vma, long chg)
 483 {
 484         if (vma->vm_flags & VM_NORESERVE) {
 485                 /*
 486                  * This address is already reserved by other process(chg == 0),
 487                  * so, we should decrement reserved count. Without decrementing,
 488                  * reserve count remains after releasing inode, because this
 489                  * allocated page will go into page cache and is regarded as
 490                  * coming from reserved pool in releasing step.  Currently, we
 491                  * don't have any other solution to deal with this situation
 492                  * properly, so add work-around here.
 493                  */
 494                 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
 495                         return 1;
 496                 else
 497                         return 0;
 498         }
 499
 500         /* Shared mappings always use reserves */
 501         if (vma->vm_flags & VM_MAYSHARE)
 502                 return 1;
 503
 504         /*
 505          * Only the process that called mmap() has reserves for
 506          * private mappings.
 507          */
 508         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 509                 return 1;
 510
 511         return 0;
 512 }
 513
 514 static void enqueue_huge_page(struct hstate *h, struct page *page)
 515 {
 516         int nid = page_to_nid(page);
 517         list_move(&page->lru, &h->hugepage_freelists[nid]);
 518         h->free_huge_pages++;
 519         h->free_huge_pages_node[nid]++;
 520 }
 521
 522 static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 523 {
 524         struct page *page;
 525
 526         list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
 527                 if (!is_migrate_isolate_page(page))
 528                         break;
 529         /*
 530          * if 'non-isolated free hugepage' not found on the list,
 531          * the allocation fails.
 532          */
 533         if (&h->hugepage_freelists[nid] == &page->lru)
 534                 return NULL;
 535         list_move(&page->lru, &h->hugepage_activelist);
 536         set_page_refcounted(page);
 537         h->free_huge_pages--;
 538         h->free_huge_pages_node[nid]--;
 539         return page;
 540 }
 541
 542 /* Movability of hugepages depends on migration support. */
 543 static inline gfp_t htlb_alloc_mask(struct hstate *h)
 544 {
 545         if (hugepages_treat_as_movable || hugepage_migration_supported(h))
 546                 return GFP_HIGHUSER_MOVABLE;
 547         else
 548                 return GFP_HIGHUSER;
 549 }
 550
 551 static struct page *dequeue_huge_page_vma(struct hstate *h,
 552                                 struct vm_area_struct *vma,
 553                                 unsigned long address, int avoid_reserve,
 554                                 long chg)
 555 {
 556         struct page *page = NULL;
 557         struct mempolicy *mpol;
 558         nodemask_t *nodemask;
 559         struct zonelist *zonelist;
 560         struct zone *zone;
 561         struct zoneref *z;
 562         unsigned int cpuset_mems_cookie;
 563
 564         /*
 565          * A child process with MAP_PRIVATE mappings created by their parent
 566          * have no page reserves. This check ensures that reservations are
 567          * not "stolen". The child may still get SIGKILLed
 568          */
 569         if (!vma_has_reserves(vma, chg) &&
 570                         h->free_huge_pages - h->resv_huge_pages == 0)
 571                 goto err;
 572
 573         /* If reserves cannot be used, ensure enough pages are in the pool */
 574         if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
 575                 goto err;
 576
 577 retry_cpuset:
 578         cpuset_mems_cookie = read_mems_allowed_begin();
 579         zonelist = huge_zonelist(vma, address,
 580                                         htlb_alloc_mask(h), &mpol, &nodemask);
 581
 582         for_each_zone_zonelist_nodemask(zone, z, zonelist,
 583                                                 MAX_NR_ZONES - 1, nodemask) {
 584                 if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
 585                         page = dequeue_huge_page_node(h, zone_to_nid(zone));
 586                         if (page) {
 587                                 if (avoid_reserve)
 588                                         break;
 589                                 if (!vma_has_reserves(vma, chg))
 590                                         break;
 591
 592                                 SetPagePrivate(page);
 593                                 h->resv_huge_pages--;
 594                                 break;
 595                         }
 596                 }
 597         }
 598
 599         mpol_cond_put(mpol);
 600         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 601                 goto retry_cpuset;
 602         return page;
 603
 604 err:
 605         return NULL;
 606 }
 607
 608 /*
 609  * common helper functions for hstate_next_node_to_{alloc|free}.
 610  * We may have allocated or freed a huge page based on a different
 611  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 612  * be outside of *nodes_allowed.  Ensure that we use an allowed
 613  * node for alloc or free.
 614  */
 615 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 616 {
 617         nid = next_node(nid, *nodes_allowed);
 618         if (nid == MAX_NUMNODES)
 619                 nid = first_node(*nodes_allowed);
 620         VM_BUG_ON(nid >= MAX_NUMNODES);
 621
 622         return nid;
 623 }
 624
 625 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
 626 {
 627         if (!node_isset(nid, *nodes_allowed))
 628                 nid = next_node_allowed(nid, nodes_allowed);
 629         return nid;
 630 }
 631
 632 /*
 633  * returns the previously saved node ["this node"] from which to
 634  * allocate a persistent huge page for the pool and advance the
 635  * next node from which to allocate, handling wrap at end of node
 636  * mask.
 637  */
 638 static int hstate_next_node_to_alloc(struct hstate *h,
 639                                         nodemask_t *nodes_allowed)
 640 {
 641         int nid;
 642
 643         VM_BUG_ON(!nodes_allowed);
 644
 645         nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
 646         h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
 647
 648         return nid;
 649 }
 650
 651 /*
 652  * helper for free_pool_huge_page() - return the previously saved
 653  * node ["this node"] from which to free a huge page.  Advance the
 654  * next node id whether or not we find a free huge page to free so
 655  * that the next attempt to free addresses the next node.
 656  */
 657 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 658 {
 659         int nid;
 660
 661         VM_BUG_ON(!nodes_allowed);
 662
 663         nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
 664         h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
 665
 666         return nid;
 667 }
 668
 669 #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)           \
 670         for (nr_nodes = nodes_weight(*mask);                            \
 671                 nr_nodes > 0 &&                                         \
 672                 ((node = hstate_next_node_to_alloc(hs, mask)) || 1);    \
 673                 nr_nodes--)
 674
 675 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)            \
 676         for (nr_nodes = nodes_weight(*mask);                            \
 677                 nr_nodes > 0 &&                                         \
 678                 ((node = hstate_next_node_to_free(hs, mask)) || 1);     \
 679                 nr_nodes--)
 680
 681 #if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
 682 static void destroy_compound_gigantic_page(struct page *page,
 683                                         unsigned long order)
 684 {
 685         int i;
 686         int nr_pages = 1 << order;
 687         struct page *p = page + 1;
 688
 689         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
 690                 __ClearPageTail(p);
 691                 set_page_refcounted(p);
 692                 p->first_page = NULL;
 693         }
 694
 695         set_compound_order(page, 0);
 696         __ClearPageHead(page);
 697 }
 698
 699 static void free_gigantic_page(struct page *page, unsigned order)
 700 {
 701         free_contig_range(page_to_pfn(page), 1 << order);
 702 }
 703
 704 static int __alloc_gigantic_page(unsigned long start_pfn,
 705                                 unsigned long nr_pages)
 706 {
 707         unsigned long end_pfn = start_pfn + nr_pages;
 708         return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 709 }
 710
 711 static bool pfn_range_valid_gigantic(unsigned long start_pfn,
 712                                 unsigned long nr_pages)
 713 {
 714         unsigned long i, end_pfn = start_pfn + nr_pages;
 715         struct page *page;
 716
 717         for (i = start_pfn; i < end_pfn; i++) {
 718                 if (!pfn_valid(i))
 719                         return false;
 720
 721                 page = pfn_to_page(i);
 722
 723                 if (PageReserved(page))
 724                         return false;
 725
 726                 if (page_count(page) > 0)
 727                         return false;
 728
 729                 if (PageHuge(page))
 730                         return false;
 731         }
 732
 733         return true;
 734 }
 735
 736 static bool zone_spans_last_pfn(const struct zone *zone,
 737                         unsigned long start_pfn, unsigned long nr_pages)
 738 {
 739         unsigned long last_pfn = start_pfn + nr_pages - 1;
 740         return zone_spans_pfn(zone, last_pfn);
 741 }
 742
 743 static struct page *alloc_gigantic_page(int nid, unsigned order)
 744 {
 745         unsigned long nr_pages = 1 << order;
 746         unsigned long ret, pfn, flags;
 747         struct zone *z;
 748
 749         z = NODE_DATA(nid)->node_zones;
 750         for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
 751                 spin_lock_irqsave(&z->lock, flags);
 752
 753                 pfn = ALIGN(z->zone_start_pfn, nr_pages);
 754                 while (zone_spans_last_pfn(z, pfn, nr_pages)) {
 755                         if (pfn_range_valid_gigantic(pfn, nr_pages)) {
 756                                 /*
 757                                  * We release the zone lock here because
 758                                  * alloc_contig_range() will also lock the zone
 759                                  * at some point. If there's an allocation
 760                                  * spinning on this lock, it may win the race
 761                                  * and cause alloc_contig_range() to fail...
 762                                  */
 763                                 spin_unlock_irqrestore(&z->lock, flags);
 764                                 ret = __alloc_gigantic_page(pfn, nr_pages);
 765                                 if (!ret)
 766                                         return pfn_to_page(pfn);
 767                                 spin_lock_irqsave(&z->lock, flags);
 768                         }
 769                         pfn += nr_pages;
 770                 }
 771
 772                 spin_unlock_irqrestore(&z->lock, flags);
 773         }
 774
 775         return NULL;
 776 }
 777
 778 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
 779 static void prep_compound_gigantic_page(struct page *page, unsigned long order);
 780
 781 static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
 782 {
 783         struct page *page;
 784
 785         page = alloc_gigantic_page(nid, huge_page_order(h));
 786         if (page) {
 787                 prep_compound_gigantic_page(page, huge_page_order(h));
 788                 prep_new_huge_page(h, page, nid);
 789         }
 790
 791         return page;
 792 }
 793
 794 static int alloc_fresh_gigantic_page(struct hstate *h,
 795                                 nodemask_t *nodes_allowed)
 796 {
 797         struct page *page = NULL;
 798         int nr_nodes, node;
 799
 800         for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
 801                 page = alloc_fresh_gigantic_page_node(h, node);
 802                 if (page)
 803                         return 1;
 804         }
 805
 806         return 0;
 807 }
 808
 809 static inline bool gigantic_page_supported(void) { return true; }
 810 #else
 811 static inline bool gigantic_page_supported(void) { return false; }
 812 static inline void free_gigantic_page(struct page *page, unsigned order) { }
 813 static inline void destroy_compound_gigantic_page(struct page *page,
 814                                                 unsigned long order) { }
 815 static inline int alloc_fresh_gigantic_page(struct hstate *h,
 816                                         nodemask_t *nodes_allowed) { return 0; }
 817 #endif
 818
 819 static void update_and_free_page(struct hstate *h, struct page *page)
 820 {
 821         int i;
 822
 823         if (hstate_is_gigantic(h) && !gigantic_page_supported())
 824                 return;
 825
 826         h->nr_huge_pages--;
 827         h->nr_huge_pages_node[page_to_nid(page)]--;
 828         for (i = 0; i < pages_per_huge_page(h); i++) {
 829                 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
 830                                 1 << PG_referenced | 1 << PG_dirty |
 831                                 1 << PG_active | 1 << PG_private |
 832                                 1 << PG_writeback);
 833         }
 834         VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
 835         set_compound_page_dtor(page, NULL);
 836         set_page_refcounted(page);
 837         if (hstate_is_gigantic(h)) {
 838                 destroy_compound_gigantic_page(page, huge_page_order(h));
 839                 free_gigantic_page(page, huge_page_order(h));
 840         } else {
 841                 arch_release_hugepage(page);
 842                 __free_pages(page, huge_page_order(h));
 843         }
 844 }
 845
 846 struct hstate *size_to_hstate(unsigned long size)
 847 {
 848         struct hstate *h;
 849
 850         for_each_hstate(h) {
 851                 if (huge_page_size(h) == size)
 852                         return h;
 853         }
 854         return NULL;
 855 }
 856
 857 void free_huge_page(struct page *page)
 858 {
 859         /*
 860          * Can't pass hstate in here because it is called from the
 861          * compound page destructor.
 862          */
 863         struct hstate *h = page_hstate(page);
 864         int nid = page_to_nid(page);
 865         struct hugepage_subpool *spool =
 866                 (struct hugepage_subpool *)page_private(page);
 867         bool restore_reserve;
 868
 869         set_page_private(page, 0);
 870         page->mapping = NULL;
 871         BUG_ON(page_count(page));
 872         BUG_ON(page_mapcount(page));
 873         restore_reserve = PagePrivate(page);
 874         ClearPagePrivate(page);
 875
 876         spin_lock(&hugetlb_lock);
 877         hugetlb_cgroup_uncharge_page(hstate_index(h),
 878                                      pages_per_huge_page(h), page);
 879         if (restore_reserve)
 880                 h->resv_huge_pages++;
 881
 882         if (h->surplus_huge_pages_node[nid]) {
 883                 /* remove the page from active list */
 884                 list_del(&page->lru);
 885                 update_and_free_page(h, page);
 886                 h->surplus_huge_pages--;
 887                 h->surplus_huge_pages_node[nid]--;
 888         } else {
 889                 arch_clear_hugepage_flags(page);
 890                 enqueue_huge_page(h, page);
 891         }
 892         spin_unlock(&hugetlb_lock);
 893         hugepage_subpool_put_pages(spool, 1);
 894 }
 895
 896 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 897 {
 898         INIT_LIST_HEAD(&page->lru);
 899         set_compound_page_dtor(page, free_huge_page);
 900         spin_lock(&hugetlb_lock);
 901         set_hugetlb_cgroup(page, NULL);
 902         h->nr_huge_pages++;
 903         h->nr_huge_pages_node[nid]++;
 904         spin_unlock(&hugetlb_lock);
 905         put_page(page); /* free it into the hugepage allocator */
 906 }
 907
 908 static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 909 {
 910         int i;
 911         int nr_pages = 1 << order;
 912         struct page *p = page + 1;
 913
 914         /* we rely on prep_new_huge_page to set the destructor */
 915         set_compound_order(page, order);
 916         __SetPageHead(page);
 917         __ClearPageReserved(page);
 918         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
 919                 /*
 920                  * For gigantic hugepages allocated through bootmem at
 921                  * boot, it's safer to be consistent with the not-gigantic
 922                  * hugepages and clear the PG_reserved bit from all tail pages
 923                  * too.  Otherwse drivers using get_user_pages() to access tail
 924                  * pages may get the reference counting wrong if they see
 925                  * PG_reserved set on a tail page (despite the head page not
 926                  * having PG_reserved set).  Enforcing this consistency between
 927                  * head and tail pages allows drivers to optimize away a check
 928                  * on the head page when they need know if put_page() is needed
 929                  * after get_user_pages().
 930                  */
 931                 __ClearPageReserved(p);
 932                 set_page_count(p, 0);
 933                 p->first_page = page;
 934                 /* Make sure p->first_page is always valid for PageTail() */
 935                 smp_wmb();
 936                 __SetPageTail(p);
 937         }
 938 }
 939
 940 /*
 941  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 942  * transparent huge pages.  See the PageTransHuge() documentation for more
 943  * details.
 944  */
 945 int PageHuge(struct page *page)
 946 {
 947         if (!PageCompound(page))
 948                 return 0;
 949
 950         page = compound_head(page);
 951         return get_compound_page_dtor(page) == free_huge_page;
 952 }
 953 EXPORT_SYMBOL_GPL(PageHuge);
 954
 955 /*
 956  * PageHeadHuge() only returns true for hugetlbfs head page, but not for
 957  * normal or transparent huge pages.
 958  */
 959 int PageHeadHuge(struct page *page_head)
 960 {
 961         if (!PageHead(page_head))
 962                 return 0;
 963
 964         return get_compound_page_dtor(page_head) == free_huge_page;
 965 }
 966
 967 pgoff_t __basepage_index(struct page *page)
 968 {
 969         struct page *page_head = compound_head(page);
 970         pgoff_t index = page_index(page_head);
 971         unsigned long compound_idx;
 972
 973         if (!PageHuge(page_head))
 974                 return page_index(page);
 975
 976         if (compound_order(page_head) >= MAX_ORDER)
 977                 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
 978         else
 979                 compound_idx = page - page_head;
 980
 981         return (index << compound_order(page_head)) + compound_idx;
 982 }
 983
 984 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 985 {
 986         struct page *page;
 987
 988         page = alloc_pages_exact_node(nid,
 989                 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
 990                                                 __GFP_REPEAT|__GFP_NOWARN,
 991                 huge_page_order(h));
 992         if (page) {
 993                 if (arch_prepare_hugepage(page)) {
 994                         __free_pages(page, huge_page_order(h));
 995                         return NULL;
 996                 }
 997                 prep_new_huge_page(h, page, nid);
 998         }
 999
1000         return page;
1001 }
1002
1003 static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
1004 {
1005         struct page *page;
1006         int nr_nodes, node;
1007         int ret = 0;
1008
1009         for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1010                 page = alloc_fresh_huge_page_node(h, node);
1011                 if (page) {
1012                         ret = 1;
1013                         break;
1014                 }
1015         }
1016
1017         if (ret)
1018                 count_vm_event(HTLB_BUDDY_PGALLOC);
1019         else
1020                 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1021
1022         return ret;
1023 }
1024
1025 /*
1026  * Free huge page from pool from next node to free.
1027  * Attempt to keep persistent huge pages more or less
1028  * balanced over allowed nodes.
1029  * Called with hugetlb_lock locked.
1030  */
1031 static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
1032                                                          bool acct_surplus)
1033 {
1034         int nr_nodes, node;
1035         int ret = 0;
1036
1037         for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1038                 /*
1039                  * If we're returning unused surplus pages, only examine
1040                  * nodes with surplus pages.
1041                  */
1042                 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
1043                     !list_empty(&h->hugepage_freelists[node])) {
1044                         struct page *page =
1045                                 list_entry(h->hugepage_freelists[node].next,
1046                                           struct page, lru);
1047                         list_del(&page->lru);
1048                         h->free_huge_pages--;
1049                         h->free_huge_pages_node[node]--;
1050                         if (acct_surplus) {
1051                                 h->surplus_huge_pages--;
1052                                 h->surplus_huge_pages_node[node]--;
1053                         }
1054                         update_and_free_page(h, page);
1055                         ret = 1;
1056                         break;
1057                 }
1058         }
1059
1060         return ret;
1061 }
1062
1063 /*
1064  * Dissolve a given free hugepage into free buddy pages. This function does
1065  * nothing for in-use (including surplus) hugepages.
1066  */
1067 static void dissolve_free_huge_page(struct page *page)
1068 {
1069         spin_lock(&hugetlb_lock);
1070         if (PageHuge(page) && !page_count(page)) {
1071                 struct hstate *h = page_hstate(page);
1072                 int nid = page_to_nid(page);
1073                 list_del(&page->lru);
1074                 h->free_huge_pages--;
1075                 h->free_huge_pages_node[nid]--;
1076                 update_and_free_page(h, page);
1077         }
1078         spin_unlock(&hugetlb_lock);
1079 }
1080
1081 /*
1082  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
1083  * make specified memory blocks removable from the system.
1084  * Note that start_pfn should aligned with (minimum) hugepage size.
1085  */
1086 void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1087 {
1088         unsigned int order = 8 * sizeof(void *);
1089         unsigned long pfn;
1090         struct hstate *h;
1091
1092         if (!hugepages_supported())
1093                 return;
1094
1095         /* Set scan step to minimum hugepage size */
1096         for_each_hstate(h)
1097                 if (order > huge_page_order(h))
1098                         order = huge_page_order(h);
1099         VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
1100         for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
1101                 dissolve_free_huge_page(pfn_to_page(pfn));
1102 }
1103
1104 static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
1105 {
1106         struct page *page;
1107         unsigned int r_nid;
1108
1109         if (hstate_is_gigantic(h))
1110                 return NULL;
1111
1112         /*
1113          * Assume we will successfully allocate the surplus page to
1114          * prevent racing processes from causing the surplus to exceed
1115          * overcommit
1116          *
1117          * This however introduces a different race, where a process B
1118          * tries to grow the static hugepage pool while alloc_pages() is
1119          * called by process A. B will only examine the per-node
1120          * counters in determining if surplus huge pages can be
1121          * converted to normal huge pages in adjust_pool_surplus(). A
1122          * won't be able to increment the per-node counter, until the
1123          * lock is dropped by B, but B doesn't drop hugetlb_lock until
1124          * no more huge pages can be converted from surplus to normal
1125          * state (and doesn't try to convert again). Thus, we have a
1126          * case where a surplus huge page exists, the pool is grown, and
1127          * the surplus huge page still exists after, even though it
1128          * should just have been converted to a normal huge page. This
1129          * does not leak memory, though, as the hugepage will be freed
1130          * once it is out of use. It also does not allow the counters to
1131          * go out of whack in adjust_pool_surplus() as we don't modify
1132          * the node values until we've gotten the hugepage and only the
1133          * per-node value is checked there.
1134          */
1135         spin_lock(&hugetlb_lock);
1136         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
1137                 spin_unlock(&hugetlb_lock);
1138                 return NULL;
1139         } else {
1140                 h->nr_huge_pages++;
1141                 h->surplus_huge_pages++;
1142         }
1143         spin_unlock(&hugetlb_lock);
1144
1145         if (nid == NUMA_NO_NODE)
1146                 page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
1147                                    __GFP_REPEAT|__GFP_NOWARN,
1148                                    huge_page_order(h));
1149         else
1150                 page = alloc_pages_exact_node(nid,
1151                         htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1152                         __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
1153
1154         if (page && arch_prepare_hugepage(page)) {
1155                 __free_pages(page, huge_page_order(h));
1156                 page = NULL;
1157         }
1158
1159         spin_lock(&hugetlb_lock);
1160         if (page) {
1161                 INIT_LIST_HEAD(&page->lru);
1162                 r_nid = page_to_nid(page);
1163                 set_compound_page_dtor(page, free_huge_page);
1164                 set_hugetlb_cgroup(page, NULL);
1165                 /*
1166                  * We incremented the global counters already
1167                  */
1168                 h->nr_huge_pages_node[r_nid]++;
1169                 h->surplus_huge_pages_node[r_nid]++;
1170                 __count_vm_event(HTLB_BUDDY_PGALLOC);
1171         } else {
1172                 h->nr_huge_pages--;
1173                 h->surplus_huge_pages--;
1174                 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1175         }
1176         spin_unlock(&hugetlb_lock);
1177
1178         return page;
1179 }
1180
1181 /*
1182  * This allocation function is useful in the context where vma is irrelevant.
1183  * E.g. soft-offlining uses this function because it only cares physical
1184  * address of error page.
1185  */
1186 struct page *alloc_huge_page_node(struct hstate *h, int nid)
1187 {
1188         struct page *page = NULL;
1189
1190         spin_lock(&hugetlb_lock);
1191         if (h->free_huge_pages - h->resv_huge_pages > 0)
1192                 page = dequeue_huge_page_node(h, nid);
1193         spin_unlock(&hugetlb_lock);
1194
1195         if (!page)
1196                 page = alloc_buddy_huge_page(h, nid);
1197
1198         return page;
1199 }
1200
1201 /*
1202  * Increase the hugetlb pool such that it can accommodate a reservation
1203  * of size 'delta'.
1204  */
1205 static int gather_surplus_pages(struct hstate *h, int delta)
1206 {
1207         struct list_head surplus_list;
1208         struct page *page, *tmp;
1209         int ret, i;
1210         int needed, allocated;
1211         bool alloc_ok = true;
1212
1213         needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
1214         if (needed <= 0) {
1215                 h->resv_huge_pages += delta;
1216                 return 0;
1217         }
1218
1219         allocated = 0;
1220         INIT_LIST_HEAD(&surplus_list);
1221
1222         ret = -ENOMEM;
1223 retry:
1224         spin_unlock(&hugetlb_lock);
1225         for (i = 0; i < needed; i++) {
1226                 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1227                 if (!page) {
1228                         alloc_ok = false;
1229                         break;
1230                 }
1231                 list_add(&page->lru, &surplus_list);
1232         }
1233         allocated += i;
1234
1235         /*
1236          * After retaking hugetlb_lock, we need to recalculate 'needed'
1237          * because either resv_huge_pages or free_huge_pages may have changed.
1238          */
1239         spin_lock(&hugetlb_lock);
1240         needed = (h->resv_huge_pages + delta) -
1241                         (h->free_huge_pages + allocated);
1242         if (needed > 0) {
1243                 if (alloc_ok)
1244                         goto retry;
1245                 /*
1246                  * We were not able to allocate enough pages to
1247                  * satisfy the entire reservation so we free what
1248                  * we've allocated so far.
1249                  */
1250                 goto free;
1251         }
1252         /*
1253          * The surplus_list now contains _at_least_ the number of extra pages
1254          * needed to accommodate the reservation.  Add the appropriate number
1255          * of pages to the hugetlb pool and free the extras back to the buddy
1256          * allocator.  Commit the entire reservation here to prevent another
1257          * process from stealing the pages as they are added to the pool but
1258          * before they are reserved.
1259          */
1260         needed += allocated;
1261         h->resv_huge_pages += delta;
1262         ret = 0;
1263
1264         /* Free the needed pages to the hugetlb pool */
1265         list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
1266                 if ((--needed) < 0)
1267                         break;
1268                 /*
1269                  * This page is now managed by the hugetlb allocator and has
1270                  * no users -- drop the buddy allocator's reference.
1271                  */
1272                 put_page_testzero(page);
1273                 VM_BUG_ON_PAGE(page_count(page), page);
1274                 enqueue_huge_page(h, page);
1275         }
1276 free:
1277         spin_unlock(&hugetlb_lock);
1278
1279         /* Free unnecessary surplus pages to the buddy allocator */
1280         list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1281                 put_page(page);
1282         spin_lock(&hugetlb_lock);
1283
1284         return ret;
1285 }
1286
1287 /*
1288  * When releasing a hugetlb pool reservation, any surplus pages that were
1289  * allocated to satisfy the reservation must be explicitly freed if they were
1290  * never used.
1291  * Called with hugetlb_lock held.
1292  */
1293 static void return_unused_surplus_pages(struct hstate *h,
1294                                         unsigned long unused_resv_pages)
1295 {
1296         unsigned long nr_pages;
1297
1298         /* Uncommit the reservation */
1299         h->resv_huge_pages -= unused_resv_pages;
1300
1301         /* Cannot return gigantic pages currently */
1302         if (hstate_is_gigantic(h))
1303                 return;
1304
1305         nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
1306
1307         /*
1308          * We want to release as many surplus pages as possible, spread
1309          * evenly across all nodes with memory. Iterate across these nodes
1310          * until we can no longer free unreserved surplus pages. This occurs
1311          * when the nodes with surplus pages have no free pages.
1312          * free_pool_huge_page() will balance the the freed pages across the
1313          * on-line nodes with memory and will handle the hstate accounting.
1314          */
1315         while (nr_pages--) {
1316                 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1317                         break;
1318                 cond_resched_lock(&hugetlb_lock);
1319         }
1320 }
1321
1322 /*
1323  * Determine if the huge page at addr within the vma has an associated
1324  * reservation.  Where it does not we will need to logically increase
1325  * reservation and actually increase subpool usage before an allocation
1326  * can occur.  Where any new reservation would be required the
1327  * reservation change is prepared, but not committed.  Once the page
1328  * has been allocated from the subpool and instantiated the change should
1329  * be committed via vma_commit_reservation.  No action is required on
1330  * failure.
1331  */
1332 static long vma_needs_reservation(struct hstate *h,
1333                         struct vm_area_struct *vma, unsigned long addr)
1334 {
1335         struct resv_map *resv;
1336         pgoff_t idx;
1337         long chg;
1338
1339         resv = vma_resv_map(vma);
1340         if (!resv)
1341                 return 1;
1342
1343         idx = vma_hugecache_offset(h, vma, addr);
1344         chg = region_chg(resv, idx, idx + 1);
1345
1346         if (vma->vm_flags & VM_MAYSHARE)
1347                 return chg;
1348         else
1349                 return chg < 0 ? chg : 0;
1350 }
1351 static void vma_commit_reservation(struct hstate *h,
1352                         struct vm_area_struct *vma, unsigned long addr)
1353 {
1354         struct resv_map *resv;
1355         pgoff_t idx;
1356
1357         resv = vma_resv_map(vma);
1358         if (!resv)
1359                 return;
1360
1361         idx = vma_hugecache_offset(h, vma, addr);
1362         region_add(resv, idx, idx + 1);
1363 }
1364
1365 static struct page *alloc_huge_page(struct vm_area_struct *vma,
1366                                     unsigned long addr, int avoid_reserve)
1367 {
1368         struct hugepage_subpool *spool = subpool_vma(vma);
1369         struct hstate *h = hstate_vma(vma);
1370         struct page *page;
1371         long chg;
1372         int ret, idx;
1373         struct hugetlb_cgroup *h_cg;
1374
1375         idx = hstate_index(h);
1376         /*
1377          * Processes that did not create the mapping will have no
1378          * reserves and will not have accounted against subpool
1379          * limit. Check that the subpool limit can be made before
1380          * satisfying the allocation MAP_NORESERVE mappings may also
1381          * need pages and subpool limit allocated allocated if no reserve
1382          * mapping overlaps.
1383          */
1384         chg = vma_needs_reservation(h, vma, addr);
1385         if (chg < 0)
1386                 return ERR_PTR(-ENOMEM);
1387         if (chg || avoid_reserve)
1388                 if (hugepage_subpool_get_pages(spool, 1))
1389                         return ERR_PTR(-ENOSPC);
1390
1391         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1392         if (ret)
1393                 goto out_subpool_put;
1394
1395         spin_lock(&hugetlb_lock);
1396         page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
1397         if (!page) {
1398                 spin_unlock(&hugetlb_lock);
1399                 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1400                 if (!page)
1401                         goto out_uncharge_cgroup;
1402
1403                 spin_lock(&hugetlb_lock);
1404                 list_move(&page->lru, &h->hugepage_activelist);
1405                 /* Fall through */
1406         }
1407         hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
1408         spin_unlock(&hugetlb_lock);
1409
1410         set_page_private(page, (unsigned long)spool);
1411
1412         vma_commit_reservation(h, vma, addr);
1413         return page;
1414
1415 out_uncharge_cgroup:
1416         hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
1417 out_subpool_put:
1418         if (chg || avoid_reserve)
1419                 hugepage_subpool_put_pages(spool, 1);
1420         return ERR_PTR(-ENOSPC);
1421 }
1422
1423 /*
1424  * alloc_huge_page()'s wrapper which simply returns the page if allocation
1425  * succeeds, otherwise NULL. This function is called from new_vma_page(),
1426  * where no ERR_VALUE is expected to be returned.
1427  */
1428 struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
1429                                 unsigned long addr, int avoid_reserve)
1430 {
1431         struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
1432         if (IS_ERR(page))
1433                 page = NULL;
1434         return page;
1435 }
1436
1437 int __weak alloc_bootmem_huge_page(struct hstate *h)
1438 {
1439         struct huge_bootmem_page *m;
1440         int nr_nodes, node;
1441
1442         for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1443                 void *addr;
1444
1445                 addr = memblock_virt_alloc_try_nid_nopanic(
1446                                 huge_page_size(h), huge_page_size(h),
1447                                 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
1448                 if (addr) {
1449                         /*
1450                          * Use the beginning of the huge page to store the
1451                          * huge_bootmem_page struct (until gather_bootmem
1452                          * puts them into the mem_map).
1453                          */
1454                         m = addr;
1455                         goto found;
1456                 }
1457         }
1458         return 0;
1459
1460 found:
1461         BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
1462         /* Put them into a private list first because mem_map is not up yet */
1463         list_add(&m->list, &huge_boot_pages);
1464         m->hstate = h;
1465         return 1;
1466 }
1467
1468 static void __init prep_compound_huge_page(struct page *page, int order)
1469 {
1470         if (unlikely(order > (MAX_ORDER - 1)))
1471                 prep_compound_gigantic_page(page, order);
1472         else
1473                 prep_compound_page(page, order);
1474 }
1475
1476 /* Put bootmem huge pages into the standard lists after mem_map is up */
1477 static void __init gather_bootmem_prealloc(void)
1478 {
1479         struct huge_bootmem_page *m;
1480
1481         list_for_each_entry(m, &huge_boot_pages, list) {
1482                 struct hstate *h = m->hstate;
1483                 struct page *page;
1484
1485 #ifdef CONFIG_HIGHMEM
1486                 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1487                 memblock_free_late(__pa(m),
1488                                    sizeof(struct huge_bootmem_page));
1489 #else
1490                 page = virt_to_page(m);
1491 #endif
1492                 WARN_ON(page_count(page) != 1);
1493                 prep_compound_huge_page(page, h->order);
1494                 WARN_ON(PageReserved(page));
1495                 prep_new_huge_page(h, page, page_to_nid(page));
1496                 /*
1497                  * If we had gigantic hugepages allocated at boot time, we need
1498                  * to restore the 'stolen' pages to totalram_pages in order to
1499                  * fix confusing memory reports from free(1) and another
1500                  * side-effects, like CommitLimit going negative.
1501                  */
1502                 if (hstate_is_gigantic(h))
1503                         adjust_managed_page_count(page, 1 << h->order);
1504         }
1505 }
1506
1507 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1508 {
1509         unsigned long i;
1510
1511         for (i = 0; i < h->max_huge_pages; ++i) {
1512                 if (hstate_is_gigantic(h)) {
1513                         if (!alloc_bootmem_huge_page(h))
1514                                 break;
1515                 } else if (!alloc_fresh_huge_page(h,
1516                                          &node_states[N_MEMORY]))
1517                         break;
1518         }
1519         h->max_huge_pages = i;
1520 }
1521
1522 static void __init hugetlb_init_hstates(void)
1523 {
1524         struct hstate *h;
1525
1526         for_each_hstate(h) {
1527                 /* oversize hugepages were init'ed in early boot */
1528                 if (!hstate_is_gigantic(h))
1529                         hugetlb_hstate_alloc_pages(h);
1530         }
1531 }
1532
1533 static char * __init memfmt(char *buf, unsigned long n)
1534 {
1535         if (n >= (1UL << 30))
1536                 sprintf(buf, "%lu GB", n >> 30);
1537         else if (n >= (1UL << 20))
1538                 sprintf(buf, "%lu MB", n >> 20);
1539         else
1540                 sprintf(buf, "%lu KB", n >> 10);
1541         return buf;
1542 }
1543
1544 static void __init report_hugepages(void)
1545 {
1546         struct hstate *h;
1547
1548         for_each_hstate(h) {
1549                 char buf[32];
1550                 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
1551                         memfmt(buf, huge_page_size(h)),
1552                         h->free_huge_pages);
1553         }
1554 }
1555
1556 #ifdef CONFIG_HIGHMEM
1557 static void try_to_free_low(struct hstate *h, unsigned long count,
1558                                                 nodemask_t *nodes_allowed)
1559 {
1560         int i;
1561
1562         if (hstate_is_gigantic(h))
1563                 return;
1564
1565         for_each_node_mask(i, *nodes_allowed) {
1566                 struct page *page, *next;
1567                 struct list_head *freel = &h->hugepage_freelists[i];
1568                 list_for_each_entry_safe(page, next, freel, lru) {
1569                         if (count >= h->nr_huge_pages)
1570                                 return;
1571                         if (PageHighMem(page))
1572                                 continue;
1573                         list_del(&page->lru);
1574                         update_and_free_page(h, page);
1575                         h->free_huge_pages--;
1576                         h->free_huge_pages_node[page_to_nid(page)]--;
1577                 }
1578         }
1579 }
1580 #else
1581 static inline void try_to_free_low(struct hstate *h, unsigned long count,
1582                                                 nodemask_t *nodes_allowed)
1583 {
1584 }
1585 #endif
1586
1587 /*
1588  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
1589  * balanced by operating on them in a round-robin fashion.
1590  * Returns 1 if an adjustment was made.
1591  */
1592 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1593                                 int delta)
1594 {
1595         int nr_nodes, node;
1596
1597         VM_BUG_ON(delta != -1 && delta != 1);
1598
1599         if (delta < 0) {
1600                 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1601                         if (h->surplus_huge_pages_node[node])
1602                                 goto found;
1603                 }
1604         } else {
1605                 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1606                         if (h->surplus_huge_pages_node[node] <
1607                                         h->nr_huge_pages_node[node])
1608                                 goto found;
1609                 }
1610         }
1611         return 0;
1612
1613 found:
1614         h->surplus_huge_pages += delta;
1615         h->surplus_huge_pages_node[node] += delta;
1616         return 1;
1617 }
1618
1619 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1620 static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1621                                                 nodemask_t *nodes_allowed)
1622 {
1623         unsigned long min_count, ret;
1624
1625         if (hstate_is_gigantic(h) && !gigantic_page_supported())
1626                 return h->max_huge_pages;
1627
1628         /*
1629          * Increase the pool size
1630          * First take pages out of surplus state.  Then make up the
1631          * remaining difference by allocating fresh huge pages.
1632          *
1633          * We might race with alloc_buddy_huge_page() here and be unable
1634          * to convert a surplus huge page to a normal huge page. That is
1635          * not critical, though, it just means the overall size of the
1636          * pool might be one hugepage larger than it needs to be, but
1637          * within all the constraints specified by the sysctls.
1638          */
1639         spin_lock(&hugetlb_lock);
1640         while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1641                 if (!adjust_pool_surplus(h, nodes_allowed, -1))
1642                         break;
1643         }
1644
1645         while (count > persistent_huge_pages(h)) {
1646                 /*
1647                  * If this allocation races such that we no longer need the
1648                  * page, free_huge_page will handle it by freeing the page
1649                  * and reducing the surplus.
1650                  */
1651                 spin_unlock(&hugetlb_lock);
1652                 if (hstate_is_gigantic(h))
1653                         ret = alloc_fresh_gigantic_page(h, nodes_allowed);
1654                 else
1655                         ret = alloc_fresh_huge_page(h, nodes_allowed);
1656                 spin_lock(&hugetlb_lock);
1657                 if (!ret)
1658                         goto out;
1659
1660                 /* Bail for signals. Probably ctrl-c from user */
1661                 if (signal_pending(current))
1662                         goto out;
1663         }
1664
1665         /*
1666          * Decrease the pool size
1667          * First return free pages to the buddy allocator (being careful
1668          * to keep enough around to satisfy reservations).  Then place
1669          * pages into surplus state as needed so the pool will shrink
1670          * to the desired size as pages become free.
1671          *
1672          * By placing pages into the surplus state independent of the
1673          * overcommit value, we are allowing the surplus pool size to
1674          * exceed overcommit. There are few sane options here. Since
1675          * alloc_buddy_huge_page() is checking the global counter,
1676          * though, we'll note that we're not allowed to exceed surplus
1677          * and won't grow the pool anywhere else. Not until one of the
1678          * sysctls are changed, or the surplus pages go out of use.
1679          */
1680         min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1681         min_count = max(count, min_count);
1682         try_to_free_low(h, min_count, nodes_allowed);
1683         while (min_count < persistent_huge_pages(h)) {
1684                 if (!free_pool_huge_page(h, nodes_allowed, 0))
1685                         break;
1686                 cond_resched_lock(&hugetlb_lock);
1687         }
1688         while (count < persistent_huge_pages(h)) {
1689                 if (!adjust_pool_surplus(h, nodes_allowed, 1))
1690                         break;
1691         }
1692 out:
1693         ret = persistent_huge_pages(h);
1694         spin_unlock(&hugetlb_lock);
1695         return ret;
1696 }
1697
1698 #define HSTATE_ATTR_RO(_name) \
1699         static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1700
1701 #define HSTATE_ATTR(_name) \
1702         static struct kobj_attribute _name##_attr = \
1703                 __ATTR(_name, 0644, _name##_show, _name##_store)
1704
1705 static struct kobject *hugepages_kobj;
1706 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1707
1708 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
1709
1710 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1711 {
1712         int i;
1713
1714         for (i = 0; i < HUGE_MAX_HSTATE; i++)
1715                 if (hstate_kobjs[i] == kobj) {
1716                         if (nidp)
1717                                 *nidp = NUMA_NO_NODE;
1718                         return &hstates[i];
1719                 }
1720
1721         return kobj_to_node_hstate(kobj, nidp);
1722 }
1723
1724 static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1725                                         struct kobj_attribute *attr, char *buf)
1726 {
1727         struct hstate *h;
1728         unsigned long nr_huge_pages;
1729         int nid;
1730
1731         h = kobj_to_hstate(kobj, &nid);
1732         if (nid == NUMA_NO_NODE)
1733                 nr_huge_pages = h->nr_huge_pages;
1734         else
1735                 nr_huge_pages = h->nr_huge_pages_node[nid];
1736
1737         return sprintf(buf, "%lu\n", nr_huge_pages);
1738 }
1739
1740 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
1741                                            struct hstate *h, int nid,
1742                                            unsigned long count, size_t len)
1743 {
1744         int err;
1745         NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1746
1747         if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
1748                 err = -EINVAL;
1749                 goto out;
1750         }
1751
1752         if (nid == NUMA_NO_NODE) {
1753                 /*
1754                  * global hstate attribute
1755                  */
1756                 if (!(obey_mempolicy &&
1757                                 init_nodemask_of_mempolicy(nodes_allowed))) {
1758                         NODEMASK_FREE(nodes_allowed);
1759                         nodes_allowed = &node_states[N_MEMORY];
1760                 }
1761         } else if (nodes_allowed) {
1762                 /*
1763                  * per node hstate attribute: adjust count to global,
1764                  * but restrict alloc/free to the specified node.
1765                  */
1766                 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1767                 init_nodemask_of_node(nodes_allowed, nid);
1768         } else
1769                 nodes_allowed = &node_states[N_MEMORY];
1770
1771         h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1772
1773         if (nodes_allowed != &node_states[N_MEMORY])
1774                 NODEMASK_FREE(nodes_allowed);
1775
1776         return len;
1777 out:
1778         NODEMASK_FREE(nodes_allowed);
1779         return err;
1780 }
1781
1782 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1783                                          struct kobject *kobj, const char *buf,
1784                                          size_t len)
1785 {
1786         struct hstate *h;
1787         unsigned long count;
1788         int nid;
1789         int err;
1790
1791         err = kstrtoul(buf, 10, &count);
1792         if (err)
1793                 return err;
1794
1795         h = kobj_to_hstate(kobj, &nid);
1796         return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
1797 }
1798
1799 static ssize_t nr_hugepages_show(struct kobject *kobj,
1800                                        struct kobj_attribute *attr, char *buf)
1801 {
1802         return nr_hugepages_show_common(kobj, attr, buf);
1803 }
1804
1805 static ssize_t nr_hugepages_store(struct kobject *kobj,
1806                struct kobj_attribute *attr, const char *buf, size_t len)
1807 {
1808         return nr_hugepages_store_common(false, kobj, buf, len);
1809 }
1810 HSTATE_ATTR(nr_hugepages);
1811
1812 #ifdef CONFIG_NUMA
1813
1814 /*
1815  * hstate attribute for optionally mempolicy-based constraint on persistent
1816  * huge page alloc/free.
1817  */
1818 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1819                                        struct kobj_attribute *attr, char *buf)
1820 {
1821         return nr_hugepages_show_common(kobj, attr, buf);
1822 }
1823
1824 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1825                struct kobj_attribute *attr, const char *buf, size_t len)
1826 {
1827         return nr_hugepages_store_common(true, kobj, buf, len);
1828 }
1829 HSTATE_ATTR(nr_hugepages_mempolicy);
1830 #endif
1831
1832
1833 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1834                                         struct kobj_attribute *attr, char *buf)
1835 {
1836         struct hstate *h = kobj_to_hstate(kobj, NULL);
1837         return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1838 }
1839
1840 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1841                 struct kobj_attribute *attr, const char *buf, size_t count)
1842 {
1843         int err;
1844         unsigned long input;
1845         struct hstate *h = kobj_to_hstate(kobj, NULL);
1846
1847         if (hstate_is_gigantic(h))
1848                 return -EINVAL;
1849
1850         err = kstrtoul(buf, 10, &input);
1851         if (err)
1852                 return err;
1853
1854         spin_lock(&hugetlb_lock);
1855         h->nr_overcommit_huge_pages = input;
1856         spin_unlock(&hugetlb_lock);
1857
1858         return count;
1859 }
1860 HSTATE_ATTR(nr_overcommit_hugepages);
1861
1862 static ssize_t free_hugepages_show(struct kobject *kobj,
1863                                         struct kobj_attribute *attr, char *buf)
1864 {
1865         struct hstate *h;
1866         unsigned long free_huge_pages;
1867         int nid;
1868
1869         h = kobj_to_hstate(kobj, &nid);
1870         if (nid == NUMA_NO_NODE)
1871                 free_huge_pages = h->free_huge_pages;
1872         else
1873                 free_huge_pages = h->free_huge_pages_node[nid];
1874
1875         return sprintf(buf, "%lu\n", free_huge_pages);
1876 }
1877 HSTATE_ATTR_RO(free_hugepages);
1878
1879 static ssize_t resv_hugepages_show(struct kobject *kobj,
1880                                         struct kobj_attribute *attr, char *buf)
1881 {
1882         struct hstate *h = kobj_to_hstate(kobj, NULL);
1883         return sprintf(buf, "%lu\n", h->resv_huge_pages);
1884 }
1885 HSTATE_ATTR_RO(resv_hugepages);
1886
1887 static ssize_t surplus_hugepages_show(struct kobject *kobj,
1888                                         struct kobj_attribute *attr, char *buf)
1889 {
1890         struct hstate *h;
1891         unsigned long surplus_huge_pages;
1892         int nid;
1893
1894         h = kobj_to_hstate(kobj, &nid);
1895         if (nid == NUMA_NO_NODE)
1896                 surplus_huge_pages = h->surplus_huge_pages;
1897         else
1898                 surplus_huge_pages = h->surplus_huge_pages_node[nid];
1899
1900         return sprintf(buf, "%lu\n", surplus_huge_pages);
1901 }
1902 HSTATE_ATTR_RO(surplus_hugepages);
1903
1904 static struct attribute *hstate_attrs[] = {
1905         &nr_hugepages_attr.attr,
1906         &nr_overcommit_hugepages_attr.attr,
1907         &free_hugepages_attr.attr,
1908         &resv_hugepages_attr.attr,
1909         &surplus_hugepages_attr.attr,
1910 #ifdef CONFIG_NUMA
1911         &nr_hugepages_mempolicy_attr.attr,
1912 #endif
1913         NULL,
1914 };
1915
1916 static struct attribute_group hstate_attr_group = {
1917         .attrs = hstate_attrs,
1918 };
1919
1920 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1921                                     struct kobject **hstate_kobjs,
1922                                     struct attribute_group *hstate_attr_group)
1923 {
1924         int retval;
1925         int hi = hstate_index(h);
1926
1927         hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1928         if (!hstate_kobjs[hi])
1929                 return -ENOMEM;
1930
1931         retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1932         if (retval)
1933                 kobject_put(hstate_kobjs[hi]);
1934
1935         return retval;
1936 }
1937
1938 static void __init hugetlb_sysfs_init(void)
1939 {
1940         struct hstate *h;
1941         int err;
1942
1943         hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1944         if (!hugepages_kobj)
1945                 return;
1946
1947         for_each_hstate(h) {
1948                 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1949                                          hstate_kobjs, &hstate_attr_group);
1950                 if (err)
1951                         pr_err("Hugetlb: Unable to add hstate %s", h->name);
1952         }
1953 }
1954
1955 #ifdef CONFIG_NUMA
1956
1957 /*
1958  * node_hstate/s - associate per node hstate attributes, via their kobjects,
1959  * with node devices in node_devices[] using a parallel array.  The array
1960  * index of a node device or _hstate == node id.
1961  * This is here to avoid any static dependency of the node device driver, in
1962  * the base kernel, on the hugetlb module.
1963  */
1964 struct node_hstate {
1965         struct kobject          *hugepages_kobj;
1966         struct kobject          *hstate_kobjs[HUGE_MAX_HSTATE];
1967 };
1968 struct node_hstate node_hstates[MAX_NUMNODES];
1969
1970 /*
1971  * A subset of global hstate attributes for node devices
1972  */
1973 static struct attribute *per_node_hstate_attrs[] = {
1974         &nr_hugepages_attr.attr,
1975         &free_hugepages_attr.attr,
1976         &surplus_hugepages_attr.attr,
1977         NULL,
1978 };
1979
1980 static struct attribute_group per_node_hstate_attr_group = {
1981         .attrs = per_node_hstate_attrs,
1982 };
1983
1984 /*
1985  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
1986  * Returns node id via non-NULL nidp.
1987  */
1988 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1989 {
1990         int nid;
1991
1992         for (nid = 0; nid < nr_node_ids; nid++) {
1993                 struct node_hstate *nhs = &node_hstates[nid];
1994                 int i;
1995                 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1996                         if (nhs->hstate_kobjs[i] == kobj) {
1997                                 if (nidp)
1998                                         *nidp = nid;
1999                                 return &hstates[i];
2000                         }
2001         }
2002
2003         BUG();
2004         return NULL;
2005 }
2006
2007 /*
2008  * Unregister hstate attributes from a single node device.
2009  * No-op if no hstate attributes attached.
2010  */
2011 static void hugetlb_unregister_node(struct node *node)
2012 {
2013         struct hstate *h;
2014         struct node_hstate *nhs = &node_hstates[node->dev.id];
2015
2016         if (!nhs->hugepages_kobj)
2017                 return;         /* no hstate attributes */
2018
2019         for_each_hstate(h) {
2020                 int idx = hstate_index(h);
2021                 if (nhs->hstate_kobjs[idx]) {
2022                         kobject_put(nhs->hstate_kobjs[idx]);
2023                         nhs->hstate_kobjs[idx] = NULL;
2024                 }
2025         }
2026
2027         kobject_put(nhs->hugepages_kobj);
2028         nhs->hugepages_kobj = NULL;
2029 }
2030
2031 /*
2032  * hugetlb module exit:  unregister hstate attributes from node devices
2033  * that have them.
2034  */
2035 static void hugetlb_unregister_all_nodes(void)
2036 {
2037         int nid;
2038
2039         /*
2040          * disable node device registrations.
2041          */
2042         register_hugetlbfs_with_node(NULL, NULL);
2043
2044         /*
2045          * remove hstate attributes from any nodes that have them.
2046          */
2047         for (nid = 0; nid < nr_node_ids; nid++)
2048                 hugetlb_unregister_node(node_devices[nid]);
2049 }
2050
2051 /*
2052  * Register hstate attributes for a single node device.
2053  * No-op if attributes already registered.
2054  */
2055 static void hugetlb_register_node(struct node *node)
2056 {
2057         struct hstate *h;
2058         struct node_hstate *nhs = &node_hstates[node->dev.id];
2059         int err;
2060
2061         if (nhs->hugepages_kobj)
2062                 return;         /* already allocated */
2063
2064         nhs->hugepages_kobj = kobject_create_and_add("hugepages",
2065                                                         &node->dev.kobj);
2066         if (!nhs->hugepages_kobj)
2067                 return;
2068
2069         for_each_hstate(h) {
2070                 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
2071                                                 nhs->hstate_kobjs,
2072                                                 &per_node_hstate_attr_group);
2073                 if (err) {
2074                         pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
2075                                 h->name, node->dev.id);
2076                         hugetlb_unregister_node(node);
2077                         break;
2078                 }
2079         }
2080 }
2081
2082 /*
2083  * hugetlb init time:  register hstate attributes for all registered node
2084  * devices of nodes that have memory.  All on-line nodes should have
2085  * registered their associated device by this time.
2086  */
2087 static void __init hugetlb_register_all_nodes(void)
2088 {
2089         int nid;
2090
2091         for_each_node_state(nid, N_MEMORY) {
2092                 struct node *node = node_devices[nid];
2093                 if (node->dev.id == nid)
2094                         hugetlb_register_node(node);
2095         }
2096
2097         /*
2098          * Let the node device driver know we're here so it can
2099          * [un]register hstate attributes on node hotplug.
2100          */
2101         register_hugetlbfs_with_node(hugetlb_register_node,
2102                                      hugetlb_unregister_node);
2103 }
2104 #else   /* !CONFIG_NUMA */
2105
2106 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
2107 {
2108         BUG();
2109         if (nidp)
2110                 *nidp = -1;
2111         return NULL;
2112 }
2113
2114 static void hugetlb_unregister_all_nodes(void) { }
2115
2116 static void hugetlb_register_all_nodes(void) { }
2117
2118 #endif
2119
2120 static void __exit hugetlb_exit(void)
2121 {
2122         struct hstate *h;
2123
2124         hugetlb_unregister_all_nodes();
2125
2126         for_each_hstate(h) {
2127                 kobject_put(hstate_kobjs[hstate_index(h)]);
2128         }
2129
2130         kobject_put(hugepages_kobj);
2131         kfree(htlb_fault_mutex_table);
2132 }
2133 module_exit(hugetlb_exit);
2134
2135 static int __init hugetlb_init(void)
2136 {
2137         int i;
2138
2139         if (!hugepages_supported())
2140                 return 0;
2141
2142         if (!size_to_hstate(default_hstate_size)) {
2143                 default_hstate_size = HPAGE_SIZE;
2144                 if (!size_to_hstate(default_hstate_size))
2145                         hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
2146         }
2147         default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
2148         if (default_hstate_max_huge_pages)
2149                 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
2150
2151         hugetlb_init_hstates();
2152         gather_bootmem_prealloc();
2153         report_hugepages();
2154
2155         hugetlb_sysfs_init();
2156         hugetlb_register_all_nodes();
2157         hugetlb_cgroup_file_init();
2158
2159 #ifdef CONFIG_SMP
2160         num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
2161 #else
2162         num_fault_mutexes = 1;
2163 #endif
2164         htlb_fault_mutex_table =
2165                 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
2166         BUG_ON(!htlb_fault_mutex_table);
2167
2168         for (i = 0; i < num_fault_mutexes; i++)
2169                 mutex_init(&htlb_fault_mutex_table[i]);
2170         return 0;
2171 }
2172 module_init(hugetlb_init);
2173
2174 /* Should be called on processing a hugepagesz=... option */
2175 void __init hugetlb_add_hstate(unsigned order)
2176 {
2177         struct hstate *h;
2178         unsigned long i;
2179
2180         if (size_to_hstate(PAGE_SIZE << order)) {
2181                 pr_warning("hugepagesz= specified twice, ignoring\n");
2182                 return;
2183         }
2184         BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
2185         BUG_ON(order == 0);
2186         h = &hstates[hugetlb_max_hstate++];
2187         h->order = order;
2188         h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
2189         h->nr_huge_pages = 0;
2190         h->free_huge_pages = 0;
2191         for (i = 0; i < MAX_NUMNODES; ++i)
2192                 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
2193         INIT_LIST_HEAD(&h->hugepage_activelist);
2194         h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
2195         h->next_nid_to_free = first_node(node_states[N_MEMORY]);
2196         snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
2197                                         huge_page_size(h)/1024);
2198
2199         parsed_hstate = h;
2200 }
2201
2202 static int __init hugetlb_nrpages_setup(char *s)
2203 {
2204         unsigned long *mhp;
2205         static unsigned long *last_mhp;
2206
2207         /*
2208          * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
2209          * so this hugepages= parameter goes to the "default hstate".
2210          */
2211         if (!hugetlb_max_hstate)
2212                 mhp = &default_hstate_max_huge_pages;
2213         else
2214                 mhp = &parsed_hstate->max_huge_pages;
2215
2216         if (mhp == last_mhp) {
2217                 pr_warning("hugepages= specified twice without "
2218                            "interleaving hugepagesz=, ignoring\n");
2219                 return 1;
2220         }
2221
2222         if (sscanf(s, "%lu", mhp) <= 0)
2223                 *mhp = 0;
2224
2225         /*
2226          * Global state is always initialized later in hugetlb_init.
2227          * But we need to allocate >= MAX_ORDER hstates here early to still
2228          * use the bootmem allocator.
2229          */
2230         if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
2231                 hugetlb_hstate_alloc_pages(parsed_hstate);
2232
2233         last_mhp = mhp;
2234
2235         return 1;
2236 }
2237 __setup("hugepages=", hugetlb_nrpages_setup);
2238
2239 static int __init hugetlb_default_setup(char *s)
2240 {
2241         default_hstate_size = memparse(s, &s);
2242         return 1;
2243 }
2244 __setup("default_hugepagesz=", hugetlb_default_setup);
2245
2246 static unsigned int cpuset_mems_nr(unsigned int *array)
2247 {
2248         int node;
2249         unsigned int nr = 0;
2250
2251         for_each_node_mask(node, cpuset_current_mems_allowed)
2252                 nr += array[node];
2253
2254         return nr;
2255 }
2256
2257 #ifdef CONFIG_SYSCTL
2258 static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2259                          struct ctl_table *table, int write,
2260                          void __user *buffer, size_t *length, loff_t *ppos)
2261 {
2262         struct hstate *h = &default_hstate;
2263         unsigned long tmp = h->max_huge_pages;
2264         int ret;
2265
2266         if (!hugepages_supported())
2267                 return -ENOTSUPP;
2268
2269         table->data = &tmp;
2270         table->maxlen = sizeof(unsigned long);
2271         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2272         if (ret)
2273                 goto out;
2274
2275         if (write)
2276                 ret = __nr_hugepages_store_common(obey_mempolicy, h,
2277                                                   NUMA_NO_NODE, tmp, *length);
2278 out:
2279         return ret;
2280 }
2281
2282 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
2283                           void __user *buffer, size_t *length, loff_t *ppos)
2284 {
2285
2286         return hugetlb_sysctl_handler_common(false, table, write,
2287                                                         buffer, length, ppos);
2288 }
2289
2290 #ifdef CONFIG_NUMA
2291 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
2292                           void __user *buffer, size_t *length, loff_t *ppos)
2293 {
2294         return hugetlb_sysctl_handler_common(true, table, write,
2295                                                         buffer, length, ppos);
2296 }
2297 #endif /* CONFIG_NUMA */
2298
2299 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2300                         void __user *buffer,
2301                         size_t *length, loff_t *ppos)
2302 {
2303         struct hstate *h = &default_hstate;
2304         unsigned long tmp;
2305         int ret;
2306
2307         if (!hugepages_supported())
2308                 return -ENOTSUPP;
2309
2310         tmp = h->nr_overcommit_huge_pages;
2311
2312         if (write && hstate_is_gigantic(h))
2313                 return -EINVAL;
2314
2315         table->data = &tmp;
2316         table->maxlen = sizeof(unsigned long);
2317         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2318         if (ret)
2319                 goto out;
2320
2321         if (write) {
2322                 spin_lock(&hugetlb_lock);
2323                 h->nr_overcommit_huge_pages = tmp;
2324                 spin_unlock(&hugetlb_lock);
2325         }
2326 out:
2327         return ret;
2328 }
2329
2330 #endif /* CONFIG_SYSCTL */
2331
2332 void hugetlb_report_meminfo(struct seq_file *m)
2333 {
2334         struct hstate *h = &default_hstate;
2335         if (!hugepages_supported())
2336                 return;
2337         seq_printf(m,
2338                         "HugePages_Total:   %5lu\n"
2339                         "HugePages_Free:    %5lu\n"
2340                         "HugePages_Rsvd:    %5lu\n"
2341                         "HugePages_Surp:    %5lu\n"
2342                         "Hugepagesize:   %8lu kB\n",
2343                         h->nr_huge_pages,
2344                         h->free_huge_pages,
2345                         h->resv_huge_pages,
2346                         h->surplus_huge_pages,
2347                         1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2348 }
2349
2350 int hugetlb_report_node_meminfo(int nid, char *buf)
2351 {
2352         struct hstate *h = &default_hstate;
2353         if (!hugepages_supported())
2354                 return 0;
2355         return sprintf(buf,
2356                 "Node %d HugePages_Total: %5u\n"
2357                 "Node %d HugePages_Free:  %5u\n"
2358                 "Node %d HugePages_Surp:  %5u\n",
2359                 nid, h->nr_huge_pages_node[nid],
2360                 nid, h->free_huge_pages_node[nid],
2361                 nid, h->surplus_huge_pages_node[nid]);
2362 }
2363
2364 void hugetlb_show_meminfo(void)
2365 {
2366         struct hstate *h;
2367         int nid;
2368
2369         if (!hugepages_supported())
2370                 return;
2371
2372         for_each_node_state(nid, N_MEMORY)
2373                 for_each_hstate(h)
2374                         pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
2375                                 nid,
2376                                 h->nr_huge_pages_node[nid],
2377                                 h->free_huge_pages_node[nid],
2378                                 h->surplus_huge_pages_node[nid],
2379                                 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2380 }
2381
2382 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
2383 unsigned long hugetlb_total_pages(void)
2384 {
2385         struct hstate *h;
2386         unsigned long nr_total_pages = 0;
2387
2388         for_each_hstate(h)
2389                 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2390         return nr_total_pages;
2391 }
2392
2393 static int hugetlb_acct_memory(struct hstate *h, long delta)
2394 {
2395         int ret = -ENOMEM;
2396
2397         spin_lock(&hugetlb_lock);
2398         /*
2399          * When cpuset is configured, it breaks the strict hugetlb page
2400          * reservation as the accounting is done on a global variable. Such
2401          * reservation is completely rubbish in the presence of cpuset because
2402          * the reservation is not checked against page availability for the
2403          * current cpuset. Application can still potentially OOM'ed by kernel
2404          * with lack of free htlb page in cpuset that the task is in.
2405          * Attempt to enforce strict accounting with cpuset is almost
2406          * impossible (or too ugly) because cpuset is too fluid that
2407          * task or memory node can be dynamically moved between cpusets.
2408          *
2409          * The change of semantics for shared hugetlb mapping with cpuset is
2410          * undesirable. However, in order to preserve some of the semantics,
2411          * we fall back to check against current free page availability as
2412          * a best attempt and hopefully to minimize the impact of changing
2413          * semantics that cpuset has.
2414          */
2415         if (delta > 0) {
2416                 if (gather_surplus_pages(h, delta) < 0)
2417                         goto out;
2418
2419                 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
2420                         return_unused_surplus_pages(h, delta);
2421                         goto out;
2422                 }
2423         }
2424
2425         ret = 0;
2426         if (delta < 0)
2427                 return_unused_surplus_pages(h, (unsigned long) -delta);
2428
2429 out:
2430         spin_unlock(&hugetlb_lock);
2431         return ret;
2432 }
2433
2434 static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2435 {
2436         struct resv_map *resv = vma_resv_map(vma);
2437
2438         /*
2439          * This new VMA should share its siblings reservation map if present.
2440          * The VMA will only ever have a valid reservation map pointer where
2441          * it is being copied for another still existing VMA.  As that VMA
2442          * has a reference to the reservation map it cannot disappear until
2443          * after this open call completes.  It is therefore safe to take a
2444          * new reference here without additional locking.
2445          */
2446         if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2447                 kref_get(&resv->refs);
2448 }
2449
2450 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2451 {
2452         struct hstate *h = hstate_vma(vma);
2453         struct resv_map *resv = vma_resv_map(vma);
2454         struct hugepage_subpool *spool = subpool_vma(vma);
2455         unsigned long reserve, start, end;
2456
2457         if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2458                 return;
2459
2460         start = vma_hugecache_offset(h, vma, vma->vm_start);
2461         end = vma_hugecache_offset(h, vma, vma->vm_end);
2462
2463         reserve = (end - start) - region_count(resv, start, end);
2464
2465         kref_put(&resv->refs, resv_map_release);
2466
2467         if (reserve) {
2468                 hugetlb_acct_memory(h, -reserve);
2469                 hugepage_subpool_put_pages(spool, reserve);
2470         }
2471 }
2472
2473 /*
2474  * We cannot handle pagefaults against hugetlb pages at all.  They cause
2475  * handle_mm_fault() to try to instantiate regular-sized pages in the
2476  * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
2477  * this far.
2478  */
2479 static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2480 {
2481         BUG();
2482         return 0;
2483 }
2484
2485 const struct vm_operations_struct hugetlb_vm_ops = {
2486         .fault = hugetlb_vm_op_fault,
2487         .open = hugetlb_vm_op_open,
2488         .close = hugetlb_vm_op_close,
2489 };
2490
2491 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2492                                 int writable)
2493 {
2494         pte_t entry;
2495
2496         if (writable) {
2497                 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
2498                                          vma->vm_page_prot)));
2499         } else {
2500                 entry = huge_pte_wrprotect(mk_huge_pte(page,
2501                                            vma->vm_page_prot));
2502         }
2503         entry = pte_mkyoung(entry);
2504         entry = pte_mkhuge(entry);
2505         entry = arch_make_huge_pte(entry, vma, page, writable);
2506
2507         return entry;
2508 }
2509
2510 static void set_huge_ptep_writable(struct vm_area_struct *vma,
2511                                    unsigned long address, pte_t *ptep)
2512 {
2513         pte_t entry;
2514
2515         entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
2516         if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2517                 update_mmu_cache(vma, address, ptep);
2518 }
2519
2520 static int is_hugetlb_entry_migration(pte_t pte)
2521 {
2522         swp_entry_t swp;
2523
2524         if (huge_pte_none(pte) || pte_present(pte))
2525                 return 0;
2526         swp = pte_to_swp_entry(pte);
2527         if (non_swap_entry(swp) && is_migration_entry(swp))
2528                 return 1;
2529         else
2530                 return 0;
2531 }
2532
2533 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2534 {
2535         swp_entry_t swp;
2536
2537         if (huge_pte_none(pte) || pte_present(pte))
2538                 return 0;
2539         swp = pte_to_swp_entry(pte);
2540         if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2541                 return 1;
2542         else
2543                 return 0;
2544 }
2545
2546 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2547                             struct vm_area_struct *vma)
2548 {
2549         pte_t *src_pte, *dst_pte, entry;
2550         struct page *ptepage;
2551         unsigned long addr;
2552         int cow;
2553         struct hstate *h = hstate_vma(vma);
2554         unsigned long sz = huge_page_size(h);
2555         unsigned long mmun_start;       /* For mmu_notifiers */
2556         unsigned long mmun_end;         /* For mmu_notifiers */
2557         int ret = 0;
2558
2559         cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2560
2561         mmun_start = vma->vm_start;
2562         mmun_end = vma->vm_end;
2563         if (cow)
2564                 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
2565
2566         for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2567                 spinlock_t *src_ptl, *dst_ptl;
2568                 src_pte = huge_pte_offset(src, addr);
2569                 if (!src_pte)
2570                         continue;
2571                 dst_pte = huge_pte_alloc(dst, addr, sz);
2572                 if (!dst_pte) {
2573                         ret = -ENOMEM;
2574                         break;
2575                 }
2576
2577                 /* If the pagetables are shared don't copy or take references */
2578                 if (dst_pte == src_pte)
2579                         continue;
2580
2581                 dst_ptl = huge_pte_lock(h, dst, dst_pte);
2582                 src_ptl = huge_pte_lockptr(h, src, src_pte);
2583                 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
2584                 entry = huge_ptep_get(src_pte);
2585                 if (huge_pte_none(entry)) { /* skip none entry */
2586                         ;
2587                 } else if (unlikely(is_hugetlb_entry_migration(entry) ||
2588                                     is_hugetlb_entry_hwpoisoned(entry))) {
2589                         swp_entry_t swp_entry = pte_to_swp_entry(entry);
2590
2591                         if (is_write_migration_entry(swp_entry) && cow) {
2592                                 /*
2593                                  * COW mappings require pages in both
2594                                  * parent and child to be set to read.
2595                                  */
2596                                 make_migration_entry_read(&swp_entry);
2597                                 entry = swp_entry_to_pte(swp_entry);
2598                                 set_huge_pte_at(src, addr, src_pte, entry);
2599                         }
2600                         set_huge_pte_at(dst, addr, dst_pte, entry);
2601                 } else {
2602                         if (cow) {
2603                                 huge_ptep_set_wrprotect(src, addr, src_pte);
2604                                 mmu_notifier_invalidate_range(src, mmun_start,
2605                                                                    mmun_end);
2606                         }
2607                         entry = huge_ptep_get(src_pte);
2608                         ptepage = pte_page(entry);
2609                         get_page(ptepage);
2610                         page_dup_rmap(ptepage);
2611                         set_huge_pte_at(dst, addr, dst_pte, entry);
2612                 }
2613                 spin_unlock(src_ptl);
2614                 spin_unlock(dst_ptl);
2615         }
2616
2617         if (cow)
2618                 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
2619
2620         return ret;
2621 }
2622
2623 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2624                             unsigned long start, unsigned long end,
2625                             struct page *ref_page)
2626 {
2627         int force_flush = 0;
2628         struct mm_struct *mm = vma->vm_mm;
2629         unsigned long address;
2630         pte_t *ptep;
2631         pte_t pte;
2632         spinlock_t *ptl;
2633         struct page *page;
2634         struct hstate *h = hstate_vma(vma);
2635         unsigned long sz = huge_page_size(h);
2636         const unsigned long mmun_start = start; /* For mmu_notifiers */
2637         const unsigned long mmun_end   = end;   /* For mmu_notifiers */
2638
2639         WARN_ON(!is_vm_hugetlb_page(vma));
2640         BUG_ON(start & ~huge_page_mask(h));
2641         BUG_ON(end & ~huge_page_mask(h));
2642
2643         tlb_start_vma(tlb, vma);
2644         mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2645         address = start;
2646 again:
2647         for (; address < end; address += sz) {
2648                 ptep = huge_pte_offset(mm, address);
2649                 if (!ptep)
2650                         continue;
2651
2652                 ptl = huge_pte_lock(h, mm, ptep);
2653                 if (huge_pmd_unshare(mm, &address, ptep))
2654                         goto unlock;
2655
2656                 pte = huge_ptep_get(ptep);
2657                 if (huge_pte_none(pte))
2658                         goto unlock;
2659
2660                 /*
2661                  * Migrating hugepage or HWPoisoned hugepage is already
2662                  * unmapped and its refcount is dropped, so just clear pte here.
2663                  */
2664                 if (unlikely(!pte_present(pte))) {
2665                         huge_pte_clear(mm, address, ptep);
2666                         goto unlock;
2667                 }
2668
2669                 page = pte_page(pte);
2670                 /*
2671                  * If a reference page is supplied, it is because a specific
2672                  * page is being unmapped, not a range. Ensure the page we
2673                  * are about to unmap is the actual page of interest.
2674                  */
2675                 if (ref_page) {
2676                         if (page != ref_page)
2677                                 goto unlock;
2678
2679                         /*
2680                          * Mark the VMA as having unmapped its page so that
2681                          * future faults in this VMA will fail rather than
2682                          * looking like data was lost
2683                          */
2684                         set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
2685                 }
2686
2687                 pte = huge_ptep_get_and_clear(mm, address, ptep);
2688                 tlb_remove_tlb_entry(tlb, ptep, address);
2689                 if (huge_pte_dirty(pte))
2690                         set_page_dirty(page);
2691
2692                 page_remove_rmap(page);
2693                 force_flush = !__tlb_remove_page(tlb, page);
2694                 if (force_flush) {
2695                         address += sz;
2696                         spin_unlock(ptl);
2697                         break;
2698                 }
2699                 /* Bail out after unmapping reference page if supplied */
2700                 if (ref_page) {
2701                         spin_unlock(ptl);
2702                         break;
2703                 }
2704 unlock:
2705                 spin_unlock(ptl);
2706         }
2707         /*
2708          * mmu_gather ran out of room to batch pages, we break out of
2709          * the PTE lock to avoid doing the potential expensive TLB invalidate
2710          * and page-free while holding it.
2711          */
2712         if (force_flush) {
2713                 force_flush = 0;
2714                 tlb_flush_mmu(tlb);
2715                 if (address < end && !ref_page)
2716                         goto again;
2717         }
2718         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2719         tlb_end_vma(tlb, vma);
2720 }
2721
2722 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2723                           struct vm_area_struct *vma, unsigned long start,
2724                           unsigned long end, struct page *ref_page)
2725 {
2726         __unmap_hugepage_range(tlb, vma, start, end, ref_page);
2727
2728         /*
2729          * Clear this flag so that x86's huge_pmd_share page_table_shareable
2730          * test will fail on a vma being torn down, and not grab a page table
2731          * on its way out.  We're lucky that the flag has such an appropriate
2732          * name, and can in fact be safely cleared here. We could clear it
2733          * before the __unmap_hugepage_range above, but all that's necessary
2734          * is to clear it before releasing the i_mmap_rwsem. This works
2735          * because in the context this is called, the VMA is about to be
2736          * destroyed and the i_mmap_rwsem is held.
2737          */
2738         vma->vm_flags &= ~VM_MAYSHARE;
2739 }
2740
2741 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2742                           unsigned long end, struct page *ref_page)
2743 {
2744         struct mm_struct *mm;
2745         struct mmu_gather tlb;
2746
2747         mm = vma->vm_mm;
2748
2749         tlb_gather_mmu(&tlb, mm, start, end);
2750         __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
2751         tlb_finish_mmu(&tlb, start, end);
2752 }
2753
2754 /*
2755  * This is called when the original mapper is failing to COW a MAP_PRIVATE
2756  * mappping it owns the reserve page for. The intention is to unmap the page
2757  * from other VMAs and let the children be SIGKILLed if they are faulting the
2758  * same region.
2759  */
2760 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2761                               struct page *page, unsigned long address)
2762 {
2763         struct hstate *h = hstate_vma(vma);
2764         struct vm_area_struct *iter_vma;
2765         struct address_space *mapping;
2766         pgoff_t pgoff;
2767
2768         /*
2769          * vm_pgoff is in PAGE_SIZE units, hence the different calculation
2770          * from page cache lookup which is in HPAGE_SIZE units.
2771          */
2772         address = address & huge_page_mask(h);
2773         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
2774                         vma->vm_pgoff;
2775         mapping = file_inode(vma->vm_file)->i_mapping;
2776
2777         /*
2778          * Take the mapping lock for the duration of the table walk. As
2779          * this mapping should be shared between all the VMAs,
2780          * __unmap_hugepage_range() is called as the lock is already held
2781          */
2782         i_mmap_lock_write(mapping);
2783         vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
2784                 /* Do not unmap the current VMA */
2785                 if (iter_vma == vma)
2786                         continue;
2787
2788                 /*
2789                  * Unmap the page from other VMAs without their own reserves.
2790                  * They get marked to be SIGKILLed if they fault in these
2791                  * areas. This is because a future no-page fault on this VMA
2792                  * could insert a zeroed page instead of the data existing
2793                  * from the time of fork. This would look like data corruption
2794                  */
2795                 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2796                         unmap_hugepage_range(iter_vma, address,
2797                                              address + huge_page_size(h), page);
2798         }
2799         i_mmap_unlock_write(mapping);
2800 }
2801
2802 /*
2803  * Hugetlb_cow() should be called with page lock of the original hugepage held.
2804  * Called with hugetlb_instantiation_mutex held and pte_page locked so we
2805  * cannot race with other handlers or page migration.
2806  * Keep the pte_same checks anyway to make transition from the mutex easier.
2807  */
2808 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2809                         unsigned long address, pte_t *ptep, pte_t pte,
2810                         struct page *pagecache_page, spinlock_t *ptl)
2811 {
2812         struct hstate *h = hstate_vma(vma);
2813         struct page *old_page, *new_page;
2814         int ret = 0, outside_reserve = 0;
2815         unsigned long mmun_start;       /* For mmu_notifiers */
2816         unsigned long mmun_end;         /* For mmu_notifiers */
2817
2818         old_page = pte_page(pte);
2819
2820 retry_avoidcopy:
2821         /* If no-one else is actually using this page, avoid the copy
2822          * and just make the page writable */
2823         if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
2824                 page_move_anon_rmap(old_page, vma, address);
2825                 set_huge_ptep_writable(vma, address, ptep);
2826                 return 0;
2827         }
2828
2829         /*
2830          * If the process that created a MAP_PRIVATE mapping is about to
2831          * perform a COW due to a shared page count, attempt to satisfy
2832          * the allocation without using the existing reserves. The pagecache
2833          * page is used to determine if the reserve at this address was
2834          * consumed or not. If reserves were used, a partial faulted mapping
2835          * at the time of fork() could consume its reserves on COW instead
2836          * of the full address range.
2837          */
2838         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2839                         old_page != pagecache_page)
2840                 outside_reserve = 1;
2841
2842         page_cache_get(old_page);
2843
2844         /*
2845          * Drop page table lock as buddy allocator may be called. It will
2846          * be acquired again before returning to the caller, as expected.
2847          */
2848         spin_unlock(ptl);
2849         new_page = alloc_huge_page(vma, address, outside_reserve);
2850
2851         if (IS_ERR(new_page)) {
2852                 /*
2853                  * If a process owning a MAP_PRIVATE mapping fails to COW,
2854                  * it is due to references held by a child and an insufficient
2855                  * huge page pool. To guarantee the original mappers
2856                  * reliability, unmap the page from child processes. The child
2857                  * may get SIGKILLed if it later faults.
2858                  */
2859                 if (outside_reserve) {
2860                         page_cache_release(old_page);
2861                         BUG_ON(huge_pte_none(pte));
2862                         unmap_ref_private(mm, vma, old_page, address);
2863                         BUG_ON(huge_pte_none(pte));
2864                         spin_lock(ptl);
2865                         ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2866                         if (likely(ptep &&
2867                                    pte_same(huge_ptep_get(ptep), pte)))
2868                                 goto retry_avoidcopy;
2869                         /*
2870                          * race occurs while re-acquiring page table
2871                          * lock, and our job is done.
2872                          */
2873                         return 0;
2874                 }
2875
2876                 ret = (PTR_ERR(new_page) == -ENOMEM) ?
2877                         VM_FAULT_OOM : VM_FAULT_SIGBUS;
2878                 goto out_release_old;
2879         }
2880
2881         /*
2882          * When the original hugepage is shared one, it does not have
2883          * anon_vma prepared.
2884          */
2885         if (unlikely(anon_vma_prepare(vma))) {
2886                 ret = VM_FAULT_OOM;
2887                 goto out_release_all;
2888         }
2889
2890         copy_user_huge_page(new_page, old_page, address, vma,
2891                             pages_per_huge_page(h));
2892         __SetPageUptodate(new_page);
2893
2894         mmun_start = address & huge_page_mask(h);
2895         mmun_end = mmun_start + huge_page_size(h);
2896         mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2897
2898         /*
2899          * Retake the page table lock to check for racing updates
2900          * before the page tables are altered
2901          */
2902         spin_lock(ptl);
2903         ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2904         if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
2905                 ClearPagePrivate(new_page);
2906
2907                 /* Break COW */
2908                 huge_ptep_clear_flush(vma, address, ptep);
2909                 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
2910                 set_huge_pte_at(mm, address, ptep,
2911                                 make_huge_pte(vma, new_page, 1));
2912                 page_remove_rmap(old_page);
2913                 hugepage_add_new_anon_rmap(new_page, vma, address);
2914                 /* Make the old page be freed below */
2915                 new_page = old_page;
2916         }
2917         spin_unlock(ptl);
2918         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2919 out_release_all:
2920         page_cache_release(new_page);
2921 out_release_old:
2922         page_cache_release(old_page);
2923
2924         spin_lock(ptl); /* Caller expects lock to be held */
2925         return ret;
2926 }
2927
2928 /* Return the pagecache page at a given address within a VMA */
2929 static struct page *hugetlbfs_pagecache_page(struct hstate *h,
2930                         struct vm_area_struct *vma, unsigned long address)
2931 {
2932         struct address_space *mapping;
2933         pgoff_t idx;
2934
2935         mapping = vma->vm_file->f_mapping;
2936         idx = vma_hugecache_offset(h, vma, address);
2937
2938         return find_lock_page(mapping, idx);
2939 }
2940
2941 /*
2942  * Return whether there is a pagecache page to back given address within VMA.
2943  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
2944  */
2945 static bool hugetlbfs_pagecache_present(struct hstate *h,
2946                         struct vm_area_struct *vma, unsigned long address)
2947 {
2948         struct address_space *mapping;
2949         pgoff_t idx;
2950         struct page *page;
2951
2952         mapping = vma->vm_file->f_mapping;
2953         idx = vma_hugecache_offset(h, vma, address);
2954
2955         page = find_get_page(mapping, idx);
2956         if (page)
2957                 put_page(page);
2958         return page != NULL;
2959 }
2960
2961 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2962                            struct address_space *mapping, pgoff_t idx,
2963                            unsigned long address, pte_t *ptep, unsigned int flags)
2964 {
2965         struct hstate *h = hstate_vma(vma);
2966         int ret = VM_FAULT_SIGBUS;
2967         int anon_rmap = 0;
2968         unsigned long size;
2969         struct page *page;
2970         pte_t new_pte;
2971         spinlock_t *ptl;
2972
2973         /*
2974          * Currently, we are forced to kill the process in the event the
2975          * original mapper has unmapped pages from the child due to a failed
2976          * COW. Warn that such a situation has occurred as it may not be obvious
2977          */
2978         if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2979                 pr_warning("PID %d killed due to inadequate hugepage pool\n",
2980                            current->pid);
2981                 return ret;
2982         }
2983
2984         /*
2985          * Use page lock to guard against racing truncation
2986          * before we get page_table_lock.
2987          */
2988 retry:
2989         page = find_lock_page(mapping, idx);
2990         if (!page) {
2991                 size = i_size_read(mapping->host) >> huge_page_shift(h);
2992                 if (idx >= size)
2993                         goto out;
2994                 page = alloc_huge_page(vma, address, 0);
2995                 if (IS_ERR(page)) {
2996                         ret = PTR_ERR(page);
2997                         if (ret == -ENOMEM)
2998                                 ret = VM_FAULT_OOM;
2999                         else
3000                                 ret = VM_FAULT_SIGBUS;
3001                         goto out;
3002                 }
3003                 clear_huge_page(page, address, pages_per_huge_page(h));
3004                 __SetPageUptodate(page);
3005
3006                 if (vma->vm_flags & VM_MAYSHARE) {
3007                         int err;
3008                         struct inode *inode = mapping->host;
3009
3010                         err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
3011                         if (err) {
3012                                 put_page(page);
3013                                 if (err == -EEXIST)
3014                                         goto retry;
3015                                 goto out;
3016                         }
3017                         ClearPagePrivate(page);
3018
3019                         spin_lock(&inode->i_lock);
3020                         inode->i_blocks += blocks_per_huge_page(h);
3021                         spin_unlock(&inode->i_lock);
3022                 } else {
3023                         lock_page(page);
3024                         if (unlikely(anon_vma_prepare(vma))) {
3025                                 ret = VM_FAULT_OOM;
3026                                 goto backout_unlocked;
3027                         }
3028                         anon_rmap = 1;
3029                 }
3030         } else {
3031                 /*
3032                  * If memory error occurs between mmap() and fault, some process
3033                  * don't have hwpoisoned swap entry for errored virtual address.
3034                  * So we need to block hugepage fault by PG_hwpoison bit check.
3035                  */
3036                 if (unlikely(PageHWPoison(page))) {
3037                         ret = VM_FAULT_HWPOISON |
3038                                 VM_FAULT_SET_HINDEX(hstate_index(h));
3039                         goto backout_unlocked;
3040                 }
3041         }
3042
3043         /*
3044          * If we are going to COW a private mapping later, we examine the
3045          * pending reservations for this page now. This will ensure that
3046          * any allocations necessary to record that reservation occur outside
3047          * the spinlock.
3048          */
3049         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
3050                 if (vma_needs_reservation(h, vma, address) < 0) {
3051                         ret = VM_FAULT_OOM;
3052                         goto backout_unlocked;
3053                 }
3054
3055         ptl = huge_pte_lockptr(h, mm, ptep);
3056         spin_lock(ptl);
3057         size = i_size_read(mapping->host) >> huge_page_shift(h);
3058         if (idx >= size)
3059                 goto backout;
3060
3061         ret = 0;
3062         if (!huge_pte_none(huge_ptep_get(ptep)))
3063                 goto backout;
3064
3065         if (anon_rmap) {
3066                 ClearPagePrivate(page);
3067                 hugepage_add_new_anon_rmap(page, vma, address);
3068         } else
3069                 page_dup_rmap(page);
3070         new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
3071                                 && (vma->vm_flags & VM_SHARED)));
3072         set_huge_pte_at(mm, address, ptep, new_pte);
3073
3074         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3075                 /* Optimization, do the COW without a second fault */
3076                 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
3077         }
3078
3079         spin_unlock(ptl);
3080         unlock_page(page);
3081 out:
3082         return ret;
3083
3084 backout:
3085         spin_unlock(ptl);
3086 backout_unlocked:
3087         unlock_page(page);
3088         put_page(page);
3089         goto out;
3090 }
3091
3092 #ifdef CONFIG_SMP
3093 static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
3094                             struct vm_area_struct *vma,
3095                             struct address_space *mapping,
3096                             pgoff_t idx, unsigned long address)
3097 {
3098         unsigned long key[2];
3099         u32 hash;
3100
3101         if (vma->vm_flags & VM_SHARED) {
3102                 key[0] = (unsigned long) mapping;
3103                 key[1] = idx;
3104         } else {
3105                 key[0] = (unsigned long) mm;
3106                 key[1] = address >> huge_page_shift(h);
3107         }
3108
3109         hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
3110
3111         return hash & (num_fault_mutexes - 1);
3112 }
3113 #else
3114 /*
3115  * For uniprocesor systems we always use a single mutex, so just
3116  * return 0 and avoid the hashing overhead.
3117  */
3118 static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
3119                             struct vm_area_struct *vma,
3120                             struct address_space *mapping,
3121                             pgoff_t idx, unsigned long address)
3122 {
3123         return 0;
3124 }
3125 #endif
3126
3127 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3128                         unsigned long address, unsigned int flags)
3129 {
3130         pte_t *ptep, entry;
3131         spinlock_t *ptl;
3132         int ret;
3133         u32 hash;
3134         pgoff_t idx;
3135         struct page *page = NULL;
3136         struct page *pagecache_page = NULL;
3137         struct hstate *h = hstate_vma(vma);
3138         struct address_space *mapping;
3139         int need_wait_lock = 0;
3140
3141         address &= huge_page_mask(h);
3142
3143         ptep = huge_pte_offset(mm, address);
3144         if (ptep) {
3145                 entry = huge_ptep_get(ptep);
3146                 if (unlikely(is_hugetlb_entry_migration(entry))) {
3147                         migration_entry_wait_huge(vma, mm, ptep);
3148                         return 0;
3149                 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
3150                         return VM_FAULT_HWPOISON_LARGE |
3151                                 VM_FAULT_SET_HINDEX(hstate_index(h));
3152         }
3153
3154         ptep = huge_pte_alloc(mm, address, huge_page_size(h));
3155         if (!ptep)
3156                 return VM_FAULT_OOM;
3157
3158         mapping = vma->vm_file->f_mapping;
3159         idx = vma_hugecache_offset(h, vma, address);
3160
3161         /*
3162          * Serialize hugepage allocation and instantiation, so that we don't
3163          * get spurious allocation failures if two CPUs race to instantiate
3164          * the same page in the page cache.
3165          */
3166         hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
3167         mutex_lock(&htlb_fault_mutex_table[hash]);
3168
3169         entry = huge_ptep_get(ptep);
3170         if (huge_pte_none(entry)) {
3171                 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
3172                 goto out_mutex;
3173         }
3174
3175         ret = 0;
3176
3177         /*
3178          * entry could be a migration/hwpoison entry at this point, so this
3179          * check prevents the kernel from going below assuming that we have
3180          * a active hugepage in pagecache. This goto expects the 2nd page fault,
3181          * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
3182          * handle it.
3183          */
3184         if (!pte_present(entry))
3185                 goto out_mutex;
3186
3187         /*
3188          * If we are going to COW the mapping later, we examine the pending
3189          * reservations for this page now. This will ensure that any
3190          * allocations necessary to record that reservation occur outside the
3191          * spinlock. For private mappings, we also lookup the pagecache
3192          * page now as it is used to determine if a reservation has been
3193          * consumed.
3194          */
3195         if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
3196                 if (vma_needs_reservation(h, vma, address) < 0) {
3197                         ret = VM_FAULT_OOM;
3198                         goto out_mutex;
3199                 }
3200
3201                 if (!(vma->vm_flags & VM_MAYSHARE))
3202                         pagecache_page = hugetlbfs_pagecache_page(h,
3203                                                                 vma, address);
3204         }
3205
3206         ptl = huge_pte_lock(h, mm, ptep);
3207
3208         /* Check for a racing update before calling hugetlb_cow */
3209         if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3210                 goto out_ptl;
3211
3212         /*
3213          * hugetlb_cow() requires page locks of pte_page(entry) and
3214          * pagecache_page, so here we need take the former one
3215          * when page != pagecache_page or !pagecache_page.
3216          */
3217         page = pte_page(entry);
3218         if (page != pagecache_page)
3219                 if (!trylock_page(page)) {
3220                         need_wait_lock = 1;
3221                         goto out_ptl;
3222                 }
3223
3224         get_page(page);
3225
3226         if (flags & FAULT_FLAG_WRITE) {
3227                 if (!huge_pte_write(entry)) {
3228                         ret = hugetlb_cow(mm, vma, address, ptep, entry,
3229                                         pagecache_page, ptl);
3230                         goto out_put_page;
3231                 }
3232                 entry = huge_pte_mkdirty(entry);
3233         }
3234         entry = pte_mkyoung(entry);
3235         if (huge_ptep_set_access_flags(vma, address, ptep, entry,
3236                                                 flags & FAULT_FLAG_WRITE))
3237                 update_mmu_cache(vma, address, ptep);
3238 out_put_page:
3239         if (page != pagecache_page)
3240                 unlock_page(page);
3241         put_page(page);
3242 out_ptl:
3243         spin_unlock(ptl);
3244
3245         if (pagecache_page) {
3246                 unlock_page(pagecache_page);
3247                 put_page(pagecache_page);
3248         }
3249 out_mutex:
3250         mutex_unlock(&htlb_fault_mutex_table[hash]);
3251         /*
3252          * Generally it's safe to hold refcount during waiting page lock. But
3253          * here we just wait to defer the next page fault to avoid busy loop and
3254          * the page is not used after unlocked before returning from the current
3255          * page fault. So we are safe from accessing freed page, even if we wait
3256          * here without taking refcount.
3257          */
3258         if (need_wait_lock)
3259                 wait_on_page_locked(page);
3260         return ret;
3261 }
3262
3263 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3264                          struct page **pages, struct vm_area_struct **vmas,
3265                          unsigned long *position, unsigned long *nr_pages,
3266                          long i, unsigned int flags)
3267 {
3268         unsigned long pfn_offset;
3269         unsigned long vaddr = *position;
3270         unsigned long remainder = *nr_pages;
3271         struct hstate *h = hstate_vma(vma);
3272
3273         while (vaddr < vma->vm_end && remainder) {
3274                 pte_t *pte;
3275                 spinlock_t *ptl = NULL;
3276                 int absent;
3277                 struct page *page;
3278
3279                 /*
3280                  * If we have a pending SIGKILL, don't keep faulting pages and
3281                  * potentially allocating memory.
3282                  */
3283                 if (unlikely(fatal_signal_pending(current))) {
3284                         remainder = 0;
3285                         break;
3286                 }
3287
3288                 /*
3289                  * Some archs (sparc64, sh*) have multiple pte_ts to
3290                  * each hugepage.  We have to make sure we get the
3291                  * first, for the page indexing below to work.
3292                  *
3293                  * Note that page table lock is not held when pte is null.
3294                  */
3295                 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
3296                 if (pte)
3297                         ptl = huge_pte_lock(h, mm, pte);
3298                 absent = !pte || huge_pte_none(huge_ptep_get(pte));
3299
3300                 /*
3301                  * When coredumping, it suits get_dump_page if we just return
3302                  * an error where there's an empty slot with no huge pagecache
3303                  * to back it.  This way, we avoid allocating a hugepage, and
3304                  * the sparse dumpfile avoids allocating disk blocks, but its
3305                  * huge holes still show up with zeroes where they need to be.
3306                  */
3307                 if (absent && (flags & FOLL_DUMP) &&
3308                     !hugetlbfs_pagecache_present(h, vma, vaddr)) {
3309                         if (pte)
3310                                 spin_unlock(ptl);
3311                         remainder = 0;
3312                         break;
3313                 }
3314
3315                 /*
3316                  * We need call hugetlb_fault for both hugepages under migration
3317                  * (in which case hugetlb_fault waits for the migration,) and
3318                  * hwpoisoned hugepages (in which case we need to prevent the
3319                  * caller from accessing to them.) In order to do this, we use
3320                  * here is_swap_pte instead of is_hugetlb_entry_migration and
3321                  * is_hugetlb_entry_hwpoisoned. This is because it simply covers
3322                  * both cases, and because we can't follow correct pages
3323                  * directly from any kind of swap entries.
3324                  */
3325                 if (absent || is_swap_pte(huge_ptep_get(pte)) ||
3326                     ((flags & FOLL_WRITE) &&
3327                       !huge_pte_write(huge_ptep_get(pte)))) {
3328                         int ret;
3329
3330                         if (pte)
3331                                 spin_unlock(ptl);
3332                         ret = hugetlb_fault(mm, vma, vaddr,
3333                                 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
3334                         if (!(ret & VM_FAULT_ERROR))
3335                                 continue;
3336
3337                         remainder = 0;
3338                         break;
3339                 }
3340
3341                 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
3342                 page = pte_page(huge_ptep_get(pte));
3343 same_page:
3344                 if (pages) {
3345                         pages[i] = mem_map_offset(page, pfn_offset);
3346                         get_page_foll(pages[i]);
3347                 }
3348
3349                 if (vmas)
3350                         vmas[i] = vma;
3351
3352                 vaddr += PAGE_SIZE;
3353                 ++pfn_offset;
3354                 --remainder;
3355                 ++i;
3356                 if (vaddr < vma->vm_end && remainder &&
3357                                 pfn_offset < pages_per_huge_page(h)) {
3358                         /*
3359                          * We use pfn_offset to avoid touching the pageframes
3360                          * of this compound page.
3361                          */
3362                         goto same_page;
3363                 }
3364                 spin_unlock(ptl);
3365         }
3366         *nr_pages = remainder;
3367         *position = vaddr;
3368
3369         return i ? i : -EFAULT;
3370 }
3371
3372 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3373                 unsigned long address, unsigned long end, pgprot_t newprot)
3374 {
3375         struct mm_struct *mm = vma->vm_mm;
3376         unsigned long start = address;
3377         pte_t *ptep;
3378         pte_t pte;
3379         struct hstate *h = hstate_vma(vma);
3380         unsigned long pages = 0;
3381
3382         BUG_ON(address >= end);
3383         flush_cache_range(vma, address, end);
3384
3385         mmu_notifier_invalidate_range_start(mm, start, end);
3386         i_mmap_lock_write(vma->vm_file->f_mapping);
3387         for (; address < end; address += huge_page_size(h)) {
3388                 spinlock_t *ptl;
3389                 ptep = huge_pte_offset(mm, address);
3390                 if (!ptep)
3391                         continue;
3392                 ptl = huge_pte_lock(h, mm, ptep);
3393                 if (huge_pmd_unshare(mm, &address, ptep)) {
3394                         pages++;
3395                         spin_unlock(ptl);
3396                         continue;
3397                 }
3398                 pte = huge_ptep_get(ptep);
3399                 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
3400                         spin_unlock(ptl);
3401                         continue;
3402                 }
3403                 if (unlikely(is_hugetlb_entry_migration(pte))) {
3404                         swp_entry_t entry = pte_to_swp_entry(pte);
3405
3406                         if (is_write_migration_entry(entry)) {
3407                                 pte_t newpte;
3408
3409                                 make_migration_entry_read(&entry);
3410                                 newpte = swp_entry_to_pte(entry);
3411                                 set_huge_pte_at(mm, address, ptep, newpte);
3412                                 pages++;
3413                         }
3414                         spin_unlock(ptl);
3415                         continue;
3416                 }
3417                 if (!huge_pte_none(pte)) {
3418                         pte = huge_ptep_get_and_clear(mm, address, ptep);
3419                         pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3420                         pte = arch_make_huge_pte(pte, vma, NULL, 0);
3421                         set_huge_pte_at(mm, address, ptep, pte);
3422                         pages++;
3423                 }
3424                 spin_unlock(ptl);
3425         }
3426         /*
3427          * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
3428          * may have cleared our pud entry and done put_page on the page table:
3429          * once we release i_mmap_rwsem, another task can do the final put_page
3430          * and that page table be reused and filled with junk.
3431          */
3432         flush_tlb_range(vma, start, end);
3433         mmu_notifier_invalidate_range(mm, start, end);
3434         i_mmap_unlock_write(vma->vm_file->f_mapping);
3435         mmu_notifier_invalidate_range_end(mm, start, end);
3436
3437         return pages << h->order;
3438 }
3439
3440 int hugetlb_reserve_pages(struct inode *inode,
3441                                         long from, long to,
3442                                         struct vm_area_struct *vma,
3443                                         vm_flags_t vm_flags)
3444 {
3445         long ret, chg;
3446         struct hstate *h = hstate_inode(inode);
3447         struct hugepage_subpool *spool = subpool_inode(inode);
3448         struct resv_map *resv_map;
3449
3450         /*
3451          * Only apply hugepage reservation if asked. At fault time, an
3452          * attempt will be made for VM_NORESERVE to allocate a page
3453          * without using reserves
3454          */
3455         if (vm_flags & VM_NORESERVE)
3456                 return 0;
3457
3458         /*
3459          * Shared mappings base their reservation on the number of pages that
3460          * are already allocated on behalf of the file. Private mappings need
3461          * to reserve the full area even if read-only as mprotect() may be
3462          * called to make the mapping read-write. Assume !vma is a shm mapping
3463          */
3464         if (!vma || vma->vm_flags & VM_MAYSHARE) {
3465                 resv_map = inode_resv_map(inode);
3466
3467                 chg = region_chg(resv_map, from, to);
3468
3469         } else {
3470                 resv_map = resv_map_alloc();
3471                 if (!resv_map)
3472                         return -ENOMEM;
3473
3474                 chg = to - from;
3475
3476                 set_vma_resv_map(vma, resv_map);
3477                 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
3478         }
3479
3480         if (chg < 0) {
3481                 ret = chg;
3482                 goto out_err;
3483         }
3484
3485         /* There must be enough pages in the subpool for the mapping */
3486         if (hugepage_subpool_get_pages(spool, chg)) {
3487                 ret = -ENOSPC;
3488                 goto out_err;
3489         }
3490
3491         /*
3492          * Check enough hugepages are available for the reservation.
3493          * Hand the pages back to the subpool if there are not
3494          */
3495         ret = hugetlb_acct_memory(h, chg);
3496         if (ret < 0) {
3497                 hugepage_subpool_put_pages(spool, chg);
3498                 goto out_err;
3499         }
3500
3501         /*
3502          * Account for the reservations made. Shared mappings record regions
3503          * that have reservations as they are shared by multiple VMAs.
3504          * When the last VMA disappears, the region map says how much
3505          * the reservation was and the page cache tells how much of
3506          * the reservation was consumed. Private mappings are per-VMA and
3507          * only the consumed reservations are tracked. When the VMA
3508          * disappears, the original reservation is the VMA size and the
3509          * consumed reservations are stored in the map. Hence, nothing
3510          * else has to be done for private mappings here
3511          */
3512         if (!vma || vma->vm_flags & VM_MAYSHARE)
3513                 region_add(resv_map, from, to);
3514         return 0;
3515 out_err:
3516         if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3517                 kref_put(&resv_map->refs, resv_map_release);
3518         return ret;
3519 }
3520
3521 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3522 {
3523         struct hstate *h = hstate_inode(inode);
3524         struct resv_map *resv_map = inode_resv_map(inode);
3525         long chg = 0;
3526         struct hugepage_subpool *spool = subpool_inode(inode);
3527
3528         if (resv_map)
3529                 chg = region_truncate(resv_map, offset);
3530         spin_lock(&inode->i_lock);
3531         inode->i_blocks -= (blocks_per_huge_page(h) * freed);
3532         spin_unlock(&inode->i_lock);
3533
3534         hugepage_subpool_put_pages(spool, (chg - freed));
3535         hugetlb_acct_memory(h, -(chg - freed));
3536 }
3537
3538 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
3539 static unsigned long page_table_shareable(struct vm_area_struct *svma,
3540                                 struct vm_area_struct *vma,
3541                                 unsigned long addr, pgoff_t idx)
3542 {
3543         unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
3544                                 svma->vm_start;
3545         unsigned long sbase = saddr & PUD_MASK;
3546         unsigned long s_end = sbase + PUD_SIZE;
3547
3548         /* Allow segments to share if only one is marked locked */
3549         unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
3550         unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
3551
3552         /*
3553          * match the virtual addresses, permission and the alignment of the
3554          * page table page.
3555          */
3556         if (pmd_index(addr) != pmd_index(saddr) ||
3557             vm_flags != svm_flags ||
3558             sbase < svma->vm_start || svma->vm_end < s_end)
3559                 return 0;
3560
3561         return saddr;
3562 }
3563
3564 static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
3565 {
3566         unsigned long base = addr & PUD_MASK;
3567         unsigned long end = base + PUD_SIZE;
3568
3569         /*
3570          * check on proper vm_flags and page table alignment
3571          */
3572         if (vma->vm_flags & VM_MAYSHARE &&
3573             vma->vm_start <= base && end <= vma->vm_end)
3574                 return 1;
3575         return 0;
3576 }
3577
3578 /*
3579  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
3580  * and returns the corresponding pte. While this is not necessary for the
3581  * !shared pmd case because we can allocate the pmd later as well, it makes the
3582  * code much cleaner. pmd allocation is essential for the shared case because
3583  * pud has to be populated inside the same i_mmap_rwsem section - otherwise
3584  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
3585  * bad pmd for sharing.
3586  */
3587 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3588 {
3589         struct vm_area_struct *vma = find_vma(mm, addr);
3590         struct address_space *mapping = vma->vm_file->f_mapping;
3591         pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
3592                         vma->vm_pgoff;
3593         struct vm_area_struct *svma;
3594         unsigned long saddr;
3595         pte_t *spte = NULL;
3596         pte_t *pte;
3597         spinlock_t *ptl;
3598
3599         if (!vma_shareable(vma, addr))
3600                 return (pte_t *)pmd_alloc(mm, pud, addr);
3601
3602         i_mmap_lock_write(mapping);
3603         vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
3604                 if (svma == vma)
3605                         continue;
3606
3607                 saddr = page_table_shareable(svma, vma, addr, idx);
3608                 if (saddr) {
3609                         spte = huge_pte_offset(svma->vm_mm, saddr);
3610                         if (spte) {
3611                                 mm_inc_nr_pmds(mm);
3612                                 get_page(virt_to_page(spte));
3613                                 break;
3614                         }
3615                 }
3616         }
3617
3618         if (!spte)
3619                 goto out;
3620
3621         ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
3622         spin_lock(ptl);
3623         if (pud_none(*pud)) {
3624                 pud_populate(mm, pud,
3625                                 (pmd_t *)((unsigned long)spte & PAGE_MASK));
3626         } else {
3627                 put_page(virt_to_page(spte));
3628                 mm_inc_nr_pmds(mm);
3629         }
3630         spin_unlock(ptl);
3631 out:
3632         pte = (pte_t *)pmd_alloc(mm, pud, addr);
3633         i_mmap_unlock_write(mapping);
3634         return pte;
3635 }
3636
3637 /*
3638  * unmap huge page backed by shared pte.
3639  *
3640  * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
3641  * indicated by page_count > 1, unmap is achieved by clearing pud and
3642  * decrementing the ref count. If count == 1, the pte page is not shared.
3643  *
3644  * called with page table lock held.
3645  *
3646  * returns: 1 successfully unmapped a shared pte page
3647  *          0 the underlying pte page is not shared, or it is the last user
3648  */
3649 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
3650 {
3651         pgd_t *pgd = pgd_offset(mm, *addr);
3652         pud_t *pud = pud_offset(pgd, *addr);
3653
3654         BUG_ON(page_count(virt_to_page(ptep)) == 0);
3655         if (page_count(virt_to_page(ptep)) == 1)
3656                 return 0;
3657
3658         pud_clear(pud);
3659         put_page(virt_to_page(ptep));
3660         mm_dec_nr_pmds(mm);
3661         *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
3662         return 1;
3663 }
3664 #define want_pmd_share()        (1)
3665 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
3666 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3667 {
3668         return NULL;
3669 }
3670 #define want_pmd_share()        (0)
3671 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
3672
3673 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
3674 pte_t *huge_pte_alloc(struct mm_struct *mm,
3675                         unsigned long addr, unsigned long sz)
3676 {
3677         pgd_t *pgd;
3678         pud_t *pud;
3679         pte_t *pte = NULL;
3680
3681         pgd = pgd_offset(mm, addr);
3682         pud = pud_alloc(mm, pgd, addr);
3683         if (pud) {
3684                 if (sz == PUD_SIZE) {
3685                         pte = (pte_t *)pud;
3686                 } else {
3687                         BUG_ON(sz != PMD_SIZE);
3688                         if (want_pmd_share() && pud_none(*pud))
3689                                 pte = huge_pmd_share(mm, addr, pud);
3690                         else
3691                                 pte = (pte_t *)pmd_alloc(mm, pud, addr);
3692                 }
3693         }
3694         BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
3695
3696         return pte;
3697 }
3698
3699 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
3700 {
3701         pgd_t *pgd;
3702         pud_t *pud;
3703         pmd_t *pmd = NULL;
3704
3705         pgd = pgd_offset(mm, addr);
3706         if (pgd_present(*pgd)) {
3707                 pud = pud_offset(pgd, addr);
3708                 if (pud_present(*pud)) {
3709                         if (pud_huge(*pud))
3710                                 return (pte_t *)pud;
3711                         pmd = pmd_offset(pud, addr);
3712                 }
3713         }
3714         return (pte_t *) pmd;
3715 }
3716
3717 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
3718
3719 /*
3720  * These functions are overwritable if your architecture needs its own
3721  * behavior.
3722  */
3723 struct page * __weak
3724 follow_huge_addr(struct mm_struct *mm, unsigned long address,
3725                               int write)
3726 {
3727         return ERR_PTR(-EINVAL);
3728 }
3729
3730 struct page * __weak
3731 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
3732                 pmd_t *pmd, int flags)
3733 {
3734         struct page *page = NULL;
3735         spinlock_t *ptl;
3736 retry:
3737         ptl = pmd_lockptr(mm, pmd);
3738         spin_lock(ptl);
3739         /*
3740          * make sure that the address range covered by this pmd is not
3741          * unmapped from other threads.
3742          */
3743         if (!pmd_huge(*pmd))
3744                 goto out;
3745         if (pmd_present(*pmd)) {
3746                 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
3747                 if (flags & FOLL_GET)
3748                         get_page(page);
3749         } else {
3750                 if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
3751                         spin_unlock(ptl);
3752                         __migration_entry_wait(mm, (pte_t *)pmd, ptl);
3753                         goto retry;
3754                 }
3755                 /*
3756                  * hwpoisoned entry is treated as no_page_table in
3757                  * follow_page_mask().
3758                  */
3759         }
3760 out:
3761         spin_unlock(ptl);
3762         return page;
3763 }
3764
3765 struct page * __weak
3766 follow_huge_pud(struct mm_struct *mm, unsigned long address,
3767                 pud_t *pud, int flags)
3768 {
3769         if (flags & FOLL_GET)
3770                 return NULL;
3771
3772         return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
3773 }
3774
3775 #ifdef CONFIG_MEMORY_FAILURE
3776
3777 /* Should be called in hugetlb_lock */
3778 static int is_hugepage_on_freelist(struct page *hpage)
3779 {
3780         struct page *page;
3781         struct page *tmp;
3782         struct hstate *h = page_hstate(hpage);
3783         int nid = page_to_nid(hpage);
3784
3785         list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
3786                 if (page == hpage)
3787                         return 1;
3788         return 0;
3789 }
3790
3791 /*
3792  * This function is called from memory failure code.
3793  * Assume the caller holds page lock of the head page.
3794  */
3795 int dequeue_hwpoisoned_huge_page(struct page *hpage)
3796 {
3797         struct hstate *h = page_hstate(hpage);
3798         int nid = page_to_nid(hpage);
3799         int ret = -EBUSY;
3800
3801         spin_lock(&hugetlb_lock);
3802         if (is_hugepage_on_freelist(hpage)) {
3803                 /*
3804                  * Hwpoisoned hugepage isn't linked to activelist or freelist,
3805                  * but dangling hpage->lru can trigger list-debug warnings
3806                  * (this happens when we call unpoison_memory() on it),
3807                  * so let it point to itself with list_del_init().
3808                  */
3809                 list_del_init(&hpage->lru);
3810                 set_page_refcounted(hpage);
3811                 h->free_huge_pages--;
3812                 h->free_huge_pages_node[nid]--;
3813                 ret = 0;
3814         }
3815         spin_unlock(&hugetlb_lock);
3816         return ret;
3817 }
3818 #endif
3819
3820 bool isolate_huge_page(struct page *page, struct list_head *list)
3821 {
3822         VM_BUG_ON_PAGE(!PageHead(page), page);
3823         if (!get_page_unless_zero(page))
3824                 return false;
3825         spin_lock(&hugetlb_lock);
3826         list_move_tail(&page->lru, list);
3827         spin_unlock(&hugetlb_lock);
3828         return true;
3829 }
3830
3831 void putback_active_hugepage(struct page *page)
3832 {
3833         VM_BUG_ON_PAGE(!PageHead(page), page);
3834         spin_lock(&hugetlb_lock);
3835         list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3836         spin_unlock(&hugetlb_lock);
3837         put_page(page);
3838 }
3839
3840 bool is_hugepage_active(struct page *page)
3841 {
3842         VM_BUG_ON_PAGE(!PageHuge(page), page);
3843         /*
3844          * This function can be called for a tail page because the caller,
3845          * scan_movable_pages, scans through a given pfn-range which typically
3846          * covers one memory block. In systems using gigantic hugepage (1GB
3847          * for x86_64,) a hugepage is larger than a memory block, and we don't
3848          * support migrating such large hugepages for now, so return false
3849          * when called for tail pages.
3850          */
3851         if (PageTail(page))
3852                 return false;
3853         /*
3854          * Refcount of a hwpoisoned hugepages is 1, but they are not active,
3855          * so we should return false for them.
3856          */
3857         if (unlikely(PageHWPoison(page)))
3858                 return false;
3859         return page_count(page) > 0;
3860 }