mm/huge_memory.c (thp: memcg huge memory)
1 /*
2  *  Copyright (C) 2009  Red Hat, Inc.
3  *
4  *  This work is licensed under the terms of the GNU GPL, version 2. See
5  *  the COPYING file in the top-level directory.
6  */
7
8 #include <linux/mm.h>
9 #include <linux/sched.h>
10 #include <linux/highmem.h>
11 #include <linux/hugetlb.h>
12 #include <linux/mmu_notifier.h>
13 #include <linux/rmap.h>
14 #include <linux/swap.h>
15 #include <asm/tlb.h>
16 #include <asm/pgalloc.h>
17 #include "internal.h"
18
19 unsigned long transparent_hugepage_flags __read_mostly =
20         (1<<TRANSPARENT_HUGEPAGE_FLAG);
21
22 #ifdef CONFIG_SYSFS
23 static ssize_t double_flag_show(struct kobject *kobj,
24                                 struct kobj_attribute *attr, char *buf,
25                                 enum transparent_hugepage_flag enabled,
26                                 enum transparent_hugepage_flag req_madv)
27 {
28         if (test_bit(enabled, &transparent_hugepage_flags)) {
29                 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
30                 return sprintf(buf, "[always] madvise never\n");
31         } else if (test_bit(req_madv, &transparent_hugepage_flags))
32                 return sprintf(buf, "always [madvise] never\n");
33         else
34                 return sprintf(buf, "always madvise [never]\n");
35 }
36 static ssize_t double_flag_store(struct kobject *kobj,
37                                  struct kobj_attribute *attr,
38                                  const char *buf, size_t count,
39                                  enum transparent_hugepage_flag enabled,
40                                  enum transparent_hugepage_flag req_madv)
41 {
42         if (!memcmp("always", buf,
43                     min(sizeof("always")-1, count))) {
44                 set_bit(enabled, &transparent_hugepage_flags);
45                 clear_bit(req_madv, &transparent_hugepage_flags);
46         } else if (!memcmp("madvise", buf,
47                            min(sizeof("madvise")-1, count))) {
48                 clear_bit(enabled, &transparent_hugepage_flags);
49                 set_bit(req_madv, &transparent_hugepage_flags);
50         } else if (!memcmp("never", buf,
51                            min(sizeof("never")-1, count))) {
52                 clear_bit(enabled, &transparent_hugepage_flags);
53                 clear_bit(req_madv, &transparent_hugepage_flags);
54         } else
55                 return -EINVAL;
56
57         return count;
58 }
59
60 static ssize_t enabled_show(struct kobject *kobj,
61                             struct kobj_attribute *attr, char *buf)
62 {
63         return double_flag_show(kobj, attr, buf,
64                                 TRANSPARENT_HUGEPAGE_FLAG,
65                                 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
66 }
67 static ssize_t enabled_store(struct kobject *kobj,
68                              struct kobj_attribute *attr,
69                              const char *buf, size_t count)
70 {
71         return double_flag_store(kobj, attr, buf, count,
72                                  TRANSPARENT_HUGEPAGE_FLAG,
73                                  TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
74 }
75 static struct kobj_attribute enabled_attr =
76         __ATTR(enabled, 0644, enabled_show, enabled_store);
77
78 static ssize_t single_flag_show(struct kobject *kobj,
79                                 struct kobj_attribute *attr, char *buf,
80                                 enum transparent_hugepage_flag flag)
81 {
82         if (test_bit(flag, &transparent_hugepage_flags))
83                 return sprintf(buf, "[yes] no\n");
84         else
85                 return sprintf(buf, "yes [no]\n");
86 }
87 static ssize_t single_flag_store(struct kobject *kobj,
88                                  struct kobj_attribute *attr,
89                                  const char *buf, size_t count,
90                                  enum transparent_hugepage_flag flag)
91 {
92         if (!memcmp("yes", buf,
93                     min(sizeof("yes")-1, count))) {
94                 set_bit(flag, &transparent_hugepage_flags);
95         } else if (!memcmp("no", buf,
96                            min(sizeof("no")-1, count))) {
97                 clear_bit(flag, &transparent_hugepage_flags);
98         } else
99                 return -EINVAL;
100
101         return count;
102 }
103
104 /*
105  * Currently defrag only allows the allocation to block (__GFP_WAIT):
106  * a blind __GFP_REPEAT would be too aggressive, as it is never worth
107  * swapping tons of memory just to allocate one more hugepage.
108  */
109 static ssize_t defrag_show(struct kobject *kobj,
110                            struct kobj_attribute *attr, char *buf)
111 {
112         return double_flag_show(kobj, attr, buf,
113                                 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
114                                 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
115 }
116 static ssize_t defrag_store(struct kobject *kobj,
117                             struct kobj_attribute *attr,
118                             const char *buf, size_t count)
119 {
120         return double_flag_store(kobj, attr, buf, count,
121                                  TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
122                                  TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
123 }
124 static struct kobj_attribute defrag_attr =
125         __ATTR(defrag, 0644, defrag_show, defrag_store);
126
127 #ifdef CONFIG_DEBUG_VM
128 static ssize_t debug_cow_show(struct kobject *kobj,
129                                 struct kobj_attribute *attr, char *buf)
130 {
131         return single_flag_show(kobj, attr, buf,
132                                 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
133 }
134 static ssize_t debug_cow_store(struct kobject *kobj,
135                                struct kobj_attribute *attr,
136                                const char *buf, size_t count)
137 {
138         return single_flag_store(kobj, attr, buf, count,
139                                  TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
140 }
141 static struct kobj_attribute debug_cow_attr =
142         __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
143 #endif /* CONFIG_DEBUG_VM */
144
145 static struct attribute *hugepage_attr[] = {
146         &enabled_attr.attr,
147         &defrag_attr.attr,
148 #ifdef CONFIG_DEBUG_VM
149         &debug_cow_attr.attr,
150 #endif
151         NULL,
152 };
153
154 static struct attribute_group hugepage_attr_group = {
155         .attrs = hugepage_attr,
156         .name = "transparent_hugepage",
157 };
158 #endif /* CONFIG_SYSFS */
159
160 static int __init hugepage_init(void)
161 {
162 #ifdef CONFIG_SYSFS
163         int err;
164
165         err = sysfs_create_group(mm_kobj, &hugepage_attr_group);
166         if (err)
167                 printk(KERN_ERR "hugepage: register sysfs failed\n");
168 #endif
169         return 0;
170 }
171 module_init(hugepage_init)
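
/*
 * With CONFIG_SYSFS the group above lands on mm_kobj (normally exposed as
 * /sys/kernel/mm), so the knobs are plain sysfs files; with the default
 * transparent_hugepage_flags they read:
 *
 *   /sys/kernel/mm/transparent_hugepage/enabled   -> "[always] madvise never"
 *   /sys/kernel/mm/transparent_hugepage/defrag    -> "always madvise [never]"
 *   /sys/kernel/mm/transparent_hugepage/debug_cow -> "yes [no]"  (CONFIG_DEBUG_VM)
 *
 * Writing one of the keywords back (e.g. "madvise") flips the matching
 * bits in transparent_hugepage_flags through the _store helpers above.
 */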
172
173 static int __init setup_transparent_hugepage(char *str)
174 {
175         int ret = 0;
176         if (!str)
177                 goto out;
178         if (!strcmp(str, "always")) {
179                 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
180                         &transparent_hugepage_flags);
181                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
182                           &transparent_hugepage_flags);
183                 ret = 1;
184         } else if (!strcmp(str, "madvise")) {
185                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
186                           &transparent_hugepage_flags);
187                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
188                         &transparent_hugepage_flags);
189                 ret = 1;
190         } else if (!strcmp(str, "never")) {
191                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
192                           &transparent_hugepage_flags);
193                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
194                           &transparent_hugepage_flags);
195                 ret = 1;
196         }
197 out:
198         if (!ret)
199                 printk(KERN_WARNING
200                        "transparent_hugepage= cannot parse, ignored\n");
201         return ret;
202 }
203 __setup("transparent_hugepage=", setup_transparent_hugepage);
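
/*
 * The same policy can also be picked at boot, e.g. by passing
 *
 *   transparent_hugepage=madvise
 *
 * on the kernel command line, which clears TRANSPARENT_HUGEPAGE_FLAG and
 * sets TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG just like writing "madvise" to
 * the sysfs "enabled" file.
 */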
204
205 static void prepare_pmd_huge_pte(pgtable_t pgtable,
206                                  struct mm_struct *mm)
207 {
208         assert_spin_locked(&mm->page_table_lock);
209
210         /* FIFO */
211         if (!mm->pmd_huge_pte)
212                 INIT_LIST_HEAD(&pgtable->lru);
213         else
214                 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
215         mm->pmd_huge_pte = pgtable;
216 }
217
218 static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
219 {
220         if (likely(vma->vm_flags & VM_WRITE))
221                 pmd = pmd_mkwrite(pmd);
222         return pmd;
223 }
224
225 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
226                                         struct vm_area_struct *vma,
227                                         unsigned long haddr, pmd_t *pmd,
228                                         struct page *page)
229 {
230         int ret = 0;
231         pgtable_t pgtable;
232
233         VM_BUG_ON(!PageCompound(page));
234         pgtable = pte_alloc_one(mm, haddr);
235         if (unlikely(!pgtable)) {
236                 mem_cgroup_uncharge_page(page);
237                 put_page(page);
238                 return VM_FAULT_OOM;
239         }
240
241         clear_huge_page(page, haddr, HPAGE_PMD_NR);
242         __SetPageUptodate(page);
243
244         spin_lock(&mm->page_table_lock);
245         if (unlikely(!pmd_none(*pmd))) {
246                 spin_unlock(&mm->page_table_lock);
247                 mem_cgroup_uncharge_page(page);
248                 put_page(page);
249                 pte_free(mm, pgtable);
250         } else {
251                 pmd_t entry;
252                 entry = mk_pmd(page, vma->vm_page_prot);
253                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
254                 entry = pmd_mkhuge(entry);
255                 /*
256                  * The spinlocking to take the lru_lock inside
257                  * page_add_new_anon_rmap() acts as a full memory
258                  * barrier to be sure the clear_huge_page writes become
259                  * visible before the set_pmd_at() write.
260                  */
261                 page_add_new_anon_rmap(page, vma, haddr);
262                 set_pmd_at(mm, haddr, pmd, entry);
263                 prepare_pmd_huge_pte(pgtable, mm);
264                 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
265                 spin_unlock(&mm->page_table_lock);
266         }
267
268         return ret;
269 }
270
271 static inline struct page *alloc_hugepage(int defrag)
272 {
273         return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
274                            HPAGE_PMD_ORDER);
275 }
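
/*
 * The gfp arithmetic above only has two outcomes:
 *
 *   alloc_hugepage(1) -> alloc_pages(GFP_TRANSHUGE, HPAGE_PMD_ORDER)
 *   alloc_hugepage(0) -> alloc_pages(GFP_TRANSHUGE & ~__GFP_WAIT,
 *                                    HPAGE_PMD_ORDER)
 *
 * so only vmas for which transparent_hugepage_defrag() returns true may
 * block (and so reclaim) for the HPAGE_PMD_ORDER allocation; everybody
 * else gets an atomic attempt that simply fails under memory pressure.
 */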
276
277 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
278                                unsigned long address, pmd_t *pmd,
279                                unsigned int flags)
280 {
281         struct page *page;
282         unsigned long haddr = address & HPAGE_PMD_MASK;
283         pte_t *pte;
284
285         if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
286                 if (unlikely(anon_vma_prepare(vma)))
287                         return VM_FAULT_OOM;
288                 page = alloc_hugepage(transparent_hugepage_defrag(vma));
289                 if (unlikely(!page))
290                         goto out;
291                 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
292                         put_page(page);
293                         goto out;
294                 }
295
296                 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
297         }
298 out:
299         /*
300          * Use __pte_alloc instead of pte_alloc_map, because we can't
301          * run pte_offset_map on the pmd while a huge pmd could still
302          * materialize from under us in a different thread.
303          */
304         if (unlikely(__pte_alloc(mm, vma, pmd, address)))
305                 return VM_FAULT_OOM;
306         /* if a huge pmd materialized from under us just retry later */
307         if (unlikely(pmd_trans_huge(*pmd)))
308                 return 0;
309         /*
310          * A regular pmd is established and it can't morph into a huge pmd
311          * from under us anymore at this point because we hold the mmap_sem
312          * in read mode and khugepaged takes it in write mode. So now it's
313          * safe to run pte_offset_map().
314          */
315         pte = pte_offset_map(pmd, address);
316         return handle_pte_fault(mm, vma, address, pte, pmd, flags);
317 }
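
/*
 * In short: the huge path above is only taken when the HPAGE_PMD_SIZE
 * aligned range fits entirely inside the vma and both the hugepage and
 * its memcg charge can be obtained; every other case falls back to
 * __pte_alloc() plus handle_pte_fault() on regular ptes.
 */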
318
319 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
320                   pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
321                   struct vm_area_struct *vma)
322 {
323         struct page *src_page;
324         pmd_t pmd;
325         pgtable_t pgtable;
326         int ret;
327
328         ret = -ENOMEM;
329         pgtable = pte_alloc_one(dst_mm, addr);
330         if (unlikely(!pgtable))
331                 goto out;
332
333         spin_lock(&dst_mm->page_table_lock);
334         spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
335
336         ret = -EAGAIN;
337         pmd = *src_pmd;
338         if (unlikely(!pmd_trans_huge(pmd))) {
339                 pte_free(dst_mm, pgtable);
340                 goto out_unlock;
341         }
342         if (unlikely(pmd_trans_splitting(pmd))) {
343                 /* a huge page split is running from under us */
344                 spin_unlock(&src_mm->page_table_lock);
345                 spin_unlock(&dst_mm->page_table_lock);
346                 pte_free(dst_mm, pgtable);
347
348                 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
349                 goto out;
350         }
351         src_page = pmd_page(pmd);
352         VM_BUG_ON(!PageHead(src_page));
353         get_page(src_page);
354         page_dup_rmap(src_page);
355         add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
356
357         pmdp_set_wrprotect(src_mm, addr, src_pmd);
358         pmd = pmd_mkold(pmd_wrprotect(pmd));
359         set_pmd_at(dst_mm, addr, dst_pmd, pmd);
360         prepare_pmd_huge_pte(pgtable, dst_mm);
361
362         ret = 0;
363 out_unlock:
364         spin_unlock(&src_mm->page_table_lock);
365         spin_unlock(&dst_mm->page_table_lock);
366 out:
367         return ret;
368 }
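
/*
 * Note on the fork path above: parent and child end up sharing the same
 * huge page with both pmds write protected (pmdp_set_wrprotect() on the
 * source, pmd_wrprotect() on the copy), so the first write from either
 * side takes the huge COW fault handled by do_huge_pmd_wp_page() below.
 */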
369
370 /* no "address" argument, so this destroys the page coloring of some archs */
371 pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
372 {
373         pgtable_t pgtable;
374
375         assert_spin_locked(&mm->page_table_lock);
376
377         /* FIFO */
378         pgtable = mm->pmd_huge_pte;
379         if (list_empty(&pgtable->lru))
380                 mm->pmd_huge_pte = NULL;
381         else {
382                 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
383                                               struct page, lru);
384                 list_del(&pgtable->lru);
385         }
386         return pgtable;
387 }
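
/*
 * prepare_pmd_huge_pte() and get_pmd_huge_pte() form a deposit/withdraw
 * pair: each huge pmd keeps one preallocated pte page table stashed on
 * mm->pmd_huge_pte (FIFO), so a later zap, COW fallback or split can
 * repopulate the range with regular ptes without having to allocate
 * memory while mm->page_table_lock is held.
 */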
388
389 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
390                                         struct vm_area_struct *vma,
391                                         unsigned long address,
392                                         pmd_t *pmd, pmd_t orig_pmd,
393                                         struct page *page,
394                                         unsigned long haddr)
395 {
396         pgtable_t pgtable;
397         pmd_t _pmd;
398         int ret = 0, i;
399         struct page **pages;
400
401         pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
402                         GFP_KERNEL);
403         if (unlikely(!pages)) {
404                 ret |= VM_FAULT_OOM;
405                 goto out;
406         }
407
408         for (i = 0; i < HPAGE_PMD_NR; i++) {
409                 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
410                                           vma, address);
411                 if (unlikely(!pages[i] ||
412                              mem_cgroup_newpage_charge(pages[i], mm,
413                                                        GFP_KERNEL))) {
414                         if (pages[i])
415                                 put_page(pages[i]);
416                         mem_cgroup_uncharge_start();
417                         while (--i >= 0) {
418                                 mem_cgroup_uncharge_page(pages[i]);
419                                 put_page(pages[i]);
420                         }
421                         mem_cgroup_uncharge_end();
422                         kfree(pages);
423                         ret |= VM_FAULT_OOM;
424                         goto out;
425                 }
426         }
427
428         for (i = 0; i < HPAGE_PMD_NR; i++) {
429                 copy_user_highpage(pages[i], page + i,
430                                    haddr + PAGE_SIZE*i, vma);
431                 __SetPageUptodate(pages[i]);
432                 cond_resched();
433         }
434
435         spin_lock(&mm->page_table_lock);
436         if (unlikely(!pmd_same(*pmd, orig_pmd)))
437                 goto out_free_pages;
438         VM_BUG_ON(!PageHead(page));
439
440         pmdp_clear_flush_notify(vma, haddr, pmd);
441         /* leave pmd empty until pte is filled */
442
443         pgtable = get_pmd_huge_pte(mm);
444         pmd_populate(mm, &_pmd, pgtable);
445
446         for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
447                 pte_t *pte, entry;
448                 entry = mk_pte(pages[i], vma->vm_page_prot);
449                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
450                 page_add_new_anon_rmap(pages[i], vma, haddr);
451                 pte = pte_offset_map(&_pmd, haddr);
452                 VM_BUG_ON(!pte_none(*pte));
453                 set_pte_at(mm, haddr, pte, entry);
454                 pte_unmap(pte);
455         }
456         kfree(pages);
457
458         mm->nr_ptes++;
459         smp_wmb(); /* make pte visible before pmd */
460         pmd_populate(mm, pmd, pgtable);
461         page_remove_rmap(page);
462         spin_unlock(&mm->page_table_lock);
463
464         ret |= VM_FAULT_WRITE;
465         put_page(page);
466
467 out:
468         return ret;
469
470 out_free_pages:
471         spin_unlock(&mm->page_table_lock);
472         mem_cgroup_uncharge_start();
473         for (i = 0; i < HPAGE_PMD_NR; i++) {
474                 mem_cgroup_uncharge_page(pages[i]);
475                 put_page(pages[i]);
476         }
477         mem_cgroup_uncharge_end();
478         kfree(pages);
479         goto out;
480 }
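
/*
 * The fallback above is effectively a COW split: when no replacement
 * hugepage can be allocated, the data is copied into HPAGE_PMD_NR
 * individually charged small pages, the stashed pte table is withdrawn
 * with get_pmd_huge_pte() and the range is remapped with regular ptes,
 * dropping the huge mapping for good.
 */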
481
482 int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
483                         unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
484 {
485         int ret = 0;
486         struct page *page, *new_page;
487         unsigned long haddr;
488
489         VM_BUG_ON(!vma->anon_vma);
490         spin_lock(&mm->page_table_lock);
491         if (unlikely(!pmd_same(*pmd, orig_pmd)))
492                 goto out_unlock;
493
494         page = pmd_page(orig_pmd);
495         VM_BUG_ON(!PageCompound(page) || !PageHead(page));
496         haddr = address & HPAGE_PMD_MASK;
497         if (page_mapcount(page) == 1) {
498                 pmd_t entry;
499                 entry = pmd_mkyoung(orig_pmd);
500                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
501                 if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
502                         update_mmu_cache(vma, address, entry);
503                 ret |= VM_FAULT_WRITE;
504                 goto out_unlock;
505         }
506         get_page(page);
507         spin_unlock(&mm->page_table_lock);
508
509         if (transparent_hugepage_enabled(vma) &&
510             !transparent_hugepage_debug_cow())
511                 new_page = alloc_hugepage(transparent_hugepage_defrag(vma));
512         else
513                 new_page = NULL;
514
515         if (unlikely(!new_page)) {
516                 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
517                                                    pmd, orig_pmd, page, haddr);
518                 put_page(page);
519                 goto out;
520         }
521
522         if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
523                 put_page(new_page);
524                 put_page(page);
525                 ret |= VM_FAULT_OOM;
526                 goto out;
527         }
528
529         copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
530         __SetPageUptodate(new_page);
531
532         spin_lock(&mm->page_table_lock);
533         put_page(page);
534         if (unlikely(!pmd_same(*pmd, orig_pmd))) {
535                 mem_cgroup_uncharge_page(new_page);
536                 put_page(new_page);
537         } else {
538                 pmd_t entry;
539                 VM_BUG_ON(!PageHead(page));
540                 entry = mk_pmd(new_page, vma->vm_page_prot);
541                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
542                 entry = pmd_mkhuge(entry);
543                 pmdp_clear_flush_notify(vma, haddr, pmd);
544                 page_add_new_anon_rmap(new_page, vma, haddr);
545                 set_pmd_at(mm, haddr, pmd, entry);
546                 update_mmu_cache(vma, address, entry);
547                 page_remove_rmap(page);
548                 put_page(page);
549                 ret |= VM_FAULT_WRITE;
550         }
551 out_unlock:
552         spin_unlock(&mm->page_table_lock);
553 out:
554         return ret;
555 }
556
557 struct page *follow_trans_huge_pmd(struct mm_struct *mm,
558                                    unsigned long addr,
559                                    pmd_t *pmd,
560                                    unsigned int flags)
561 {
562         struct page *page = NULL;
563
564         assert_spin_locked(&mm->page_table_lock);
565
566         if (flags & FOLL_WRITE && !pmd_write(*pmd))
567                 goto out;
568
569         page = pmd_page(*pmd);
570         VM_BUG_ON(!PageHead(page));
571         if (flags & FOLL_TOUCH) {
572                 pmd_t _pmd;
573                 /*
574                  * We should set the dirty bit only for FOLL_WRITE but
575                  * for now the dirty bit in the pmd is meaningless.
576                  * If the dirty bit ever becomes meaningful and we
577                  * only set it with FOLL_WRITE, an atomic
578                  * set_bit will be required on the pmd to set the
579                  * young bit, instead of the current set_pmd_at.
580                  */
581                 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
582                 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
583         }
584         page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
585         VM_BUG_ON(!PageCompound(page));
586         if (flags & FOLL_GET)
587                 get_page(page);
588
589 out:
590         return page;
591 }
592
593 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
594                  pmd_t *pmd)
595 {
596         int ret = 0;
597
598         spin_lock(&tlb->mm->page_table_lock);
599         if (likely(pmd_trans_huge(*pmd))) {
600                 if (unlikely(pmd_trans_splitting(*pmd))) {
601                         spin_unlock(&tlb->mm->page_table_lock);
602                         wait_split_huge_page(vma->anon_vma,
603                                              pmd);
604                 } else {
605                         struct page *page;
606                         pgtable_t pgtable;
607                         pgtable = get_pmd_huge_pte(tlb->mm);
608                         page = pmd_page(*pmd);
609                         pmd_clear(pmd);
610                         page_remove_rmap(page);
611                         VM_BUG_ON(page_mapcount(page) < 0);
612                         add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
613                         VM_BUG_ON(!PageHead(page));
614                         spin_unlock(&tlb->mm->page_table_lock);
615                         tlb_remove_page(tlb, page);
616                         pte_free(tlb->mm, pgtable);
617                         ret = 1;
618                 }
619         } else
620                 spin_unlock(&tlb->mm->page_table_lock);
621
622         return ret;
623 }
624
625 pmd_t *page_check_address_pmd(struct page *page,
626                               struct mm_struct *mm,
627                               unsigned long address,
628                               enum page_check_address_pmd_flag flag)
629 {
630         pgd_t *pgd;
631         pud_t *pud;
632         pmd_t *pmd, *ret = NULL;
633
634         if (address & ~HPAGE_PMD_MASK)
635                 goto out;
636
637         pgd = pgd_offset(mm, address);
638         if (!pgd_present(*pgd))
639                 goto out;
640
641         pud = pud_offset(pgd, address);
642         if (!pud_present(*pud))
643                 goto out;
644
645         pmd = pmd_offset(pud, address);
646         if (pmd_none(*pmd))
647                 goto out;
648         if (pmd_page(*pmd) != page)
649                 goto out;
650         VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
651                   pmd_trans_splitting(*pmd));
652         if (pmd_trans_huge(*pmd)) {
653                 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
654                           !pmd_trans_splitting(*pmd));
655                 ret = pmd;
656         }
657 out:
658         return ret;
659 }
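
/*
 * page_check_address_pmd() is the pmd level analogue of
 * page_check_address(): it returns the pmd mapping @page at @address,
 * but only while that pmd is still huge (both callers in this file
 * invoke it under mm->page_table_lock). The flag lets callers assert
 * which side of the split they expect to see, e.g.:
 *
 *   pmd = page_check_address_pmd(page, mm, address,
 *                                PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
 *
 * as done by __split_huge_page_splitting() below.
 */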
660
661 static int __split_huge_page_splitting(struct page *page,
662                                        struct vm_area_struct *vma,
663                                        unsigned long address)
664 {
665         struct mm_struct *mm = vma->vm_mm;
666         pmd_t *pmd;
667         int ret = 0;
668
669         spin_lock(&mm->page_table_lock);
670         pmd = page_check_address_pmd(page, mm, address,
671                                      PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
672         if (pmd) {
673                 /*
674                  * We can't temporarily set the pmd to null in order
675                  * to split it; the pmd must remain marked huge at all
676                  * times or the VM won't take the pmd_trans_huge paths
677                  * and it won't wait on the anon_vma->root->lock to
678                  * serialize against split_huge_page*.
679                  */
680                 pmdp_splitting_flush_notify(vma, address, pmd);
681                 ret = 1;
682         }
683         spin_unlock(&mm->page_table_lock);
684
685         return ret;
686 }
687
688 static void __split_huge_page_refcount(struct page *page)
689 {
690         int i;
691         unsigned long head_index = page->index;
692         struct zone *zone = page_zone(page);
693
694         /* prevent PageLRU from going away from under us, and freeze lru stats */
695         spin_lock_irq(&zone->lru_lock);
696         compound_lock(page);
697
698         for (i = 1; i < HPAGE_PMD_NR; i++) {
699                 struct page *page_tail = page + i;
700
701                 /* tail_page->_count cannot change */
702                 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
703                 BUG_ON(page_count(page) <= 0);
704                 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
705                 BUG_ON(atomic_read(&page_tail->_count) <= 0);
706
707                 /* after clearing PageTail the gup refcount can be released */
708                 smp_mb();
709
710                 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
711                 page_tail->flags |= (page->flags &
712                                      ((1L << PG_referenced) |
713                                       (1L << PG_swapbacked) |
714                                       (1L << PG_mlocked) |
715                                       (1L << PG_uptodate)));
716                 page_tail->flags |= (1L << PG_dirty);
717
718                 /*
719                  * 1) clear PageTail before overwriting first_page
720                  * 2) clear PageTail before clearing PageHead for VM_BUG_ON
721                  */
722                 smp_wmb();
723
724                 /*
725                  * __split_huge_page_splitting() already set the
726                  * splitting bit in all pmds that could map this
727                  * hugepage; that ensures no CPU can alter the
728                  * mapcount on the head page. The mapcount is only
729                  * accounted in the head page and it has to be
730                  * transferred to all tail pages in the below code. So
731                  * for this code to be safe, the mapcount can't change
732                  * during the split. But that doesn't mean userland can't
733                  * keep changing and reading the page contents while
734                  * we transfer the mapcount, so the pmd splitting
735                  * status is achieved by setting a reserved bit in the
736                  * pmd, not by clearing the present bit.
737                  */
738                 BUG_ON(page_mapcount(page_tail));
739                 page_tail->_mapcount = page->_mapcount;
740
741                 BUG_ON(page_tail->mapping);
742                 page_tail->mapping = page->mapping;
743
744                 page_tail->index = ++head_index;
745
746                 BUG_ON(!PageAnon(page_tail));
747                 BUG_ON(!PageUptodate(page_tail));
748                 BUG_ON(!PageDirty(page_tail));
749                 BUG_ON(!PageSwapBacked(page_tail));
750
751                 lru_add_page_tail(zone, page, page_tail);
752         }
753
754         ClearPageCompound(page);
755         compound_unlock(page);
756         spin_unlock_irq(&zone->lru_lock);
757
758         for (i = 1; i < HPAGE_PMD_NR; i++) {
759                 struct page *page_tail = page + i;
760                 BUG_ON(page_count(page_tail) <= 0);
761                 /*
762                  * Tail pages may be freed if there wasn't any mapping,
763                  * e.g. if add_to_swap() is running on an lru page that
764                  * had its mapping zapped. And freeing these pages
765                  * requires taking the lru_lock so we do the put_page
766                  * of the tail pages after the split is complete.
767                  */
768                 put_page(page_tail);
769         }
770
771         /*
772          * Only the head page (now a regular page) is required
773          * to be pinned by the caller.
774          */
775         BUG_ON(page_count(page) <= 0);
776 }
777
778 static int __split_huge_page_map(struct page *page,
779                                  struct vm_area_struct *vma,
780                                  unsigned long address)
781 {
782         struct mm_struct *mm = vma->vm_mm;
783         pmd_t *pmd, _pmd;
784         int ret = 0, i;
785         pgtable_t pgtable;
786         unsigned long haddr;
787
788         spin_lock(&mm->page_table_lock);
789         pmd = page_check_address_pmd(page, mm, address,
790                                      PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
791         if (pmd) {
792                 pgtable = get_pmd_huge_pte(mm);
793                 pmd_populate(mm, &_pmd, pgtable);
794
795                 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
796                      i++, haddr += PAGE_SIZE) {
797                         pte_t *pte, entry;
798                         BUG_ON(PageCompound(page+i));
799                         entry = mk_pte(page + i, vma->vm_page_prot);
800                         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
801                         if (!pmd_write(*pmd))
802                                 entry = pte_wrprotect(entry);
803                         else
804                                 BUG_ON(page_mapcount(page) != 1);
805                         if (!pmd_young(*pmd))
806                                 entry = pte_mkold(entry);
807                         pte = pte_offset_map(&_pmd, haddr);
808                         BUG_ON(!pte_none(*pte));
809                         set_pte_at(mm, haddr, pte, entry);
810                         pte_unmap(pte);
811                 }
812
813                 mm->nr_ptes++;
814                 smp_wmb(); /* make pte visible before pmd */
815                 /*
816                  * Up to this point the pmd is present and huge, and
817                  * userland retains full access to the hugepage
818                  * during the split (which happens in place). If we
819                  * overwrite the pmd with the not-huge version
820                  * pointing to the pte here (which of course we could
821                  * if all CPUs were bug free), userland could trigger
822                  * a small page size TLB miss on the small sized TLB
823                  * while the hugepage TLB entry is still established
824                  * in the huge TLB. Some CPUs don't like that. See
825                  * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
826                  * Erratum 383 on page 93. Intel should be safe, but it
827                  * also warns that it's only safe if the permission
828                  * and cache attributes of the two entries loaded in
829                  * the two TLBs are identical (which should be the case
830                  * here). But it is generally safer to never allow
831                  * small and huge TLB entries for the same virtual
832                  * address to be loaded simultaneously. So instead of
833                  * doing "pmd_populate(); flush_tlb_range();" we first
834                  * mark the current pmd notpresent (atomically because
835                  * here the pmd_trans_huge and pmd_trans_splitting
836                  * must remain set at all times on the pmd until the
837                  * split is complete for this pmd), then we flush the
838                  * SMP TLB and finally we write the non-huge version
839                  * of the pmd entry with pmd_populate.
840                  */
841                 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
842                 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
843                 pmd_populate(mm, pmd, pgtable);
844                 ret = 1;
845         }
846         spin_unlock(&mm->page_table_lock);
847
848         return ret;
849 }
850
851 /* must be called with anon_vma->root->lock held */
852 static void __split_huge_page(struct page *page,
853                               struct anon_vma *anon_vma)
854 {
855         int mapcount, mapcount2;
856         struct anon_vma_chain *avc;
857
858         BUG_ON(!PageHead(page));
859         BUG_ON(PageTail(page));
860
861         mapcount = 0;
862         list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
863                 struct vm_area_struct *vma = avc->vma;
864                 unsigned long addr = vma_address(page, vma);
865                 BUG_ON(is_vma_temporary_stack(vma));
866                 if (addr == -EFAULT)
867                         continue;
868                 mapcount += __split_huge_page_splitting(page, vma, addr);
869         }
870         /*
871          * It is critical that new vmas are added to the tail of the
872          * anon_vma list. This guarantees that if copy_huge_pmd() runs
873          * and establishes a child pmd before
874          * __split_huge_page_splitting() freezes the parent pmd (so if
875          * we fail to prevent copy_huge_pmd() from running until the
876          * whole __split_huge_page() is complete), we will still see
877          * the newly established pmd of the child later during the
878          * walk, to be able to set it as pmd_trans_splitting too.
879          */
880         if (mapcount != page_mapcount(page))
881                 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
882                        mapcount, page_mapcount(page));
883         BUG_ON(mapcount != page_mapcount(page));
884
885         __split_huge_page_refcount(page);
886
887         mapcount2 = 0;
888         list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
889                 struct vm_area_struct *vma = avc->vma;
890                 unsigned long addr = vma_address(page, vma);
891                 BUG_ON(is_vma_temporary_stack(vma));
892                 if (addr == -EFAULT)
893                         continue;
894                 mapcount2 += __split_huge_page_map(page, vma, addr);
895         }
896         if (mapcount != mapcount2)
897                 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
898                        mapcount, mapcount2, page_mapcount(page));
899         BUG_ON(mapcount != mapcount2);
900 }
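
/*
 * Putting the helpers together, __split_huge_page() proceeds in three
 * phases under the anon_vma lock:
 *
 *   1) __split_huge_page_splitting() marks every pmd mapping the page
 *      as splitting, freezing the mapcount,
 *   2) __split_huge_page_refcount() distributes the head page's
 *      refcounts, mapcount and flags to the tail pages and clears
 *      PageCompound,
 *   3) __split_huge_page_map() rewrites each pmd as a regular pte page
 *      table, backed by the pgtable that was deposited when the huge
 *      pmd was established.
 */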
901
902 int split_huge_page(struct page *page)
903 {
904         struct anon_vma *anon_vma;
905         int ret = 1;
906
907         BUG_ON(!PageAnon(page));
908         anon_vma = page_lock_anon_vma(page);
909         if (!anon_vma)
910                 goto out;
911         ret = 0;
912         if (!PageCompound(page))
913                 goto out_unlock;
914
915         BUG_ON(!PageSwapBacked(page));
916         __split_huge_page(page, anon_vma);
917
918         BUG_ON(PageCompound(page));
919 out_unlock:
920         page_unlock_anon_vma(anon_vma);
921 out:
922         return ret;
923 }
924
925 int hugepage_madvise(unsigned long *vm_flags)
926 {
927         /*
928          * Be somewhat over-protective like KSM for now!
929          */
930         if (*vm_flags & (VM_HUGEPAGE | VM_SHARED  | VM_MAYSHARE   |
931                          VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
932                          VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
933                          VM_MIXEDMAP | VM_SAO))
934                 return -EINVAL;
935
936         *vm_flags |= VM_HUGEPAGE;
937
938         return 0;
939 }
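
/*
 * hugepage_madvise() backs the madvise(MADV_HUGEPAGE) path by tagging
 * the region with VM_HUGEPAGE. A rough userspace sketch ("len" is an
 * assumed length):
 *
 *   void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *   madvise(buf, len, MADV_HUGEPAGE);
 *
 * With the sysfs "enabled" knob set to madvise, only regions tagged this
 * way are meant to be eligible for huge pmds; an actual huge fault still
 * needs an HPAGE_PMD_SIZE aligned range inside the vma, as checked by
 * do_huge_pmd_anonymous_page().
 */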
940
941 void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
942 {
943         struct page *page;
944
945         spin_lock(&mm->page_table_lock);
946         if (unlikely(!pmd_trans_huge(*pmd))) {
947                 spin_unlock(&mm->page_table_lock);
948                 return;
949         }
950         page = pmd_page(*pmd);
951         VM_BUG_ON(!page_count(page));
952         get_page(page);
953         spin_unlock(&mm->page_table_lock);
954
955         split_huge_page(page);
956
957         put_page(page);
958         BUG_ON(pmd_trans_huge(*pmd));
959 }