arch/powerpc/mm/hugetlbpage.c

   1 /*
   2  * PPC64 (POWER4) Huge TLB Page Support for Kernel.
   3  *
   4  * Copyright (C) 2003 David Gibson, IBM Corporation.
   5  *
   6  * Based on the IA-32 version:
   7  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   8  */
   9
  10 #include <linux/init.h>
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/hugetlb.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/slab.h>
  16 #include <linux/err.h>
  17 #include <linux/sysctl.h>
  18 #include <asm/mman.h>
  19 #include <asm/pgalloc.h>
  20 #include <asm/tlb.h>
  21 #include <asm/tlbflush.h>
  22 #include <asm/mmu_context.h>
  23 #include <asm/machdep.h>
  24 #include <asm/cputable.h>
  25 #include <asm/spu.h>
  26
  27 #define PAGE_SHIFT_64K  16
  28 #define PAGE_SHIFT_16M  24
  29 #define PAGE_SHIFT_16G  34
  30
  31 #define NUM_LOW_AREAS   (0x100000000UL >> SID_SHIFT)
  32 #define NUM_HIGH_AREAS  (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
  33 #define MAX_NUMBER_GPAGES       1024
  34
  35 /* Tracks the 16G pages after the device tree is scanned and before the
  36  * huge_boot_pages list is ready.  */
  37 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
  38 static unsigned nr_gpages;
  39
  40 /* Array of valid huge page sizes - non-zero value(hugepte_shift) is
  41  * stored for the huge page sizes that are valid.
  42  */
  43 unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
  44
  45 #define hugepte_shift                   mmu_huge_psizes
  46 #define HUGEPTE_INDEX_SIZE(psize)       (mmu_huge_psizes[(psize)])
  47 #define PTRS_PER_HUGEPTE(psize)         (1 << mmu_huge_psizes[psize])
  48
  49 #define HUGEPD_SHIFT(psize)             (mmu_psize_to_shift(psize) \
  50                                          + HUGEPTE_INDEX_SIZE(psize))
  51 #define HUGEPD_SIZE(psize)              (1UL << HUGEPD_SHIFT(psize))
  52 #define HUGEPD_MASK(psize)              (~(HUGEPD_SIZE(psize)-1))
  53
  54 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  55  * will choke on pointers to hugepte tables, which is handy for
  56  * catching screwups early. */
  57 #define HUGEPD_OK       0x1
  58
  59 typedef struct { unsigned long pd; } hugepd_t;
  60
  61 #define hugepd_none(hpd)        ((hpd).pd == 0)
  62
  63 static inline int shift_to_mmu_psize(unsigned int shift)
  64 {
  65         switch (shift) {
  66 #ifndef CONFIG_PPC_64K_PAGES
  67         case PAGE_SHIFT_64K:
  68             return MMU_PAGE_64K;
  69 #endif
  70         case PAGE_SHIFT_16M:
  71             return MMU_PAGE_16M;
  72         case PAGE_SHIFT_16G:
  73             return MMU_PAGE_16G;
  74         }
  75         return -1;
  76 }
  77
  78 static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
  79 {
  80         if (mmu_psize_defs[mmu_psize].shift)
  81                 return mmu_psize_defs[mmu_psize].shift;
  82         BUG();
  83 }
  84
  85 static inline pte_t *hugepd_page(hugepd_t hpd)
  86 {
  87         BUG_ON(!(hpd.pd & HUGEPD_OK));
  88         return (pte_t *)(hpd.pd & ~HUGEPD_OK);
  89 }
  90
  91 static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
  92                                     struct hstate *hstate)
  93 {
  94         unsigned int shift = huge_page_shift(hstate);
  95         int psize = shift_to_mmu_psize(shift);
  96         unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
  97         pte_t *dir = hugepd_page(*hpdp);
  98
  99         return dir + idx;
 100 }
 101
 102 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 103                            unsigned long address, unsigned int psize)
 104 {
 105         pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]),
 106                                        GFP_KERNEL|__GFP_REPEAT);
 107
 108         if (! new)
 109                 return -ENOMEM;
 110
 111         spin_lock(&mm->page_table_lock);
 112         if (!hugepd_none(*hpdp))
 113                 kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new);
 114         else
 115                 hpdp->pd = (unsigned long)new | HUGEPD_OK;
 116         spin_unlock(&mm->page_table_lock);
 117         return 0;
 118 }
 119
 120
 121 static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
 122 {
 123         if (huge_page_shift(hstate) < PUD_SHIFT)
 124                 return pud_offset(pgd, addr);
 125         else
 126                 return (pud_t *) pgd;
 127 }
 128 static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
 129                          struct hstate *hstate)
 130 {
 131         if (huge_page_shift(hstate) < PUD_SHIFT)
 132                 return pud_alloc(mm, pgd, addr);
 133         else
 134                 return (pud_t *) pgd;
 135 }
 136 static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
 137 {
 138         if (huge_page_shift(hstate) < PMD_SHIFT)
 139                 return pmd_offset(pud, addr);
 140         else
 141                 return (pmd_t *) pud;
 142 }
 143 static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
 144                          struct hstate *hstate)
 145 {
 146         if (huge_page_shift(hstate) < PMD_SHIFT)
 147                 return pmd_alloc(mm, pud, addr);
 148         else
 149                 return (pmd_t *) pud;
 150 }
 151
 152 /* Build list of addresses of gigantic pages.  This function is used in early
 153  * boot before the buddy or bootmem allocator is setup.
 154  */
 155 void add_gpage(unsigned long addr, unsigned long page_size,
 156         unsigned long number_of_pages)
 157 {
 158         if (!addr)
 159                 return;
 160         while (number_of_pages > 0) {
 161                 gpage_freearray[nr_gpages] = addr;
 162                 nr_gpages++;
 163                 number_of_pages--;
 164                 addr += page_size;
 165         }
 166 }
 167
 168 /* Moves the gigantic page addresses from the temporary list to the
 169  * huge_boot_pages list.
 170  */
 171 int alloc_bootmem_huge_page(struct hstate *hstate)
 172 {
 173         struct huge_bootmem_page *m;
 174         if (nr_gpages == 0)
 175                 return 0;
 176         m = phys_to_virt(gpage_freearray[--nr_gpages]);
 177         gpage_freearray[nr_gpages] = 0;
 178         list_add(&m->list, &huge_boot_pages);
 179         m->hstate = hstate;
 180         return 1;
 181 }
 182
 183
 184 /* Modelled after find_linux_pte() */
 185 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 186 {
 187         pgd_t *pg;
 188         pud_t *pu;
 189         pmd_t *pm;
 190
 191         unsigned int psize;
 192         unsigned int shift;
 193         unsigned long sz;
 194         struct hstate *hstate;
 195         psize = get_slice_psize(mm, addr);
 196         shift = mmu_psize_to_shift(psize);
 197         sz = ((1UL) << shift);
 198         hstate = size_to_hstate(sz);
 199
 200         addr &= hstate->mask;
 201
 202         pg = pgd_offset(mm, addr);
 203         if (!pgd_none(*pg)) {
 204                 pu = hpud_offset(pg, addr, hstate);
 205                 if (!pud_none(*pu)) {
 206                         pm = hpmd_offset(pu, addr, hstate);
 207                         if (!pmd_none(*pm))
 208                                 return hugepte_offset((hugepd_t *)pm, addr,
 209                                                       hstate);
 210                 }
 211         }
 212
 213         return NULL;
 214 }
 215
 216 pte_t *huge_pte_alloc(struct mm_struct *mm,
 217                         unsigned long addr, unsigned long sz)
 218 {
 219         pgd_t *pg;
 220         pud_t *pu;
 221         pmd_t *pm;
 222         hugepd_t *hpdp = NULL;
 223         struct hstate *hstate;
 224         unsigned int psize;
 225         hstate = size_to_hstate(sz);
 226
 227         psize = get_slice_psize(mm, addr);
 228         BUG_ON(!mmu_huge_psizes[psize]);
 229
 230         addr &= hstate->mask;
 231
 232         pg = pgd_offset(mm, addr);
 233         pu = hpud_alloc(mm, pg, addr, hstate);
 234
 235         if (pu) {
 236                 pm = hpmd_alloc(mm, pu, addr, hstate);
 237                 if (pm)
 238                         hpdp = (hugepd_t *)pm;
 239         }
 240
 241         if (! hpdp)
 242                 return NULL;
 243
 244         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
 245                 return NULL;
 246
 247         return hugepte_offset(hpdp, addr, hstate);
 248 }
 249
 250 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 251 {
 252         return 0;
 253 }
 254
 255 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
 256                                unsigned int psize)
 257 {
 258         pte_t *hugepte = hugepd_page(*hpdp);
 259
 260         hpdp->pd = 0;
 261         tlb->need_flush = 1;
 262         pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]);
 263 }
 264
 265 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 266                                    unsigned long addr, unsigned long end,
 267                                    unsigned long floor, unsigned long ceiling,
 268                                    unsigned int psize)
 269 {
 270         pmd_t *pmd;
 271         unsigned long next;
 272         unsigned long start;
 273
 274         start = addr;
 275         pmd = pmd_offset(pud, addr);
 276         do {
 277                 next = pmd_addr_end(addr, end);
 278                 if (pmd_none(*pmd))
 279                         continue;
 280                 free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
 281         } while (pmd++, addr = next, addr != end);
 282
 283         start &= PUD_MASK;
 284         if (start < floor)
 285                 return;
 286         if (ceiling) {
 287                 ceiling &= PUD_MASK;
 288                 if (!ceiling)
 289                         return;
 290         }
 291         if (end - 1 > ceiling - 1)
 292                 return;
 293
 294         pmd = pmd_offset(pud, start);
 295         pud_clear(pud);
 296         pmd_free_tlb(tlb, pmd, start);
 297 }
 298
 299 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 300                                    unsigned long addr, unsigned long end,
 301                                    unsigned long floor, unsigned long ceiling)
 302 {
 303         pud_t *pud;
 304         unsigned long next;
 305         unsigned long start;
 306         unsigned int shift;
 307         unsigned int psize = get_slice_psize(tlb->mm, addr);
 308         shift = mmu_psize_to_shift(psize);
 309
 310         start = addr;
 311         pud = pud_offset(pgd, addr);
 312         do {
 313                 next = pud_addr_end(addr, end);
 314                 if (shift < PMD_SHIFT) {
 315                         if (pud_none_or_clear_bad(pud))
 316                                 continue;
 317                         hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
 318                                                ceiling, psize);
 319                 } else {
 320                         if (pud_none(*pud))
 321                                 continue;
 322                         free_hugepte_range(tlb, (hugepd_t *)pud, psize);
 323                 }
 324         } while (pud++, addr = next, addr != end);
 325
 326         start &= PGDIR_MASK;
 327         if (start < floor)
 328                 return;
 329         if (ceiling) {
 330                 ceiling &= PGDIR_MASK;
 331                 if (!ceiling)
 332                         return;
 333         }
 334         if (end - 1 > ceiling - 1)
 335                 return;
 336
 337         pud = pud_offset(pgd, start);
 338         pgd_clear(pgd);
 339         pud_free_tlb(tlb, pud, start);
 340 }
 341
 342 /*
 343  * This function frees user-level page tables of a process.
 344  *
 345  * Must be called with pagetable lock held.
 346  */
 347 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 348                             unsigned long addr, unsigned long end,
 349                             unsigned long floor, unsigned long ceiling)
 350 {
 351         pgd_t *pgd;
 352         unsigned long next;
 353         unsigned long start;
 354
 355         /*
 356          * Comments below take from the normal free_pgd_range().  They
 357          * apply here too.  The tests against HUGEPD_MASK below are
 358          * essential, because we *don't* test for this at the bottom
 359          * level.  Without them we'll attempt to free a hugepte table
 360          * when we unmap just part of it, even if there are other
 361          * active mappings using it.
 362          *
 363          * The next few lines have given us lots of grief...
 364          *
 365          * Why are we testing HUGEPD* at this top level?  Because
 366          * often there will be no work to do at all, and we'd prefer
 367          * not to go all the way down to the bottom just to discover
 368          * that.
 369          *
 370          * Why all these "- 1"s?  Because 0 represents both the bottom
 371          * of the address space and the top of it (using -1 for the
 372          * top wouldn't help much: the masks would do the wrong thing).
 373          * The rule is that addr 0 and floor 0 refer to the bottom of
 374          * the address space, but end 0 and ceiling 0 refer to the top
 375          * Comparisons need to use "end - 1" and "ceiling - 1" (though
 376          * that end 0 case should be mythical).
 377          *
 378          * Wherever addr is brought up or ceiling brought down, we
 379          * must be careful to reject "the opposite 0" before it
 380          * confuses the subsequent tests.  But what about where end is
 381          * brought down by HUGEPD_SIZE below? no, end can't go down to
 382          * 0 there.
 383          *
 384          * Whereas we round start (addr) and ceiling down, by different
 385          * masks at different levels, in order to test whether a table
 386          * now has no other vmas using it, so can be freed, we don't
 387          * bother to round floor or end up - the tests don't need that.
 388          */
 389         unsigned int psize = get_slice_psize(tlb->mm, addr);
 390
 391         addr &= HUGEPD_MASK(psize);
 392         if (addr < floor) {
 393                 addr += HUGEPD_SIZE(psize);
 394                 if (!addr)
 395                         return;
 396         }
 397         if (ceiling) {
 398                 ceiling &= HUGEPD_MASK(psize);
 399                 if (!ceiling)
 400                         return;
 401         }
 402         if (end - 1 > ceiling - 1)
 403                 end -= HUGEPD_SIZE(psize);
 404         if (addr > end - 1)
 405                 return;
 406
 407         start = addr;
 408         pgd = pgd_offset(tlb->mm, addr);
 409         do {
 410                 psize = get_slice_psize(tlb->mm, addr);
 411                 BUG_ON(!mmu_huge_psizes[psize]);
 412                 next = pgd_addr_end(addr, end);
 413                 if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
 414                         if (pgd_none_or_clear_bad(pgd))
 415                                 continue;
 416                         hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 417                 } else {
 418                         if (pgd_none(*pgd))
 419                                 continue;
 420                         free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
 421                 }
 422         } while (pgd++, addr = next, addr != end);
 423 }
 424
 425 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 426                      pte_t *ptep, pte_t pte)
 427 {
 428         if (pte_present(*ptep)) {
 429                 /* We open-code pte_clear because we need to pass the right
 430                  * argument to hpte_need_flush (huge / !huge). Might not be
 431                  * necessary anymore if we make hpte_need_flush() get the
 432                  * page size from the slices
 433                  */
 434                 pte_update(mm, addr, ptep, ~0UL, 1);
 435         }
 436         *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 437 }
 438
 439 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 440                               pte_t *ptep)
 441 {
 442         unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
 443         return __pte(old);
 444 }
 445
 446 struct page *
 447 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 448 {
 449         pte_t *ptep;
 450         struct page *page;
 451         unsigned int mmu_psize = get_slice_psize(mm, address);
 452
 453         /* Verify it is a huge page else bail. */
 454         if (!mmu_huge_psizes[mmu_psize])
 455                 return ERR_PTR(-EINVAL);
 456
 457         ptep = huge_pte_offset(mm, address);
 458         page = pte_page(*ptep);
 459         if (page) {
 460                 unsigned int shift = mmu_psize_to_shift(mmu_psize);
 461                 unsigned long sz = ((1UL) << shift);
 462                 page += (address % sz) / PAGE_SIZE;
 463         }
 464
 465         return page;
 466 }
 467
 468 int pmd_huge(pmd_t pmd)
 469 {
 470         return 0;
 471 }
 472
 473 int pud_huge(pud_t pud)
 474 {
 475         return 0;
 476 }
 477
 478 struct page *
 479 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 480                 pmd_t *pmd, int write)
 481 {
 482         BUG();
 483         return NULL;
 484 }
 485
 486
 487 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 488                                         unsigned long len, unsigned long pgoff,
 489                                         unsigned long flags)
 490 {
 491         struct hstate *hstate = hstate_file(file);
 492         int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 493
 494         if (!mmu_huge_psizes[mmu_psize])
 495                 return -EINVAL;
 496         return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 497 }
 498
 499 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 500 {
 501         unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 502
 503         return 1UL << mmu_psize_to_shift(psize);
 504 }
 505
 506 /*
 507  * Called by asm hashtable.S for doing lazy icache flush
 508  */
 509 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 510                                         pte_t pte, int trap, unsigned long sz)
 511 {
 512         struct page *page;
 513         int i;
 514
 515         if (!pfn_valid(pte_pfn(pte)))
 516                 return rflags;
 517
 518         page = pte_page(pte);
 519
 520         /* page is dirty */
 521         if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 522                 if (trap == 0x400) {
 523                         for (i = 0; i < (sz / PAGE_SIZE); i++)
 524                                 __flush_dcache_icache(page_address(page+i));
 525                         set_bit(PG_arch_1, &page->flags);
 526                 } else {
 527                         rflags |= HPTE_R_N;
 528                 }
 529         }
 530         return rflags;
 531 }
 532
 533 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 534                    unsigned long ea, unsigned long vsid, int local,
 535                    unsigned long trap)
 536 {
 537         pte_t *ptep;
 538         unsigned long old_pte, new_pte;
 539         unsigned long va, rflags, pa, sz;
 540         long slot;
 541         int err = 1;
 542         int ssize = user_segment_size(ea);
 543         unsigned int mmu_psize;
 544         int shift;
 545         mmu_psize = get_slice_psize(mm, ea);
 546
 547         if (!mmu_huge_psizes[mmu_psize])
 548                 goto out;
 549         ptep = huge_pte_offset(mm, ea);
 550
 551         /* Search the Linux page table for a match with va */
 552         va = hpt_va(ea, vsid, ssize);
 553
 554         /*
 555          * If no pte found or not present, send the problem up to
 556          * do_page_fault
 557          */
 558         if (unlikely(!ptep || pte_none(*ptep)))
 559                 goto out;
 560
 561         /*
 562          * Check the user's access rights to the page.  If access should be
 563          * prevented then send the problem up to do_page_fault.
 564          */
 565         if (unlikely(access & ~pte_val(*ptep)))
 566                 goto out;
 567         /*
 568          * At this point, we have a pte (old_pte) which can be used to build
 569          * or update an HPTE. There are 2 cases:
 570          *
 571          * 1. There is a valid (present) pte with no associated HPTE (this is
 572          *      the most common case)
 573          * 2. There is a valid (present) pte with an associated HPTE. The
 574          *      current values of the pp bits in the HPTE prevent access
 575          *      because we are doing software DIRTY bit management and the
 576          *      page is currently not DIRTY.
 577          */
 578
 579
 580         do {
 581                 old_pte = pte_val(*ptep);
 582                 if (old_pte & _PAGE_BUSY)
 583                         goto out;
 584                 new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
 585         } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
 586                                          old_pte, new_pte));
 587
 588         rflags = 0x2 | (!(new_pte & _PAGE_RW));
 589         /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 590         rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
 591         shift = mmu_psize_to_shift(mmu_psize);
 592         sz = ((1UL) << shift);
 593         if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 594                 /* No CPU has hugepages but lacks no execute, so we
 595                  * don't need to worry about that case */
 596                 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
 597                                                        trap, sz);
 598
 599         /* Check if pte already has an hpte (case 2) */
 600         if (unlikely(old_pte & _PAGE_HASHPTE)) {
 601                 /* There MIGHT be an HPTE for this pte */
 602                 unsigned long hash, slot;
 603
 604                 hash = hpt_hash(va, shift, ssize);
 605                 if (old_pte & _PAGE_F_SECOND)
 606                         hash = ~hash;
 607                 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 608                 slot += (old_pte & _PAGE_F_GIX) >> 12;
 609
 610                 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
 611                                          ssize, local) == -1)
 612                         old_pte &= ~_PAGE_HPTEFLAGS;
 613         }
 614
 615         if (likely(!(old_pte & _PAGE_HASHPTE))) {
 616                 unsigned long hash = hpt_hash(va, shift, ssize);
 617                 unsigned long hpte_group;
 618
 619                 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 620
 621 repeat:
 622                 hpte_group = ((hash & htab_hash_mask) *
 623                               HPTES_PER_GROUP) & ~0x7UL;
 624
 625                 /* clear HPTE slot informations in new PTE */
 626 #ifdef CONFIG_PPC_64K_PAGES
 627                 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
 628 #else
 629                 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 630 #endif
 631                 /* Add in WIMG bits */
 632                 rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
 633                                       _PAGE_COHERENT | _PAGE_GUARDED));
 634
 635                 /* Insert into the hash table, primary slot */
 636                 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
 637                                           mmu_psize, ssize);
 638
 639                 /* Primary is full, try the secondary */
 640                 if (unlikely(slot == -1)) {
 641                         hpte_group = ((~hash & htab_hash_mask) *
 642                                       HPTES_PER_GROUP) & ~0x7UL;
 643                         slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 644                                                   HPTE_V_SECONDARY,
 645                                                   mmu_psize, ssize);
 646                         if (slot == -1) {
 647                                 if (mftb() & 0x1)
 648                                         hpte_group = ((hash & htab_hash_mask) *
 649                                                       HPTES_PER_GROUP)&~0x7UL;
 650
 651                                 ppc_md.hpte_remove(hpte_group);
 652                                 goto repeat;
 653                         }
 654                 }
 655
 656                 if (unlikely(slot == -2))
 657                         panic("hash_huge_page: pte_insert failed\n");
 658
 659                 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
 660         }
 661
 662         /*
 663          * No need to use ldarx/stdcx here
 664          */
 665         *ptep = __pte(new_pte & ~_PAGE_BUSY);
 666
 667         err = 0;
 668
 669  out:
 670         return err;
 671 }
 672
 673 static void __init set_huge_psize(int psize)
 674 {
 675         /* Check that it is a page size supported by the hardware and
 676          * that it fits within pagetable limits. */
 677         if (mmu_psize_defs[psize].shift &&
 678                 mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
 679                 (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
 680                  mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
 681                  mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
 682                 /* Return if huge page size has already been setup or is the
 683                  * same as the base page size. */
 684                 if (mmu_huge_psizes[psize] ||
 685                    mmu_psize_defs[psize].shift == PAGE_SHIFT)
 686                         return;
 687                 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 688
 689                 switch (mmu_psize_defs[psize].shift) {
 690                 case PAGE_SHIFT_64K:
 691                     /* We only allow 64k hpages with 4k base page,
 692                      * which was checked above, and always put them
 693                      * at the PMD */
 694                     hugepte_shift[psize] = PMD_SHIFT;
 695                     break;
 696                 case PAGE_SHIFT_16M:
 697                     /* 16M pages can be at two different levels
 698                      * of pagestables based on base page size */
 699                     if (PAGE_SHIFT == PAGE_SHIFT_64K)
 700                             hugepte_shift[psize] = PMD_SHIFT;
 701                     else /* 4k base page */
 702                             hugepte_shift[psize] = PUD_SHIFT;
 703                     break;
 704                 case PAGE_SHIFT_16G:
 705                     /* 16G pages are always at PGD level */
 706                     hugepte_shift[psize] = PGDIR_SHIFT;
 707                     break;
 708                 }
 709                 hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
 710         } else
 711                 hugepte_shift[psize] = 0;
 712 }
 713
 714 static int __init hugepage_setup_sz(char *str)
 715 {
 716         unsigned long long size;
 717         int mmu_psize;
 718         int shift;
 719
 720         size = memparse(str, &str);
 721
 722         shift = __ffs(size);
 723         mmu_psize = shift_to_mmu_psize(shift);
 724         if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
 725                 set_huge_psize(mmu_psize);
 726         else
 727                 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 728
 729         return 1;
 730 }
 731 __setup("hugepagesz=", hugepage_setup_sz);
 732
 733 static int __init hugetlbpage_init(void)
 734 {
 735         unsigned int psize;
 736
 737         if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 738                 return -ENODEV;
 739
 740         /* Add supported huge page sizes.  Need to change
 741          *  HUGE_MAX_HSTATE if the number of supported huge page sizes
 742          *  changes.
 743          */
 744         set_huge_psize(MMU_PAGE_16M);
 745         set_huge_psize(MMU_PAGE_16G);
 746
 747         /* Temporarily disable support for 64K huge pages when 64K SPU local
 748          * store support is enabled as the current implementation conflicts.
 749          */
 750 #ifndef CONFIG_SPU_FS_64K_LS
 751         set_huge_psize(MMU_PAGE_64K);
 752 #endif
 753
 754         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 755                 if (mmu_huge_psizes[psize]) {
 756                         pgtable_cache_add(hugepte_shift[psize], NULL);
 757                         if (!PGT_CACHE(hugepte_shift[psize]))
 758                                 panic("hugetlbpage_init(): could not create "
 759                                       "pgtable cache for %d bit pagesize\n",
 760                                       mmu_psize_to_shift(psize));
 761                 }
 762         }
 763
 764         return 0;
 765 }
 766
 767 module_init(hugetlbpage_init);