arch/x86/xen/mmu_pv.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
  35  * when it is not actively in use.  This means that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/sched/mm.h>
  42 #include <linux/highmem.h>
  43 #include <linux/debugfs.h>
  44 #include <linux/bug.h>
  45 #include <linux/vmalloc.h>
  46 #include <linux/export.h>
  47 #include <linux/init.h>
  48 #include <linux/gfp.h>
  49 #include <linux/memblock.h>
  50 #include <linux/seq_file.h>
  51 #include <linux/crash_dump.h>
  52 #ifdef CONFIG_KEXEC_CORE
  53 #include <linux/kexec.h>
  54 #endif
  55
  56 #include <trace/events/xen.h>
  57
  58 #include <asm/pgtable.h>
  59 #include <asm/tlbflush.h>
  60 #include <asm/fixmap.h>
  61 #include <asm/mmu_context.h>
  62 #include <asm/setup.h>
  63 #include <asm/paravirt.h>
  64 #include <asm/e820/api.h>
  65 #include <asm/linkage.h>
  66 #include <asm/page.h>
  67 #include <asm/init.h>
  68 #include <asm/pat.h>
  69 #include <asm/smp.h>
  70
  71 #include <asm/xen/hypercall.h>
  72 #include <asm/xen/hypervisor.h>
  73
  74 #include <xen/xen.h>
  75 #include <xen/page.h>
  76 #include <xen/interface/xen.h>
  77 #include <xen/interface/hvm/hvm_op.h>
  78 #include <xen/interface/version.h>
  79 #include <xen/interface/memory.h>
  80 #include <xen/hvc-console.h>
  81
  82 #include "multicalls.h"
  83 #include "mmu.h"
  84 #include "debugfs.h"
  85
#ifdef CONFIG_X86_32
/*
 * Identity map, in addition to plain kernel map.  This needs to be
 * large enough to allocate page table pages to allocate the rest.
 * Each page can map 2MB.
 */
/* Number of pte slots reserved below: four pte pages' worth. */
#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
/* NOTE(review): presumably carved out of the brk area by RESERVE_BRK_ARRAY - confirm */
static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
#endif
#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping: one full table of PTRS_PER_PUD entries */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */
  99
/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
 116
/*
 * NOTE(review): if __initdata expands to a GCC attribute, its position
 * after the second declarator means it applies to xen_pt_size only;
 * xen_pt_base is then an ordinary static.  Confirm this is intended.
 */
static phys_addr_t xen_pt_base, xen_pt_size __initdata;

/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 124
 125 void make_lowmem_page_readonly(void *vaddr)
 126 {
 127         pte_t *pte, ptev;
 128         unsigned long address = (unsigned long)vaddr;
 129         unsigned int level;
 130
 131         pte = lookup_address(address, &level);
 132         if (pte == NULL)
 133                 return;         /* vaddr missing */
 134
 135         ptev = pte_wrprotect(*pte);
 136
 137         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 138                 BUG();
 139 }
 140
 141 void make_lowmem_page_readwrite(void *vaddr)
 142 {
 143         pte_t *pte, ptev;
 144         unsigned long address = (unsigned long)vaddr;
 145         unsigned int level;
 146
 147         pte = lookup_address(address, &level);
 148         if (pte == NULL)
 149                 return;         /* vaddr missing */
 150
 151         ptev = pte_mkwrite(*pte);
 152
 153         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 154                 BUG();
 155 }
 156
 157
 158 static bool xen_page_pinned(void *ptr)
 159 {
 160         struct page *page = virt_to_page(ptr);
 161
 162         return PagePinned(page);
 163 }
 164
 165 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 166 {
 167         struct multicall_space mcs;
 168         struct mmu_update *u;
 169
 170         trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
 171
 172         mcs = xen_mc_entry(sizeof(*u));
 173         u = mcs.args;
 174
 175         /* ptep might be kmapped when using 32-bit HIGHPTE */
 176         u->ptr = virt_to_machine(ptep).maddr;
 177         u->val = pte_val_ma(pteval);
 178
 179         MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
 180
 181         xen_mc_issue(PARAVIRT_LAZY_MMU);
 182 }
 183 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
 184
 185 static void xen_extend_mmu_update(const struct mmu_update *update)
 186 {
 187         struct multicall_space mcs;
 188         struct mmu_update *u;
 189
 190         mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 191
 192         if (mcs.mc != NULL) {
 193                 mcs.mc->args[1]++;
 194         } else {
 195                 mcs = __xen_mc_entry(sizeof(*u));
 196                 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 197         }
 198
 199         u = mcs.args;
 200         *u = *update;
 201 }
 202
 203 static void xen_extend_mmuext_op(const struct mmuext_op *op)
 204 {
 205         struct multicall_space mcs;
 206         struct mmuext_op *u;
 207
 208         mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
 209
 210         if (mcs.mc != NULL) {
 211                 mcs.mc->args[1]++;
 212         } else {
 213                 mcs = __xen_mc_entry(sizeof(*u));
 214                 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 215         }
 216
 217         u = mcs.args;
 218         *u = *op;
 219 }
 220
 221 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 222 {
 223         struct mmu_update u;
 224
 225         preempt_disable();
 226
 227         xen_mc_batch();
 228
 229         /* ptr may be ioremapped for 64-bit pagetable setup */
 230         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 231         u.val = pmd_val_ma(val);
 232         xen_extend_mmu_update(&u);
 233
 234         xen_mc_issue(PARAVIRT_LAZY_MMU);
 235
 236         preempt_enable();
 237 }
 238
 239 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 240 {
 241         trace_xen_mmu_set_pmd(ptr, val);
 242
 243         /* If page is not pinned, we can just update the entry
 244            directly */
 245         if (!xen_page_pinned(ptr)) {
 246                 *ptr = val;
 247                 return;
 248         }
 249
 250         xen_set_pmd_hyper(ptr, val);
 251 }
 252
 253 /*
 254  * Associate a virtual page frame with a given physical page frame
 255  * and protection flags for that frame.
 256  */
 257 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 258 {
 259         set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 260 }
 261
 262 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 263 {
 264         struct mmu_update u;
 265
 266         if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
 267                 return false;
 268
 269         xen_mc_batch();
 270
 271         u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 272         u.val = pte_val_ma(pteval);
 273         xen_extend_mmu_update(&u);
 274
 275         xen_mc_issue(PARAVIRT_LAZY_MMU);
 276
 277         return true;
 278 }
 279
 280 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 281 {
 282         if (!xen_batched_set_pte(ptep, pteval)) {
 283                 /*
 284                  * Could call native_set_pte() here and trap and
 285                  * emulate the PTE write but with 32-bit guests this
 286                  * needs two traps (one for each of the two 32-bit
 287                  * words in the PTE) so do one hypercall directly
 288                  * instead.
 289                  */
 290                 struct mmu_update u;
 291
 292                 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 293                 u.val = pte_val_ma(pteval);
 294                 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 295         }
 296 }
 297
 298 static void xen_set_pte(pte_t *ptep, pte_t pteval)
 299 {
 300         trace_xen_mmu_set_pte(ptep, pteval);
 301         __xen_set_pte(ptep, pteval);
 302 }
 303
 304 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 305                     pte_t *ptep, pte_t pteval)
 306 {
 307         trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
 308         __xen_set_pte(ptep, pteval);
 309 }
 310
/*
 * First half of a start/commit protection change: trace the current
 * entry and return it unmodified.  @mm and @addr are only passed to
 * the tracepoint.
 */
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is.  We preserve the bits on commit */
	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
	return *ptep;
}
 318
 319 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 320                                  pte_t *ptep, pte_t pte)
 321 {
 322         struct mmu_update u;
 323
 324         trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
 325         xen_mc_batch();
 326
 327         u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 328         u.val = pte_val_ma(pte);
 329         xen_extend_mmu_update(&u);
 330
 331         xen_mc_issue(PARAVIRT_LAZY_MMU);
 332 }
 333
 334 /* Assume pteval_t is equivalent to all the other *val_t types. */
 335 static pteval_t pte_mfn_to_pfn(pteval_t val)
 336 {
 337         if (val & _PAGE_PRESENT) {
 338                 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 339                 unsigned long pfn = mfn_to_pfn(mfn);
 340
 341                 pteval_t flags = val & PTE_FLAGS_MASK;
 342                 if (unlikely(pfn == ~0))
 343                         val = flags & ~_PAGE_PRESENT;
 344                 else
 345                         val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 346         }
 347
 348         return val;
 349 }
 350
 351 static pteval_t pte_pfn_to_mfn(pteval_t val)
 352 {
 353         if (val & _PAGE_PRESENT) {
 354                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 355                 pteval_t flags = val & PTE_FLAGS_MASK;
 356                 unsigned long mfn;
 357
 358                 if (!xen_feature(XENFEAT_auto_translated_physmap))
 359                         mfn = __pfn_to_mfn(pfn);
 360                 else
 361                         mfn = pfn;
 362                 /*
 363                  * If there's no mfn for the pfn, then just create an
 364                  * empty non-present pte.  Unfortunately this loses
 365                  * information about the original pfn, so
 366                  * pte_mfn_to_pfn is asymmetric.
 367                  */
 368                 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 369                         mfn = 0;
 370                         flags = 0;
 371                 } else
 372                         mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
 373                 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 374         }
 375
 376         return val;
 377 }
 378
 379 __visible pteval_t xen_pte_val(pte_t pte)
 380 {
 381         pteval_t pteval = pte.pte;
 382
 383         return pte_mfn_to_pfn(pteval);
 384 }
 385 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 386
 387 __visible pgdval_t xen_pgd_val(pgd_t pgd)
 388 {
 389         return pte_mfn_to_pfn(pgd.pgd);
 390 }
 391 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 392
 393 __visible pte_t xen_make_pte(pteval_t pte)
 394 {
 395         pte = pte_pfn_to_mfn(pte);
 396
 397         return native_make_pte(pte);
 398 }
 399 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 400
 401 __visible pgd_t xen_make_pgd(pgdval_t pgd)
 402 {
 403         pgd = pte_pfn_to_mfn(pgd);
 404         return native_make_pgd(pgd);
 405 }
 406 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 407
 408 __visible pmdval_t xen_pmd_val(pmd_t pmd)
 409 {
 410         return pte_mfn_to_pfn(pmd.pmd);
 411 }
 412 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 413
 414 static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 415 {
 416         struct mmu_update u;
 417
 418         preempt_disable();
 419
 420         xen_mc_batch();
 421
 422         /* ptr may be ioremapped for 64-bit pagetable setup */
 423         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 424         u.val = pud_val_ma(val);
 425         xen_extend_mmu_update(&u);
 426
 427         xen_mc_issue(PARAVIRT_LAZY_MMU);
 428
 429         preempt_enable();
 430 }
 431
 432 static void xen_set_pud(pud_t *ptr, pud_t val)
 433 {
 434         trace_xen_mmu_set_pud(ptr, val);
 435
 436         /* If page is not pinned, we can just update the entry
 437            directly */
 438         if (!xen_page_pinned(ptr)) {
 439                 *ptr = val;
 440                 return;
 441         }
 442
 443         xen_set_pud_hyper(ptr, val);
 444 }
 445
 446 #ifdef CONFIG_X86_PAE
 447 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 448 {
 449         trace_xen_mmu_set_pte_atomic(ptep, pte);
 450         set_64bit((u64 *)ptep, native_pte_val(pte));
 451 }
 452
 453 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 454 {
 455         trace_xen_mmu_pte_clear(mm, addr, ptep);
 456         if (!xen_batched_set_pte(ptep, native_make_pte(0)))
 457                 native_pte_clear(mm, addr, ptep);
 458 }
 459
 460 static void xen_pmd_clear(pmd_t *pmdp)
 461 {
 462         trace_xen_mmu_pmd_clear(pmdp);
 463         set_pmd(pmdp, __pmd(0));
 464 }
 465 #endif  /* CONFIG_X86_PAE */
 466
 467 __visible pmd_t xen_make_pmd(pmdval_t pmd)
 468 {
 469         pmd = pte_pfn_to_mfn(pmd);
 470         return native_make_pmd(pmd);
 471 }
 472 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 473
 474 #if CONFIG_PGTABLE_LEVELS == 4
 475 __visible pudval_t xen_pud_val(pud_t pud)
 476 {
 477         return pte_mfn_to_pfn(pud.pud);
 478 }
 479 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 480
 481 __visible pud_t xen_make_pud(pudval_t pud)
 482 {
 483         pud = pte_pfn_to_mfn(pud);
 484
 485         return native_make_pud(pud);
 486 }
 487 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 488
 489 static pgd_t *xen_get_user_pgd(pgd_t *pgd)
 490 {
 491         pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 492         unsigned offset = pgd - pgd_page;
 493         pgd_t *user_ptr = NULL;
 494
 495         if (offset < pgd_index(USER_LIMIT)) {
 496                 struct page *page = virt_to_page(pgd_page);
 497                 user_ptr = (pgd_t *)page->private;
 498                 if (user_ptr)
 499                         user_ptr += offset;
 500         }
 501
 502         return user_ptr;
 503 }
 504
 505 static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
 506 {
 507         struct mmu_update u;
 508
 509         u.ptr = virt_to_machine(ptr).maddr;
 510         u.val = p4d_val_ma(val);
 511         xen_extend_mmu_update(&u);
 512 }
 513
 514 /*
 515  * Raw hypercall-based set_p4d, intended for in early boot before
 516  * there's a page structure.  This implies:
 517  *  1. The only existing pagetable is the kernel's
 518  *  2. It is always pinned
 519  *  3. It has no user pagetable attached to it
 520  */
 521 static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
 522 {
 523         preempt_disable();
 524
 525         xen_mc_batch();
 526
 527         __xen_set_p4d_hyper(ptr, val);
 528
 529         xen_mc_issue(PARAVIRT_LAZY_MMU);
 530
 531         preempt_enable();
 532 }
 533
 534 static void xen_set_p4d(p4d_t *ptr, p4d_t val)
 535 {
 536         pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
 537         pgd_t pgd_val;
 538
 539         trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
 540
 541         /* If page is not pinned, we can just update the entry
 542            directly */
 543         if (!xen_page_pinned(ptr)) {
 544                 *ptr = val;
 545                 if (user_ptr) {
 546                         WARN_ON(xen_page_pinned(user_ptr));
 547                         pgd_val.pgd = p4d_val_ma(val);
 548                         *user_ptr = pgd_val;
 549                 }
 550                 return;
 551         }
 552
 553         /* If it's pinned, then we can at least batch the kernel and
 554            user updates together. */
 555         xen_mc_batch();
 556
 557         __xen_set_p4d_hyper(ptr, val);
 558         if (user_ptr)
 559                 __xen_set_p4d_hyper((p4d_t *)user_ptr, val);
 560
 561         xen_mc_issue(PARAVIRT_LAZY_MMU);
 562 }
 563 #endif  /* CONFIG_PGTABLE_LEVELS == 4 */
 564
 565 static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
 566                 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
 567                 bool last, unsigned long limit)
 568 {
 569         int i, nr, flush = 0;
 570
 571         nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
 572         for (i = 0; i < nr; i++) {
 573                 if (!pmd_none(pmd[i]))
 574                         flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
 575         }
 576         return flush;
 577 }
 578
 579 static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
 580                 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
 581                 bool last, unsigned long limit)
 582 {
 583         int i, nr, flush = 0;
 584
 585         nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
 586         for (i = 0; i < nr; i++) {
 587                 pmd_t *pmd;
 588
 589                 if (pud_none(pud[i]))
 590                         continue;
 591
 592                 pmd = pmd_offset(&pud[i], 0);
 593                 if (PTRS_PER_PMD > 1)
 594                         flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 595                 flush |= xen_pmd_walk(mm, pmd, func,
 596                                 last && i == nr - 1, limit);
 597         }
 598         return flush;
 599 }
 600
 601 static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
 602                 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
 603                 bool last, unsigned long limit)
 604 {
 605         int i, nr, flush = 0;
 606
 607         nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
 608         for (i = 0; i < nr; i++) {
 609                 pud_t *pud;
 610
 611                 if (p4d_none(p4d[i]))
 612                         continue;
 613
 614                 pud = pud_offset(&p4d[i], 0);
 615                 if (PTRS_PER_PUD > 1)
 616                         flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 617                 flush |= xen_pud_walk(mm, pud, func,
 618                                 last && i == nr - 1, limit);
 619         }
 620         return flush;
 621 }
 622
 623 /*
 624  * (Yet another) pagetable walker.  This one is intended for pinning a
 625  * pagetable.  This means that it walks a pagetable and calls the
 626  * callback function on each page it finds making up the page table,
 627  * at every level.  It walks the entire pagetable, but it only bothers
 628  * pinning pte pages which are below limit.  In the normal case this
 629  * will be STACK_TOP_MAX, but at boot we need to pin up to
 630  * FIXADDR_TOP.
 631  *
 632  * For 32-bit the important bit is that we don't pin beyond there,
 633  * because then we start getting into Xen's ptes.
 634  *
 635  * For 64-bit, we must skip the Xen hole in the middle of the address
 636  * space, just after the big x86-64 virtual hole.
 637  */
 638 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 639                           int (*func)(struct mm_struct *mm, struct page *,
 640                                       enum pt_level),
 641                           unsigned long limit)
 642 {
 643         int i, nr, flush = 0;
 644         unsigned hole_low, hole_high;
 645
 646         /* The limit is the last byte to be touched */
 647         limit--;
 648         BUG_ON(limit >= FIXADDR_TOP);
 649
 650         if (xen_feature(XENFEAT_auto_translated_physmap))
 651                 return 0;
 652
 653         /*
 654          * 64-bit has a great big hole in the middle of the address
 655          * space, which contains the Xen mappings.  On 32-bit these
 656          * will end up making a zero-sized hole and so is a no-op.
 657          */
 658         hole_low = pgd_index(USER_LIMIT);
 659         hole_high = pgd_index(PAGE_OFFSET);
 660
 661         nr = pgd_index(limit) + 1;
 662         for (i = 0; i < nr; i++) {
 663                 p4d_t *p4d;
 664
 665                 if (i >= hole_low && i < hole_high)
 666                         continue;
 667
 668                 if (pgd_none(pgd[i]))
 669                         continue;
 670
 671                 p4d = p4d_offset(&pgd[i], 0);
 672                 if (PTRS_PER_P4D > 1)
 673                         flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
 674                 flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
 675         }
 676
 677         /* Do the top level last, so that the callbacks can use it as
 678            a cue to do final things like tlb flushes. */
 679         flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 680
 681         return flush;
 682 }
 683
 684 static int xen_pgd_walk(struct mm_struct *mm,
 685                         int (*func)(struct mm_struct *mm, struct page *,
 686                                     enum pt_level),
 687                         unsigned long limit)
 688 {
 689         return __xen_pgd_walk(mm, mm->pgd, func, limit);
 690 }
 691
 692 /* If we're using split pte locks, then take the page's lock and
 693    return a pointer to it.  Otherwise return NULL. */
 694 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 695 {
 696         spinlock_t *ptl = NULL;
 697
 698 #if USE_SPLIT_PTE_PTLOCKS
 699         ptl = ptlock_ptr(page);
 700         spin_lock_nest_lock(ptl, &mm->page_table_lock);
 701 #endif
 702
 703         return ptl;
 704 }
 705
 706 static void xen_pte_unlock(void *v)
 707 {
 708         spinlock_t *ptl = v;
 709         spin_unlock(ptl);
 710 }
 711
 712 static void xen_do_pin(unsigned level, unsigned long pfn)
 713 {
 714         struct mmuext_op op;
 715
 716         op.cmd = level;
 717         op.arg1.mfn = pfn_to_mfn(pfn);
 718
 719         xen_extend_mmuext_op(&op);
 720 }
 721
/*
 * Walker callback used when pinning a pagetable.  Marks @page as
 * pinned and, for a lowmem page that was not already pinned, queues a
 * multicall entry remapping it read-only (PAGE_KERNEL_RO).  For a
 * PT_PTE page whose split pte lock was taken, the L1 pin is queued
 * too and the unlock is deferred to a multicall callback.
 *
 * Returns 1 for a newly-marked highmem page, otherwise 0.
 */
static int xen_pin_page(struct mm_struct *mm, struct page *page,
			enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		/*
		 * We need to hold the pagetable lock between the time
		 * we make the pagetable RO and when we actually pin
		 * it.  If we don't, then other users may come in and
		 * attempt to update the pagetable by writing it,
		 * which will fail because the memory is RO but not
		 * pinned, so Xen won't do the trap'n'emulate.
		 *
		 * If we're using split pte locks, we can't hold the
		 * entire pagetable's worth of locks during the
		 * traverse, because we may wrap the preempt count (8
		 * bits).  The solution is to mark RO and pin each PTE
		 * page while holding the lock.  This means the number
		 * of locks we end up holding is never more than a
		 * batch size (~32 entries, at present).
		 *
		 * If we're not using split pte locks, we needn't pin
		 * the PTE pages independently, because we're
		 * protected by the overall pagetable lock.
		 */
		ptl = NULL;
		if (level == PT_PTE)
			ptl = xen_pte_lock(page, mm);

		/* Remap the page RO; request a TLB flush only for the pgd page */
		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return flush;
}
 781
/*
 * This is called just after a mm has been created, but it has not
 * been used yet.  We need to make sure that its pagetable is all
 * read-only, and can be pinned.
 *
 * Everything is done inside a multicall batch: first walk the tree
 * below USER_LIMIT with xen_pin_page(), then queue the pin of the
 * top-level table itself (L4 on 64-bit, plus the user pgd when one
 * exists; L3 on 32-bit).
 */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	trace_xen_mmu_pgd_pin(mm, pgd);

	xen_mc_batch();

	/* A non-zero walk result means an unpinned highmem page was seen */
	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
		/* re-enable interrupts for flushing */
		xen_mc_issue(0);

		kmap_flush_unused();

		xen_mc_batch();
	}

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

		if (user_pgd) {
			/* The user pgd page is not reached by the walk above */
			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
			xen_do_pin(MMUEXT_PIN_L4_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
		}
	}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is pinnable */
	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		     PT_PMD);
#endif
	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
	xen_mc_issue(0);
}
 822
 823 static void xen_pgd_pin(struct mm_struct *mm)
 824 {
 825         __xen_pgd_pin(mm, mm->pgd);
 826 }
 827
 828 /*
 829  * On save, we need to pin all pagetables to make sure they get their
 830  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 831  * them (unpinned pgds are not currently in use, probably because the
 832  * process is under construction or destruction).
 833  *
 834  * Expected to be called in stop_machine() ("equivalent to taking
 835  * every spinlock in the system"), so the locking doesn't really
 836  * matter all that much.
 837  */
 838 void xen_mm_pin_all(void)
 839 {
 840         struct page *page;
 841
 842         spin_lock(&pgd_lock);
 843
 844         list_for_each_entry(page, &pgd_list, lru) {
 845                 if (!PagePinned(page)) {
 846                         __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
 847                         SetPageSavePinned(page);
 848                 }
 849         }
 850
 851         spin_unlock(&pgd_lock);
 852 }
 853
 854 /*
 855  * The init_mm pagetable is really pinned as soon as its created, but
 856  * that's before we have page structures to store the bits.  So do all
 857  * the book-keeping now.
 858  */
 859 static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
 860                                   enum pt_level level)
 861 {
 862         SetPagePinned(page);
 863         return 0;
 864 }
 865
 866 static void __init xen_mark_init_mm_pinned(void)
 867 {
 868         xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 869 }
 870
/*
 * Walker callback used when unpinning a pagetable.  Clears the Pinned
 * flag on @page and, if it was set and the page is lowmem, queues a
 * multicall entry remapping the page with PAGE_KERNEL.  For a PT_PTE
 * page whose split pte lock was taken, the unpin is queued before the
 * remap and the unlock is deferred to a multicall callback.
 *
 * Always returns 0.
 */
static int xen_unpin_page(struct mm_struct *mm, struct page *page,
			  enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		/*
		 * Do the converse to pin_page.  If we're using split
		 * pte locks, we must hold the lock while the pte page
		 * is unpinned but still RO to prevent concurrent
		 * updates from seeing it in this partially-pinned
		 * state.
		 */
		if (level == PT_PTE) {
			ptl = xen_pte_lock(page, mm);

			if (ptl)
				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		/* Remap the page with PAGE_KERNEL; request a TLB flush only for the pgd page */
		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}
 910
/*
 * Release a pagetable's pages back as normal RW.
 *
 * Everything between xen_mc_batch() and xen_mc_issue(0) is queued as one
 * multicall batch: the top-level @pgd is unpinned first, then the
 * config-specific extras, then xen_unpin_page() is run over the rest of
 * the tree up to USER_LIMIT.
 */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
        trace_xen_mmu_pgd_unpin(mm, pgd);

        xen_mc_batch();

        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_64
        {
                /* 64-bit: an associated user pgd, if any, is unpinned too. */
                pgd_t *user_pgd = xen_get_user_pgd(pgd);

                if (user_pgd) {
                        xen_do_pin(MMUEXT_UNPIN_TABLE,
                                   PFN_DOWN(__pa(user_pgd)));
                        xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
                }
        }
#endif

#ifdef CONFIG_X86_PAE
        /* Need to make sure unshared kernel PMD is unpinned */
        xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
                       PT_PMD);
#endif

        __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

        xen_mc_issue(0);
}
 942
/* Unpin the pagetable rooted at @mm's own pgd. */
static void xen_pgd_unpin(struct mm_struct *mm)
{
        __xen_pgd_unpin(mm, mm->pgd);
}
 947
 948 /*
 949  * On resume, undo any pinning done at save, so that the rest of the
 950  * kernel doesn't see any unexpected pinned pagetables.
 951  */
 952 void xen_mm_unpin_all(void)
 953 {
 954         struct page *page;
 955
 956         spin_lock(&pgd_lock);
 957
 958         list_for_each_entry(page, &pgd_list, lru) {
 959                 if (PageSavePinned(page)) {
 960                         BUG_ON(!PagePinned(page));
 961                         __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
 962                         ClearPageSavePinned(page);
 963                 }
 964         }
 965
 966         spin_unlock(&pgd_lock);
 967 }
 968
 969 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 970 {
 971         spin_lock(&next->page_table_lock);
 972         xen_pgd_pin(next);
 973         spin_unlock(&next->page_table_lock);
 974 }
 975
 976 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 977 {
 978         spin_lock(&mm->page_table_lock);
 979         xen_pgd_pin(mm);
 980         spin_unlock(&mm->page_table_lock);
 981 }
 982
 983
 984 #ifdef CONFIG_SMP
 985 /* Another cpu may still have their %cr3 pointing at the pagetable, so
 986    we need to repoint it somewhere else before we can unpin it. */
 987 static void drop_other_mm_ref(void *info)
 988 {
 989         struct mm_struct *mm = info;
 990         struct mm_struct *active_mm;
 991
 992         active_mm = this_cpu_read(cpu_tlbstate.active_mm);
 993
 994         if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
 995                 leave_mm(smp_processor_id());
 996
 997         /* If this cpu still has a stale cr3 reference, then make sure
 998            it has been flushed. */
 999         if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1000                 load_cr3(swapper_pg_dir);
1001 }
1002
1003 static void xen_drop_mm_ref(struct mm_struct *mm)
1004 {
1005         cpumask_var_t mask;
1006         unsigned cpu;
1007
1008         if (current->active_mm == mm) {
1009                 if (current->mm == mm)
1010                         load_cr3(swapper_pg_dir);
1011                 else
1012                         leave_mm(smp_processor_id());
1013         }
1014
1015         /* Get the "official" set of cpus referring to our pagetable. */
1016         if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1017                 for_each_online_cpu(cpu) {
1018                         if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1019                             && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1020                                 continue;
1021                         smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1022                 }
1023                 return;
1024         }
1025         cpumask_copy(mask, mm_cpumask(mm));
1026
1027         /* It's possible that a vcpu may have a stale reference to our
1028            cr3, because its in lazy mode, and it hasn't yet flushed
1029            its set of pending hypercalls yet.  In this case, we can
1030            look at its actual current cr3 value, and force it to flush
1031            if needed. */
1032         for_each_online_cpu(cpu) {
1033                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1034                         cpumask_set_cpu(cpu, mask);
1035         }
1036
1037         if (!cpumask_empty(mask))
1038                 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1039         free_cpumask_var(mask);
1040 }
1041 #else
/*
 * !CONFIG_SMP variant: there is no other cpu to notify, so just load
 * swapper_pg_dir if the current task has @mm active.
 */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
        if (current->active_mm == mm)
                load_cr3(swapper_pg_dir);
}
1047 #endif
1048
1049 /*
1050  * While a process runs, Xen pins its pagetables, which means that the
1051  * hypervisor forces it to be read-only, and it controls all updates
1052  * to it.  This means that all pagetable updates have to go via the
1053  * hypervisor, which is moderately expensive.
1054  *
1055  * Since we're pulling the pagetable down, we switch to use init_mm,
1056  * unpin old process pagetable and mark it all read-write, which
1057  * allows further operations on it to be simple memory accesses.
1058  *
1059  * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
1061  * switch all CPUs off this pagetable before we can unpin it.
1062  */
1063 static void xen_exit_mmap(struct mm_struct *mm)
1064 {
1065         get_cpu();              /* make sure we don't move around */
1066         xen_drop_mm_ref(mm);
1067         put_cpu();
1068
1069         spin_lock(&mm->page_table_lock);
1070
1071         /* pgd may not be pinned in the error exit path of execve */
1072         if (xen_page_pinned(mm->pgd))
1073                 xen_pgd_unpin(mm);
1074
1075         spin_unlock(&mm->page_table_lock);
1076 }
1077
1078 static void xen_post_allocator_init(void);
1079
1080 static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1081 {
1082         struct mmuext_op op;
1083
1084         op.cmd = cmd;
1085         op.arg1.mfn = pfn_to_mfn(pfn);
1086         if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1087                 BUG();
1088 }
1089
1090 #ifdef CONFIG_X86_64
1091 static void __init xen_cleanhighmap(unsigned long vaddr,
1092                                     unsigned long vaddr_end)
1093 {
1094         unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1095         pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1096
1097         /* NOTE: The loop is more greedy than the cleanup_highmap variant.
1098          * We include the PMD passed in on _both_ boundaries. */
1099         for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD));
1100                         pmd++, vaddr += PMD_SIZE) {
1101                 if (pmd_none(*pmd))
1102                         continue;
1103                 if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1104                         set_pmd(pmd, __pmd(0));
1105         }
1106         /* In case we did something silly, we should crash in this function
1107          * instead of somewhere later and be confusing. */
1108         xen_mc_flush();
1109 }
1110
1111 /*
1112  * Make a page range writeable and free it.
1113  */
1114 static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
1115 {
1116         void *vaddr = __va(paddr);
1117         void *vaddr_end = vaddr + size;
1118
1119         for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
1120                 make_lowmem_page_readwrite(vaddr);
1121
1122         memblock_free(paddr, size);
1123 }
1124
1125 static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
1126 {
1127         unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
1128
1129         if (unpin)
1130                 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
1131         ClearPagePinned(virt_to_page(__va(pa)));
1132         xen_free_ro_pages(pa, PAGE_SIZE);
1133 }
1134
1135 static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
1136 {
1137         unsigned long pa;
1138         pte_t *pte_tbl;
1139         int i;
1140
1141         if (pmd_large(*pmd)) {
1142                 pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1143                 xen_free_ro_pages(pa, PMD_SIZE);
1144                 return;
1145         }
1146
1147         pte_tbl = pte_offset_kernel(pmd, 0);
1148         for (i = 0; i < PTRS_PER_PTE; i++) {
1149                 if (pte_none(pte_tbl[i]))
1150                         continue;
1151                 pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
1152                 xen_free_ro_pages(pa, PAGE_SIZE);
1153         }
1154         set_pmd(pmd, __pmd(0));
1155         xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
1156 }
1157
1158 static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
1159 {
1160         unsigned long pa;
1161         pmd_t *pmd_tbl;
1162         int i;
1163
1164         if (pud_large(*pud)) {
1165                 pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
1166                 xen_free_ro_pages(pa, PUD_SIZE);
1167                 return;
1168         }
1169
1170         pmd_tbl = pmd_offset(pud, 0);
1171         for (i = 0; i < PTRS_PER_PMD; i++) {
1172                 if (pmd_none(pmd_tbl[i]))
1173                         continue;
1174                 xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
1175         }
1176         set_pud(pud, __pud(0));
1177         xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
1178 }
1179
1180 static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
1181 {
1182         unsigned long pa;
1183         pud_t *pud_tbl;
1184         int i;
1185
1186         if (p4d_large(*p4d)) {
1187                 pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
1188                 xen_free_ro_pages(pa, P4D_SIZE);
1189                 return;
1190         }
1191
1192         pud_tbl = pud_offset(p4d, 0);
1193         for (i = 0; i < PTRS_PER_PUD; i++) {
1194                 if (pud_none(pud_tbl[i]))
1195                         continue;
1196                 xen_cleanmfnmap_pud(pud_tbl + i, unpin);
1197         }
1198         set_p4d(p4d, __p4d(0));
1199         xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
1200 }
1201
1202 /*
1203  * Since it is well isolated we can (and since it is perhaps large we should)
1204  * also free the page tables mapping the initial P->M table.
1205  */
1206 static void __init xen_cleanmfnmap(unsigned long vaddr)
1207 {
1208         pgd_t *pgd;
1209         p4d_t *p4d;
1210         unsigned int i;
1211         bool unpin;
1212
1213         unpin = (vaddr == 2 * PGDIR_SIZE);
1214         vaddr &= PMD_MASK;
1215         pgd = pgd_offset_k(vaddr);
1216         p4d = p4d_offset(pgd, 0);
1217         for (i = 0; i < PTRS_PER_P4D; i++) {
1218                 if (p4d_none(p4d[i]))
1219                         continue;
1220                 xen_cleanmfnmap_p4d(p4d + i, unpin);
1221         }
1222         if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
1223                 set_pgd(pgd, __pgd(0));
1224                 xen_cleanmfnmap_free_pgtbl(p4d, unpin);
1225         }
1226 }
1227
1228 static void __init xen_pagetable_p2m_free(void)
1229 {
1230         unsigned long size;
1231         unsigned long addr;
1232
1233         size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1234
1235         /* No memory or already called. */
1236         if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
1237                 return;
1238
1239         /* using __ka address and sticking INVALID_P2M_ENTRY! */
1240         memset((void *)xen_start_info->mfn_list, 0xff, size);
1241
1242         addr = xen_start_info->mfn_list;
1243         /*
1244          * We could be in __ka space.
1245          * We roundup to the PMD, which means that if anybody at this stage is
1246          * using the __ka address of xen_start_info or
1247          * xen_start_info->shared_info they are in going to crash. Fortunatly
1248          * we have already revectored in xen_setup_kernel_pagetable and in
1249          * xen_setup_shared_info.
1250          */
1251         size = roundup(size, PMD_SIZE);
1252
1253         if (addr >= __START_KERNEL_map) {
1254                 xen_cleanhighmap(addr, addr + size);
1255                 size = PAGE_ALIGN(xen_start_info->nr_pages *
1256                                   sizeof(unsigned long));
1257                 memblock_free(__pa(addr), size);
1258         } else {
1259                 xen_cleanmfnmap(addr);
1260         }
1261 }
1262
1263 static void __init xen_pagetable_cleanhighmap(void)
1264 {
1265         unsigned long size;
1266         unsigned long addr;
1267
1268         /* At this stage, cleanup_highmap has already cleaned __ka space
1269          * from _brk_limit way up to the max_pfn_mapped (which is the end of
1270          * the ramdisk). We continue on, erasing PMD entries that point to page
1271          * tables - do note that they are accessible at this stage via __va.
1272          * For good measure we also round up to the PMD - which means that if
1273          * anybody is using __ka address to the initial boot-stack - and try
1274          * to use it - they are going to crash. The xen_start_info has been
1275          * taken care of already in xen_setup_kernel_pagetable. */
1276         addr = xen_start_info->pt_base;
1277         size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1278
1279         xen_cleanhighmap(addr, addr + size);
1280         xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1281 #ifdef DEBUG
1282         /* This is superfluous and is not necessary, but you know what
1283          * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
1284          * anything at this stage. */
1285         xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1286 #endif
1287 }
1288 #endif
1289
/*
 * Build the vmalloc-based p2m tree, release the boot-time p2m list and
 * its high mappings (64-bit only), and point xen_start_info->mfn_list at
 * the new array.  Does nothing for auto-translated guests.
 */
static void __init xen_pagetable_p2m_setup(void)
{
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return;

        xen_vmalloc_p2m_tree();

#ifdef CONFIG_X86_64
        xen_pagetable_p2m_free();

        xen_pagetable_cleanhighmap();
#endif
        /* And revector! Bye bye old array */
        xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
}
1305
/*
 * Pagetable initialisation for a Xen PV guest: generic paging_init()
 * followed by the Xen-specific p2m, memory-remap and shared-info setup
 * steps, in this order.
 */
static void __init xen_pagetable_init(void)
{
        paging_init();
        xen_post_allocator_init();

        xen_pagetable_p2m_setup();

        /* Allocate and initialize top and mid mfn levels for p2m structure */
        xen_build_mfn_list_list();

        /* Remap memory freed due to conflicts with E820 map */
        if (!xen_feature(XENFEAT_auto_translated_physmap))
                xen_remap_memory();

        xen_setup_shared_info();
}
/* Store @cr2 in the arch.cr2 field reached through this cpu's xen_vcpu pointer. */
static void xen_write_cr2(unsigned long cr2)
{
        this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
}
1326
/* Return the arch.cr2 field reached through this cpu's xen_vcpu pointer. */
static unsigned long xen_read_cr2(void)
{
        return this_cpu_read(xen_vcpu)->arch.cr2;
}
1331
/*
 * Like xen_read_cr2(), but reads arch.cr2 straight from the per-cpu
 * xen_vcpu_info instead of going through the xen_vcpu pointer.
 */
unsigned long xen_read_cr2_direct(void)
{
        return this_cpu_read(xen_vcpu_info.arch.cr2);
}
1336
1337 static void xen_flush_tlb(void)
1338 {
1339         struct mmuext_op *op;
1340         struct multicall_space mcs;
1341
1342         trace_xen_mmu_flush_tlb(0);
1343
1344         preempt_disable();
1345
1346         mcs = xen_mc_entry(sizeof(*op));
1347
1348         op = mcs.args;
1349         op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1350         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1351
1352         xen_mc_issue(PARAVIRT_LAZY_MMU);
1353
1354         preempt_enable();
1355 }
1356
1357 static void xen_flush_tlb_single(unsigned long addr)
1358 {
1359         struct mmuext_op *op;
1360         struct multicall_space mcs;
1361
1362         trace_xen_mmu_flush_tlb_single(addr);
1363
1364         preempt_disable();
1365
1366         mcs = xen_mc_entry(sizeof(*op));
1367         op = mcs.args;
1368         op->cmd = MMUEXT_INVLPG_LOCAL;
1369         op->arg1.linear_addr = addr & PAGE_MASK;
1370         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1371
1372         xen_mc_issue(PARAVIRT_LAZY_MMU);
1373
1374         preempt_enable();
1375 }
1376
1377 static void xen_flush_tlb_others(const struct cpumask *cpus,
1378                                  struct mm_struct *mm, unsigned long start,
1379                                  unsigned long end)
1380 {
1381         struct {
1382                 struct mmuext_op op;
1383 #ifdef CONFIG_SMP
1384                 DECLARE_BITMAP(mask, num_processors);
1385 #else
1386                 DECLARE_BITMAP(mask, NR_CPUS);
1387 #endif
1388         } *args;
1389         struct multicall_space mcs;
1390
1391         trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1392
1393         if (cpumask_empty(cpus))
1394                 return;         /* nothing to do */
1395
1396         mcs = xen_mc_entry(sizeof(*args));
1397         args = mcs.args;
1398         args->op.arg2.vcpumask = to_cpumask(args->mask);
1399
1400         /* Remove us, and any offline CPUS. */
1401         cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1402         cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1403
1404         args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1405         if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1406                 args->op.cmd = MMUEXT_INVLPG_MULTI;
1407                 args->op.arg1.linear_addr = start;
1408         }
1409
1410         MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1411
1412         xen_mc_issue(PARAVIRT_LAZY_MMU);
1413 }
1414
/* Return this cpu's xen_cr3 value, as last stored by the cr3 write paths. */
static unsigned long xen_read_cr3(void)
{
        return this_cpu_read(xen_cr3);
}
1419
/*
 * Multicall completion callback registered by __xen_write_cr3(): records
 * the cr3 value carried in @v as this cpu's xen_current_cr3.
 */
static void set_current_cr3(void *v)
{
        this_cpu_write(xen_current_cr3, (unsigned long)v);
}
1424
1425 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1426 {
1427         struct mmuext_op op;
1428         unsigned long mfn;
1429
1430         trace_xen_mmu_write_cr3(kernel, cr3);
1431
1432         if (cr3)
1433                 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1434         else
1435                 mfn = 0;
1436
1437         WARN_ON(mfn == 0 && kernel);
1438
1439         op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1440         op.arg1.mfn = mfn;
1441
1442         xen_extend_mmuext_op(&op);
1443
1444         if (kernel) {
1445                 this_cpu_write(xen_cr3, cr3);
1446
1447                 /* Update xen_current_cr3 once the batch has actually
1448                    been submitted. */
1449                 xen_mc_callback(set_current_cr3, (void *)cr3);
1450         }
1451 }
1452 static void xen_write_cr3(unsigned long cr3)
1453 {
1454         BUG_ON(preemptible());
1455
1456         xen_mc_batch();  /* disables interrupts */
1457
1458         /* Update while interrupts are disabled, so its atomic with
1459            respect to ipis */
1460         this_cpu_write(xen_cr3, cr3);
1461
1462         __xen_write_cr3(true, cr3);
1463
1464 #ifdef CONFIG_X86_64
1465         {
1466                 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1467                 if (user_pgd)
1468                         __xen_write_cr3(false, __pa(user_pgd));
1469                 else
1470                         __xen_write_cr3(false, 0);
1471         }
1472 #endif
1473
1474         xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1475 }
1476
1477 #ifdef CONFIG_X86_64
1478 /*
1479  * At the start of the day - when Xen launches a guest, it has already
 * built pagetables for the guest. We diligently look over them
 * in xen_setup_kernel_pagetable and graft them as appropriate into
 * init_level4_pgt and its friends. Then, when we are happy, we load
 * the new init_level4_pgt - and continue on.
1484  *
1485  * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1486  * up the rest of the pagetables. When it has completed it loads the cr3.
1487  * N.B. that baremetal would start at 'start_kernel' (and the early
1488  * #PF handler would create bootstrap pagetables) - so we are running
1489  * with the same assumptions as what to do when write_cr3 is executed
1490  * at this point.
1491  *
1492  * Since there are no user-page tables at all, we have two variants
1493  * of xen_write_cr3 - the early bootup (this one), and the late one
1494  * (xen_write_cr3). The reason we have to do that is that in 64-bit
1495  * the Linux kernel and user-space are both in ring 3 while the
1496  * hypervisor is in ring 0.
1497  */
static void __init xen_write_cr3_init(unsigned long cr3)
{
        BUG_ON(preemptible());

        xen_mc_batch();  /* disables interrupts */

        /* Update while interrupts are disabled, so it's atomic with
           respect to ipis */
        this_cpu_write(xen_cr3, cr3);

        /* Kernel base pointer only: unlike xen_write_cr3(), no user
           base pointer is switched here. */
        __xen_write_cr3(true, cr3);

        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
}
1512 #endif
1513
1514 static int xen_pgd_alloc(struct mm_struct *mm)
1515 {
1516         pgd_t *pgd = mm->pgd;
1517         int ret = 0;
1518
1519         BUG_ON(PagePinned(virt_to_page(pgd)));
1520
1521 #ifdef CONFIG_X86_64
1522         {
1523                 struct page *page = virt_to_page(pgd);
1524                 pgd_t *user_pgd;
1525
1526                 BUG_ON(page->private != 0);
1527
1528                 ret = -ENOMEM;
1529
1530                 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1531                 page->private = (unsigned long)user_pgd;
1532
1533                 if (user_pgd != NULL) {
1534 #ifdef CONFIG_X86_VSYSCALL_EMULATION
1535                         user_pgd[pgd_index(VSYSCALL_ADDR)] =
1536                                 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1537 #endif
1538                         ret = 0;
1539                 }
1540
1541                 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1542         }
1543 #endif
1544         return ret;
1545 }
1546
/*
 * Xen-specific part of pgd teardown: on 64-bit, free the user pgd page
 * associated with @pgd, if there is one.  @mm is not used.
 */
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
        pgd_t *user_pgd = xen_get_user_pgd(pgd);

        if (user_pgd)
                free_page((unsigned long)user_pgd);
#endif
}
1556
1557 /*
1558  * Init-time set_pte while constructing initial pagetables, which
1559  * doesn't allow RO page table pages to be remapped RW.
1560  *
1561  * If there is no MFN for this PFN then this page is initially
1562  * ballooned out so clear the PTE (as in decrease_reservation() in
1563  * drivers/xen/balloon.c).
1564  *
1565  * Many of these PTE updates are done on unpinned and writable pages
1566  * and doing a hypercall for these is unnecessary and expensive.  At
1567  * this point it is not possible to tell if a page is pinned or not,
1568  * so always write the PTE directly and rely on Xen trapping and
1569  * emulating any updates as necessary.
1570  */
1571 __visible pte_t xen_make_pte_init(pteval_t pte)
1572 {
1573 #ifdef CONFIG_X86_64
1574         unsigned long pfn;
1575
1576         /*
1577          * Pages belonging to the initial p2m list mapped outside the default
1578          * address range must be mapped read-only. This region contains the
1579          * page tables for mapping the p2m list, too, and page tables MUST be
1580          * mapped read-only.
1581          */
1582         pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
1583         if (xen_start_info->mfn_list < __START_KERNEL_map &&
1584             pfn >= xen_start_info->first_p2m_pfn &&
1585             pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
1586                 pte &= ~_PAGE_RW;
1587 #endif
1588         pte = pte_pfn_to_mfn(pte);
1589         return native_make_pte(pte);
1590 }
1591 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
1592
/*
 * Early set_pte: write @pte to @ptep directly via native_set_pte().
 *
 * On 32-bit, when the slot already holds a present entry and the new pte
 * has a valid mfn, the new value keeps all of its bits except _PAGE_RW,
 * which survives only if the existing entry also has it set.
 */
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
#ifdef CONFIG_X86_32
        /* If there's an existing pte, then don't allow _PAGE_RW to be set */
        if (pte_mfn(pte) != INVALID_P2M_ENTRY
            && pte_val_ma(*ptep) & _PAGE_PRESENT)
                /* mask = every bit set, with RW set only if the old pte has RW */
                pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
                               pte_val_ma(pte));
#endif
        native_set_pte(ptep, pte);
}
1604
1605 /* Early in boot, while setting up the initial pagetable, assume
1606    everything is pinned. */
1607 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1608 {
1609 #ifdef CONFIG_FLATMEM
1610         BUG_ON(mem_map);        /* should only be used early */
1611 #endif
1612         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1613         pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1614 }
1615
1616 /* Used for pmd and pud */
1617 static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1618 {
1619 #ifdef CONFIG_FLATMEM
1620         BUG_ON(mem_map);        /* should only be used early */
1621 #endif
1622         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1623 }
1624
1625 /* Early release_pte assumes that all pts are pinned, since there's
1626    only init_mm and anything attached to that is pinned. */
1627 static void __init xen_release_pte_init(unsigned long pfn)
1628 {
1629         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1630         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1631 }
1632
1633 static void __init xen_release_pmd_init(unsigned long pfn)
1634 {
1635         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1636 }
1637
1638 static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1639 {
1640         struct multicall_space mcs;
1641         struct mmuext_op *op;
1642
1643         mcs = __xen_mc_entry(sizeof(*op));
1644         op = mcs.args;
1645         op->cmd = cmd;
1646         op->arg1.mfn = pfn_to_mfn(pfn);
1647
1648         MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1649 }
1650
1651 static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1652 {
1653         struct multicall_space mcs;
1654         unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1655
1656         mcs = __xen_mc_entry(0);
1657         MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1658                                 pfn_pte(pfn, prot), 0);
1659 }
1660
1661 /* This needs to make sure the new pte page is pinned iff its being
1662    attached to a pinned pagetable. */
1663 static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1664                                     unsigned level)
1665 {
1666         bool pinned = PagePinned(virt_to_page(mm->pgd));
1667
1668         trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1669
1670         if (pinned) {
1671                 struct page *page = pfn_to_page(pfn);
1672
1673                 SetPagePinned(page);
1674
1675                 if (!PageHighMem(page)) {
1676                         xen_mc_batch();
1677
1678                         __set_pfn_prot(pfn, PAGE_KERNEL_RO);
1679
1680                         if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1681                                 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1682
1683                         xen_mc_issue(PARAVIRT_LAZY_MMU);
1684                 } else {
1685                         /* make sure there are no stray mappings of
1686                            this page */
1687                         kmap_flush_unused();
1688                 }
1689         }
1690 }
1691
/* Pte-level wrapper around xen_alloc_ptpage(). */
static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
        xen_alloc_ptpage(mm, pfn, PT_PTE);
}
1696
/* Pmd-level wrapper around xen_alloc_ptpage(). */
static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
        xen_alloc_ptpage(mm, pfn, PT_PMD);
}
1701
1702 /* This should never happen until we're OK to use struct page */
1703 static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1704 {
1705         struct page *page = pfn_to_page(pfn);
1706         bool pinned = PagePinned(page);
1707
1708         trace_xen_mmu_release_ptpage(pfn, level, pinned);
1709
1710         if (pinned) {
1711                 if (!PageHighMem(page)) {
1712                         xen_mc_batch();
1713
1714                         if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1715                                 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1716
1717                         __set_pfn_prot(pfn, PAGE_KERNEL);
1718
1719                         xen_mc_issue(PARAVIRT_LAZY_MMU);
1720                 }
1721                 ClearPagePinned(page);
1722         }
1723 }
1724
/* Pte-level wrapper around xen_release_ptpage(). */
static void xen_release_pte(unsigned long pfn)
{
        xen_release_ptpage(pfn, PT_PTE);
}
1729
/* Pmd-level wrapper around xen_release_ptpage(). */
static void xen_release_pmd(unsigned long pfn)
{
        xen_release_ptpage(pfn, PT_PMD);
}
1734
1735 #if CONFIG_PGTABLE_LEVELS >= 4
/* Pud-level wrapper around xen_alloc_ptpage(). */
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
        xen_alloc_ptpage(mm, pfn, PT_PUD);
}
1740
/* Pud-level wrapper around xen_release_ptpage(). */
static void xen_release_pud(unsigned long pfn)
{
        xen_release_ptpage(pfn, PT_PUD);
}
1745 #endif
1746
1747 void __init xen_reserve_top(void)
1748 {
1749 #ifdef CONFIG_X86_32
1750         unsigned long top = HYPERVISOR_VIRT_START;
1751         struct xen_platform_parameters pp;
1752
1753         if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1754                 top = pp.virt_start;
1755
1756         reserve_top_address(-top);
1757 #endif  /* CONFIG_X86_32 */
1758 }
1759
1760 /*
1761  * Like __va(), but returns address in the kernel mapping (which is
1762  * all we have until the physical memory mapping has been set up.
1763  */
1764 static void * __init __ka(phys_addr_t paddr)
1765 {
1766 #ifdef CONFIG_X86_64
1767         return (void *)(paddr + __START_KERNEL_map);
1768 #else
1769         return __va(paddr);
1770 #endif
1771 }
1772
1773 /* Convert a machine address to physical address */
1774 static unsigned long __init m2p(phys_addr_t maddr)
1775 {
1776         phys_addr_t paddr;
1777
1778         maddr &= PTE_PFN_MASK;
1779         paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1780
1781         return paddr;
1782 }
1783
/*
 * Convert a machine address to kernel virtual: m2p() followed by __ka(),
 * so the result lies in the kernel mapping.
 */
static void * __init m2v(phys_addr_t maddr)
{
        return __ka(m2p(maddr));
}
1789
1790 /* Set the page permissions on an identity-mapped pages */
1791 static void __init set_page_prot_flags(void *addr, pgprot_t prot,
1792                                        unsigned long flags)
1793 {
1794         unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1795         pte_t pte = pfn_pte(pfn, prot);
1796
1797         if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1798                 BUG();
1799 }
1800 static void __init set_page_prot(void *addr, pgprot_t prot)
1801 {
1802         return set_page_prot_flags(addr, prot, UVMF_NONE);
1803 }
1804 #ifdef CONFIG_X86_32
/*
 * Fill in pte mappings (pfn N -> PAGE_KERNEL_EXEC pte for pfn N) under
 * @pmd for pfns starting at 0, stopping once @max_pfn is reached or the
 * pmd page is exhausted.  Existing pte pages and existing ptes are
 * reused as they are; missing pte pages come from a brk allocation of
 * LEVEL1_IDENT_ENTRIES ptes.  max_pfn_mapped is raised as pfns are
 * visited.  Finally the newly used pte pages and @pmd itself are made
 * read-only.
 */
static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
        unsigned pmdidx, pteidx;
        unsigned ident_pte;
        unsigned long pfn;

        level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
                                      PAGE_SIZE);

        /* ident_pte counts the ptes of level1_ident_pgt handed out so far */
        ident_pte = 0;
        pfn = 0;
        for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
                pte_t *pte_page;

                /* Reuse or allocate a page of ptes */
                if (pmd_present(pmd[pmdidx]))
                        pte_page = m2v(pmd[pmdidx].pmd);
                else {
                        /* Check for free pte pages */
                        if (ident_pte == LEVEL1_IDENT_ENTRIES)
                                break;

                        pte_page = &level1_ident_pgt[ident_pte];
                        ident_pte += PTRS_PER_PTE;

                        pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
                }

                /* Install mappings */
                for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
                        pte_t pte;

                        if (pfn > max_pfn_mapped)
                                max_pfn_mapped = pfn;

                        /* leave entries that are already populated alone */
                        if (!pte_none(pte_page[pteidx]))
                                continue;

                        pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
                        pte_page[pteidx] = pte;
                }
        }

        /* Write-protect each pte page taken from level1_ident_pgt */
        for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
                set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);

        set_page_prot(pmd, PAGE_KERNEL_RO);
}
1853 #endif
1854 void __init xen_setup_machphys_mapping(void)
1855 {
1856         struct xen_machphys_mapping mapping;
1857
1858         if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1859                 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1860                 machine_to_phys_nr = mapping.max_mfn + 1;
1861         } else {
1862                 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1863         }
1864 #ifdef CONFIG_X86_32
1865         WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1866                 < machine_to_phys_mapping);
1867 #endif
1868 }
1869
1870 #ifdef CONFIG_X86_64
1871 static void __init convert_pfn_mfn(void *v)
1872 {
1873         pte_t *pte = v;
1874         int i;
1875
1876         /* All levels are converted the same way, so just treat them
1877            as ptes. */
1878         for (i = 0; i < PTRS_PER_PTE; i++)
1879                 pte[i] = xen_make_pte(pte[i].pte);
1880 }
1881 static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1882                                  unsigned long addr)
1883 {
1884         if (*pt_base == PFN_DOWN(__pa(addr))) {
1885                 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1886                 clear_page((void *)addr);
1887                 (*pt_base)++;
1888         }
1889         if (*pt_end == PFN_DOWN(__pa(addr))) {
1890                 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1891                 clear_page((void *)addr);
1892                 (*pt_end)--;
1893         }
1894 }
1895 /*
1896  * Set up the initial kernel pagetable.
1897  *
1898  * We can construct this by grafting the Xen provided pagetable into
1899  * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1900  * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
1901  * kernel has a physical mapping to start with - but that's enough to
1902  * get __va working.  We need to fill in the rest of the physical
1903  * mapping once some sort of allocator has been set up.
1904  */
1905 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1906 {
1907         pud_t *l3;
1908         pmd_t *l2;
1909         unsigned long addr[3];
1910         unsigned long pt_base, pt_end;
1911         unsigned i;
1912
1913         /* max_pfn_mapped is the last pfn mapped in the initial memory
1914          * mappings. Considering that on Xen after the kernel mappings we
1915          * have the mappings of some pages that don't exist in pfn space, we
1916          * set max_pfn_mapped to the last real pfn mapped. */
1917         if (xen_start_info->mfn_list < __START_KERNEL_map)
1918                 max_pfn_mapped = xen_start_info->first_p2m_pfn;
1919         else
1920                 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1921
1922         pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1923         pt_end = pt_base + xen_start_info->nr_pt_frames;
1924
1925         /* Zap identity mapping */
1926         init_level4_pgt[0] = __pgd(0);
1927
1928         if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1929                 /* Pre-constructed entries are in pfn, so convert to mfn */
1930                 /* L4[272] -> level3_ident_pgt
1931                  * L4[511] -> level3_kernel_pgt */
1932                 convert_pfn_mfn(init_level4_pgt);
1933
1934                 /* L3_i[0] -> level2_ident_pgt */
1935                 convert_pfn_mfn(level3_ident_pgt);
1936                 /* L3_k[510] -> level2_kernel_pgt
1937                  * L3_k[511] -> level2_fixmap_pgt */
1938                 convert_pfn_mfn(level3_kernel_pgt);
1939
1940                 /* L3_k[511][506] -> level1_fixmap_pgt */
1941                 convert_pfn_mfn(level2_fixmap_pgt);
1942         }
1943         /* We get [511][511] and have Xen's version of level2_kernel_pgt */
1944         l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1945         l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1946
1947         addr[0] = (unsigned long)pgd;
1948         addr[1] = (unsigned long)l3;
1949         addr[2] = (unsigned long)l2;
1950         /* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
1951          * Both L4[272][0] and L4[511][510] have entries that point to the same
1952          * L2 (PMD) tables. Meaning that if you modify it in __va space
1953          * it will be also modified in the __ka space! (But if you just
1954          * modify the PMD table to point to other PTE's or none, then you
1955          * are OK - which is what cleanup_highmap does) */
1956         copy_page(level2_ident_pgt, l2);
1957         /* Graft it onto L4[511][510] */
1958         copy_page(level2_kernel_pgt, l2);
1959
1960         /* Copy the initial P->M table mappings if necessary. */
1961         i = pgd_index(xen_start_info->mfn_list);
1962         if (i && i < pgd_index(__START_KERNEL_map))
1963                 init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1964
1965         if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1966                 /* Make pagetable pieces RO */
1967                 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1968                 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1969                 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1970                 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1971                 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1972                 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1973                 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1974                 set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
1975
1976                 /* Pin down new L4 */
1977                 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1978                                   PFN_DOWN(__pa_symbol(init_level4_pgt)));
1979
1980                 /* Unpin Xen-provided one */
1981                 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1982
1983                 /*
1984                  * At this stage there can be no user pgd, and no page
1985                  * structure to attach it to, so make sure we just set kernel
1986                  * pgd.
1987                  */
1988                 xen_mc_batch();
1989                 __xen_write_cr3(true, __pa(init_level4_pgt));
1990                 xen_mc_issue(PARAVIRT_LAZY_CPU);
1991         } else
1992                 native_write_cr3(__pa(init_level4_pgt));
1993
1994         /* We can't that easily rip out L3 and L2, as the Xen pagetables are
1995          * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ...  for
1996          * the initial domain. For guests using the toolstack, they are in:
1997          * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
1998          * rip out the [L4] (pgd), but for guests we shave off three pages.
1999          */
2000         for (i = 0; i < ARRAY_SIZE(addr); i++)
2001                 check_pt_base(&pt_base, &pt_end, addr[i]);
2002
2003         /* Our (by three pages) smaller Xen pagetable that we are using */
2004         xen_pt_base = PFN_PHYS(pt_base);
2005         xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
2006         memblock_reserve(xen_pt_base, xen_pt_size);
2007
2008         /* Revector the xen_start_info */
2009         xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
2010 }
2011
2012 /*
2013  * Read a value from a physical address.
2014  */
2015 static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
2016 {
2017         unsigned long *vaddr;
2018         unsigned long val;
2019
2020         vaddr = early_memremap_ro(addr, sizeof(val));
2021         val = *vaddr;
2022         early_memunmap(vaddr, sizeof(val));
2023         return val;
2024 }
2025
2026 /*
2027  * Translate a virtual address to a physical one without relying on mapped
2028  * page tables. Don't rely on big pages being aligned in (guest) physical
2029  * space!
2030  */
2031 static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2032 {
2033         phys_addr_t pa;
2034         pgd_t pgd;
2035         pud_t pud;
2036         pmd_t pmd;
2037         pte_t pte;
2038
2039         pa = read_cr3();
2040         pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
2041                                                        sizeof(pgd)));
2042         if (!pgd_present(pgd))
2043                 return 0;
2044
2045         pa = pgd_val(pgd) & PTE_PFN_MASK;
2046         pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
2047                                                        sizeof(pud)));
2048         if (!pud_present(pud))
2049                 return 0;
2050         pa = pud_val(pud) & PTE_PFN_MASK;
2051         if (pud_large(pud))
2052                 return pa + (vaddr & ~PUD_MASK);
2053
2054         pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
2055                                                        sizeof(pmd)));
2056         if (!pmd_present(pmd))
2057                 return 0;
2058         pa = pmd_val(pmd) & PTE_PFN_MASK;
2059         if (pmd_large(pmd))
2060                 return pa + (vaddr & ~PMD_MASK);
2061
2062         pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
2063                                                        sizeof(pte)));
2064         if (!pte_present(pte))
2065                 return 0;
2066         pa = pte_pfn(pte) << PAGE_SHIFT;
2067
2068         return pa | (vaddr & ~PAGE_MASK);
2069 }
2070
2071 /*
2072  * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
2073  * this area.
2074  */
2075 void __init xen_relocate_p2m(void)
2076 {
2077         phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
2078         unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
2079         int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
2080         pte_t *pt;
2081         pmd_t *pmd;
2082         pud_t *pud;
2083         p4d_t *p4d = NULL;
2084         pgd_t *pgd;
2085         unsigned long *new_p2m;
2086         int save_pud;
2087
2088         size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
2089         n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
2090         n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
2091         n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
2092         n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
2093         if (PTRS_PER_P4D > 1)
2094                 n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
2095         else
2096                 n_p4d = 0;
2097         n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
2098
2099         new_area = xen_find_free_area(PFN_PHYS(n_frames));
2100         if (!new_area) {
2101                 xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
2102                 BUG();
2103         }
2104
2105         /*
2106          * Setup the page tables for addressing the new p2m list.
2107          * We have asked the hypervisor to map the p2m list at the user address
2108          * PUD_SIZE. It may have done so, or it may have used a kernel space
2109          * address depending on the Xen version.
2110          * To avoid any possible virtual address collision, just use
2111          * 2 * PUD_SIZE for the new area.
2112          */
2113         p4d_phys = new_area;
2114         pud_phys = p4d_phys + PFN_PHYS(n_p4d);
2115         pmd_phys = pud_phys + PFN_PHYS(n_pud);
2116         pt_phys = pmd_phys + PFN_PHYS(n_pmd);
2117         p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
2118
2119         pgd = __va(read_cr3());
2120         new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
2121         idx_p4d = 0;
2122         save_pud = n_pud;
2123         do {
2124                 if (n_p4d > 0) {
2125                         p4d = early_memremap(p4d_phys, PAGE_SIZE);
2126                         clear_page(p4d);
2127                         n_pud = min(save_pud, PTRS_PER_P4D);
2128                 }
2129                 for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
2130                         pud = early_memremap(pud_phys, PAGE_SIZE);
2131                         clear_page(pud);
2132                         for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
2133                                  idx_pmd++) {
2134                                 pmd = early_memremap(pmd_phys, PAGE_SIZE);
2135                                 clear_page(pmd);
2136                                 for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
2137                                          idx_pt++) {
2138                                         pt = early_memremap(pt_phys, PAGE_SIZE);
2139                                         clear_page(pt);
2140                                         for (idx_pte = 0;
2141                                                  idx_pte < min(n_pte, PTRS_PER_PTE);
2142                                                  idx_pte++) {
2143                                                 set_pte(pt + idx_pte,
2144                                                                 pfn_pte(p2m_pfn, PAGE_KERNEL));
2145                                                 p2m_pfn++;
2146                                         }
2147                                         n_pte -= PTRS_PER_PTE;
2148                                         early_memunmap(pt, PAGE_SIZE);
2149                                         make_lowmem_page_readonly(__va(pt_phys));
2150                                         pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
2151                                                         PFN_DOWN(pt_phys));
2152                                         set_pmd(pmd + idx_pt,
2153                                                         __pmd(_PAGE_TABLE | pt_phys));
2154                                         pt_phys += PAGE_SIZE;
2155                                 }
2156                                 n_pt -= PTRS_PER_PMD;
2157                                 early_memunmap(pmd, PAGE_SIZE);
2158                                 make_lowmem_page_readonly(__va(pmd_phys));
2159                                 pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
2160                                                 PFN_DOWN(pmd_phys));
2161                                 set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
2162                                 pmd_phys += PAGE_SIZE;
2163                         }
2164                         n_pmd -= PTRS_PER_PUD;
2165                         early_memunmap(pud, PAGE_SIZE);
2166                         make_lowmem_page_readonly(__va(pud_phys));
2167                         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
2168                         if (n_p4d > 0)
2169                                 set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
2170                         else
2171                                 set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
2172                         pud_phys += PAGE_SIZE;
2173                 }
2174                 if (n_p4d > 0) {
2175                         save_pud -= PTRS_PER_P4D;
2176                         early_memunmap(p4d, PAGE_SIZE);
2177                         make_lowmem_page_readonly(__va(p4d_phys));
2178                         pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
2179                         set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
2180                         p4d_phys += PAGE_SIZE;
2181                 }
2182         } while (++idx_p4d < n_p4d);
2183
2184         /* Now copy the old p2m info to the new area. */
2185         memcpy(new_p2m, xen_p2m_addr, size);
2186         xen_p2m_addr = new_p2m;
2187
2188         /* Release the old p2m list and set new list info. */
2189         p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
2190         BUG_ON(!p2m_pfn);
2191         p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
2192
2193         if (xen_start_info->mfn_list < __START_KERNEL_map) {
2194                 pfn = xen_start_info->first_p2m_pfn;
2195                 pfn_end = xen_start_info->first_p2m_pfn +
2196                           xen_start_info->nr_p2m_frames;
2197                 set_pgd(pgd + 1, __pgd(0));
2198         } else {
2199                 pfn = p2m_pfn;
2200                 pfn_end = p2m_pfn_end;
2201         }
2202
2203         memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
2204         while (pfn < pfn_end) {
2205                 if (pfn == p2m_pfn) {
2206                         pfn = p2m_pfn_end;
2207                         continue;
2208                 }
2209                 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
2210                 pfn++;
2211         }
2212
2213         xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
2214         xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
2215         xen_start_info->nr_p2m_frames = n_frames;
2216 }
2217
2218 #else   /* !CONFIG_X86_64 */
2219 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
2220 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
2221
2222 static void __init xen_write_cr3_init(unsigned long cr3)
2223 {
2224         unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2225
2226         BUG_ON(read_cr3() != __pa(initial_page_table));
2227         BUG_ON(cr3 != __pa(swapper_pg_dir));
2228
2229         /*
2230          * We are switching to swapper_pg_dir for the first time (from
2231          * initial_page_table) and therefore need to mark that page
2232          * read-only and then pin it.
2233          *
2234          * Xen disallows sharing of kernel PMDs for PAE
2235          * guests. Therefore we must copy the kernel PMD from
2236          * initial_page_table into a new kernel PMD to be used in
2237          * swapper_pg_dir.
2238          */
2239         swapper_kernel_pmd =
2240                 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2241         copy_page(swapper_kernel_pmd, initial_kernel_pmd);
2242         swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
2243                 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
2244         set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
2245
2246         set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
2247         xen_write_cr3(cr3);
2248         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
2249
2250         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
2251                           PFN_DOWN(__pa(initial_page_table)));
2252         set_page_prot(initial_page_table, PAGE_KERNEL);
2253         set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
2254
2255         pv_mmu_ops.write_cr3 = &xen_write_cr3;
2256 }
2257
2258 /*
2259  * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
2260  * not the first page table in the page table pool.
2261  * Iterate through the initial page tables to find the real page table base.
2262  */
2263 static phys_addr_t xen_find_pt_base(pmd_t *pmd)
2264 {
2265         phys_addr_t pt_base, paddr;
2266         unsigned pmdidx;
2267
2268         pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
2269
2270         for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
2271                 if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
2272                         paddr = m2p(pmd[pmdidx].pmd);
2273                         pt_base = min(pt_base, paddr);
2274                 }
2275
2276         return pt_base;
2277 }
2278
2279 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
2280 {
2281         pmd_t *kernel_pmd;
2282
2283         kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2284
2285         xen_pt_base = xen_find_pt_base(kernel_pmd);
2286         xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
2287
2288         initial_kernel_pmd =
2289                 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2290
2291         max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
2292
2293         copy_page(initial_kernel_pmd, kernel_pmd);
2294
2295         xen_map_identity_early(initial_kernel_pmd, max_pfn);
2296
2297         copy_page(initial_page_table, pgd);
2298         initial_page_table[KERNEL_PGD_BOUNDARY] =
2299                 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
2300
2301         set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
2302         set_page_prot(initial_page_table, PAGE_KERNEL_RO);
2303         set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2304
2305         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2306
2307         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
2308                           PFN_DOWN(__pa(initial_page_table)));
2309         xen_write_cr3(__pa(initial_page_table));
2310
2311         memblock_reserve(xen_pt_base, xen_pt_size);
2312 }
2313 #endif  /* CONFIG_X86_64 */
2314
2315 void __init xen_reserve_special_pages(void)
2316 {
2317         phys_addr_t paddr;
2318
2319         memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
2320         if (xen_start_info->store_mfn) {
2321                 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
2322                 memblock_reserve(paddr, PAGE_SIZE);
2323         }
2324         if (!xen_initial_domain()) {
2325                 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
2326                 memblock_reserve(paddr, PAGE_SIZE);
2327         }
2328 }
2329
2330 void __init xen_pt_check_e820(void)
2331 {
2332         if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
2333                 xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
2334                 BUG();
2335         }
2336 }
2337
2338 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2339
2340 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2341 {
2342         pte_t pte;
2343
2344         phys >>= PAGE_SHIFT;
2345
2346         switch (idx) {
2347         case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
2348         case FIX_RO_IDT:
2349 #ifdef CONFIG_X86_32
2350         case FIX_WP_TEST:
2351 # ifdef CONFIG_HIGHMEM
2352         case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2353 # endif
2354 #elif defined(CONFIG_X86_VSYSCALL_EMULATION)
2355         case VSYSCALL_PAGE:
2356 #endif
2357         case FIX_TEXT_POKE0:
2358         case FIX_TEXT_POKE1:
2359         case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
2360                 /* All local page mappings */
2361                 pte = pfn_pte(phys, prot);
2362                 break;
2363
2364 #ifdef CONFIG_X86_LOCAL_APIC
2365         case FIX_APIC_BASE:     /* maps dummy local APIC */
2366                 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2367                 break;
2368 #endif
2369
2370 #ifdef CONFIG_X86_IO_APIC
2371         case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
2372                 /*
2373                  * We just don't map the IO APIC - all access is via
2374                  * hypercalls.  Keep the address in the pte for reference.
2375                  */
2376                 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2377                 break;
2378 #endif
2379
2380         case FIX_PARAVIRT_BOOTMAP:
2381                 /* This is an MFN, but it isn't an IO mapping from the
2382                    IO domain */
2383                 pte = mfn_pte(phys, prot);
2384                 break;
2385
2386         default:
2387                 /* By default, set_fixmap is used for hardware mappings */
2388                 pte = mfn_pte(phys, prot);
2389                 break;
2390         }
2391
2392         __native_set_fixmap(idx, pte);
2393
2394 #ifdef CONFIG_X86_VSYSCALL_EMULATION
2395         /* Replicate changes to map the vsyscall page into the user
2396            pagetable vsyscall mapping. */
2397         if (idx == VSYSCALL_PAGE) {
2398                 unsigned long vaddr = __fix_to_virt(idx);
2399                 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2400         }
2401 #endif
2402 }
2403
2404 static void __init xen_post_allocator_init(void)
2405 {
2406         if (xen_feature(XENFEAT_auto_translated_physmap))
2407                 return;
2408
2409         pv_mmu_ops.set_pte = xen_set_pte;
2410         pv_mmu_ops.set_pmd = xen_set_pmd;
2411         pv_mmu_ops.set_pud = xen_set_pud;
2412 #if CONFIG_PGTABLE_LEVELS >= 4
2413         pv_mmu_ops.set_p4d = xen_set_p4d;
2414 #endif
2415
2416         /* This will work as long as patching hasn't happened yet
2417            (which it hasn't) */
2418         pv_mmu_ops.alloc_pte = xen_alloc_pte;
2419         pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2420         pv_mmu_ops.release_pte = xen_release_pte;
2421         pv_mmu_ops.release_pmd = xen_release_pmd;
2422 #if CONFIG_PGTABLE_LEVELS >= 4
2423         pv_mmu_ops.alloc_pud = xen_alloc_pud;
2424         pv_mmu_ops.release_pud = xen_release_pud;
2425 #endif
2426         pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);
2427
2428 #ifdef CONFIG_X86_64
2429         pv_mmu_ops.write_cr3 = &xen_write_cr3;
2430         SetPagePinned(virt_to_page(level3_user_vsyscall));
2431 #endif
2432         xen_mark_init_mm_pinned();
2433 }
2434
/*
 * lazy_mode.leave hook: with preemption disabled, flush the pending
 * multicalls first and only then leave lazy MMU mode.
 */
static void xen_leave_lazy_mmu(void)
{
	preempt_disable();

	xen_mc_flush();
	paravirt_leave_lazy_mmu();

	preempt_enable();
}
2442
2443 static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2444         .read_cr2 = xen_read_cr2,
2445         .write_cr2 = xen_write_cr2,
2446
2447         .read_cr3 = xen_read_cr3,
2448         .write_cr3 = xen_write_cr3_init,
2449
2450         .flush_tlb_user = xen_flush_tlb,
2451         .flush_tlb_kernel = xen_flush_tlb,
2452         .flush_tlb_single = xen_flush_tlb_single,
2453         .flush_tlb_others = xen_flush_tlb_others,
2454
2455         .pte_update = paravirt_nop,
2456
2457         .pgd_alloc = xen_pgd_alloc,
2458         .pgd_free = xen_pgd_free,
2459
2460         .alloc_pte = xen_alloc_pte_init,
2461         .release_pte = xen_release_pte_init,
2462         .alloc_pmd = xen_alloc_pmd_init,
2463         .release_pmd = xen_release_pmd_init,
2464
2465         .set_pte = xen_set_pte_init,
2466         .set_pte_at = xen_set_pte_at,
2467         .set_pmd = xen_set_pmd_hyper,
2468
2469         .ptep_modify_prot_start = __ptep_modify_prot_start,
2470         .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2471
2472         .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2473         .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2474
2475         .make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
2476         .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2477
2478 #ifdef CONFIG_X86_PAE
2479         .set_pte_atomic = xen_set_pte_atomic,
2480         .pte_clear = xen_pte_clear,
2481         .pmd_clear = xen_pmd_clear,
2482 #endif  /* CONFIG_X86_PAE */
2483         .set_pud = xen_set_pud_hyper,
2484
2485         .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2486         .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2487
2488 #if CONFIG_PGTABLE_LEVELS >= 4
2489         .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2490         .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2491         .set_p4d = xen_set_p4d_hyper,
2492
2493         .alloc_pud = xen_alloc_pmd_init,
2494         .release_pud = xen_release_pmd_init,
2495 #endif  /* CONFIG_PGTABLE_LEVELS == 4 */
2496
2497         .activate_mm = xen_activate_mm,
2498         .dup_mmap = xen_dup_mmap,
2499         .exit_mmap = xen_exit_mmap,
2500
2501         .lazy_mode = {
2502                 .enter = paravirt_enter_lazy_mmu,
2503                 .leave = xen_leave_lazy_mmu,
2504                 .flush = paravirt_flush_lazy_mmu,
2505         },
2506
2507         .set_fixmap = xen_set_fixmap,
2508 };
2509
2510 void __init xen_init_mmu_ops(void)
2511 {
2512         x86_init.paging.pagetable_init = xen_pagetable_init;
2513
2514         if (xen_feature(XENFEAT_auto_translated_physmap))
2515                 return;
2516
2517         pv_mmu_ops = xen_mmu_ops;
2518
2519         memset(dummy_mapping, 0xff, PAGE_SIZE);
2520 }
2521
2522 /* Protected by xen_reservation_lock. */
2523 #define MAX_CONTIG_ORDER 9 /* 2MB */
2524 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2525
2526 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2527 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2528                                 unsigned long *in_frames,
2529                                 unsigned long *out_frames)
2530 {
2531         int i;
2532         struct multicall_space mcs;
2533
2534         xen_mc_batch();
2535         for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2536                 mcs = __xen_mc_entry(0);
2537
2538                 if (in_frames)
2539                         in_frames[i] = virt_to_mfn(vaddr);
2540
2541                 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2542                 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2543
2544                 if (out_frames)
2545                         out_frames[i] = virt_to_pfn(vaddr);
2546         }
2547         xen_mc_issue(0);
2548 }
2549
2550 /*
2551  * Update the pfn-to-mfn mappings for a virtual address range, either to
2552  * point to an array of mfns, or contiguously from a single starting
2553  * mfn.
2554  */
2555 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2556                                      unsigned long *mfns,
2557                                      unsigned long first_mfn)
2558 {
2559         unsigned i, limit;
2560         unsigned long mfn;
2561
2562         xen_mc_batch();
2563
2564         limit = 1u << order;
2565         for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2566                 struct multicall_space mcs;
2567                 unsigned flags;
2568
2569                 mcs = __xen_mc_entry(0);
2570                 if (mfns)
2571                         mfn = mfns[i];
2572                 else
2573                         mfn = first_mfn + i;
2574
2575                 if (i < (limit - 1))
2576                         flags = 0;
2577                 else {
2578                         if (order == 0)
2579                                 flags = UVMF_INVLPG | UVMF_ALL;
2580                         else
2581                                 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2582                 }
2583
2584                 MULTI_update_va_mapping(mcs.mc, vaddr,
2585                                 mfn_pte(mfn, PAGE_KERNEL), flags);
2586
2587                 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2588         }
2589
2590         xen_mc_issue(0);
2591 }
2592
2593 /*
2594  * Perform the hypercall to exchange a region of our pfns to point to
2595  * memory with the required contiguous alignment.  Takes the pfns as
2596  * input, and populates mfns as output.
2597  *
2598  * Returns a success code indicating whether the hypervisor was able to
2599  * satisfy the request or not.
2600  */
2601 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2602                                unsigned long *pfns_in,
2603                                unsigned long extents_out,
2604                                unsigned int order_out,
2605                                unsigned long *mfns_out,
2606                                unsigned int address_bits)
2607 {
2608         long rc;
2609         int success;
2610
2611         struct xen_memory_exchange exchange = {
2612                 .in = {
2613                         .nr_extents   = extents_in,
2614                         .extent_order = order_in,
2615                         .extent_start = pfns_in,
2616                         .domid        = DOMID_SELF
2617                 },
2618                 .out = {
2619                         .nr_extents   = extents_out,
2620                         .extent_order = order_out,
2621                         .extent_start = mfns_out,
2622                         .address_bits = address_bits,
2623                         .domid        = DOMID_SELF
2624                 }
2625         };
2626
2627         BUG_ON(extents_in << order_in != extents_out << order_out);
2628
2629         rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2630         success = (exchange.nr_exchanged == extents_in);
2631
2632         BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2633         BUG_ON(success && (rc != 0));
2634
2635         return success;
2636 }
2637
2638 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
2639                                  unsigned int address_bits,
2640                                  dma_addr_t *dma_handle)
2641 {
2642         unsigned long *in_frames = discontig_frames, out_frame;
2643         unsigned long  flags;
2644         int            success;
2645         unsigned long vstart = (unsigned long)phys_to_virt(pstart);
2646
2647         /*
2648          * Currently an auto-translated guest will not perform I/O, nor will
2649          * it require PAE page directories below 4GB. Therefore any calls to
2650          * this function are redundant and can be ignored.
2651          */
2652
2653         if (xen_feature(XENFEAT_auto_translated_physmap))
2654                 return 0;
2655
2656         if (unlikely(order > MAX_CONTIG_ORDER))
2657                 return -ENOMEM;
2658
2659         memset((void *) vstart, 0, PAGE_SIZE << order);
2660
2661         spin_lock_irqsave(&xen_reservation_lock, flags);
2662
2663         /* 1. Zap current PTEs, remembering MFNs. */
2664         xen_zap_pfn_range(vstart, order, in_frames, NULL);
2665
2666         /* 2. Get a new contiguous memory extent. */
2667         out_frame = virt_to_pfn(vstart);
2668         success = xen_exchange_memory(1UL << order, 0, in_frames,
2669                                       1, order, &out_frame,
2670                                       address_bits);
2671
2672         /* 3. Map the new extent in place of old pages. */
2673         if (success)
2674                 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2675         else
2676                 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2677
2678         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2679
2680         *dma_handle = virt_to_machine(vstart).maddr;
2681         return success ? 0 : -ENOMEM;
2682 }
2683 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2684
2685 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
2686 {
2687         unsigned long *out_frames = discontig_frames, in_frame;
2688         unsigned long  flags;
2689         int success;
2690         unsigned long vstart;
2691
2692         if (xen_feature(XENFEAT_auto_translated_physmap))
2693                 return;
2694
2695         if (unlikely(order > MAX_CONTIG_ORDER))
2696                 return;
2697
2698         vstart = (unsigned long)phys_to_virt(pstart);
2699         memset((void *) vstart, 0, PAGE_SIZE << order);
2700
2701         spin_lock_irqsave(&xen_reservation_lock, flags);
2702
2703         /* 1. Find start MFN of contiguous extent. */
2704         in_frame = virt_to_mfn(vstart);
2705
2706         /* 2. Zap current PTEs. */
2707         xen_zap_pfn_range(vstart, order, NULL, out_frames);
2708
2709         /* 3. Do the exchange for non-contiguous MFNs. */
2710         success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2711                                         0, out_frames, 0);
2712
2713         /* 4. Map new pages in place of old pages. */
2714         if (success)
2715                 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2716         else
2717                 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2718
2719         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2720 }
2721 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2722
#ifdef CONFIG_KEXEC_CORE
/*
 * Return the address of vmcoreinfo_note: its machine address when
 * running as a Xen PV domain, otherwise its __pa_symbol() address.
 */
phys_addr_t paddr_vmcoreinfo_note(void)
{
        if (!xen_pv_domain())
                return __pa_symbol(&vmcoreinfo_note);

        return virt_to_machine(&vmcoreinfo_note).maddr;
}
#endif /* CONFIG_KEXEC_CORE */