#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/gfp.h>
+#include <linux/memblock.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/e820.h>
#include <asm/linkage.h>
#include <asm/page.h>
+#include <asm/init.h>
+#include <asm/pat.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
* / \ / \ / /
* p2m p2m p2m p2m p2m p2m p2m ...
*
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
* The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
* maximum representable pseudo-physical address space is:
* P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
* 512 and 1024 entries respectively.
*/
-static unsigned long max_p2m_pfn __read_mostly;
+unsigned long xen_max_p2m_pfn __read_mostly;
#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
top[i] = virt_to_mfn(p2m_mid_missing_mfn);
}
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
static void p2m_mid_init(unsigned long **mid)
{
unsigned i;
*/
void xen_build_mfn_list_list(void)
{
- unsigned pfn;
+ unsigned long pfn;
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
+
p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+ /* Reinitialise, mfn's all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
}
- for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) {
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
unsigned mididx = p2m_mid_index(pfn);
unsigned long **mid;
- unsigned long mid_mfn;
unsigned long *mid_mfn_p;
mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
/* Don't bother allocating any mfn mid levels if
- they're just missing */
- if (mid[mididx] == p2m_missing)
+ * they're just missing, just update the stored mfn,
+ * since all could have changed over a migrate.
+ */
+ if (mid == p2m_mid_missing) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
continue;
-
- mid_mfn = p2m_top_mfn[topidx];
- mid_mfn_p = mfn_to_virt(mid_mfn);
+ }
if (mid_mfn_p == p2m_mid_missing_mfn) {
/*
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_mfn_init(mid_mfn_p);
- mid_mfn = virt_to_mfn(mid_mfn_p);
-
- p2m_top_mfn[topidx] = mid_mfn;
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
}
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
}
}
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
virt_to_mfn(p2m_top_mfn);
- HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn;
+ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
{
unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- unsigned pfn;
+ unsigned long pfn;
- max_p2m_pfn = max_pfn;
+ xen_max_p2m_pfn = max_pfn;
p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_init(p2m_missing);
}
top_mfn_p = &p2m_top_mfn[topidx];
- mid_mfn = mfn_to_virt(*top_mfn_p);
+ mid_mfn = p2m_top_mfn_p[topidx];
+
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
if (mid_mfn == p2m_mid_missing_mfn) {
/* Separately check the mid mfn level */
return false;
p2m_mid_mfn_init(mid_mfn);
-
+
missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
mid_mfn_mfn = virt_to_mfn(mid_mfn);
if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
free_p2m_page(mid_mfn);
+ else
+ p2m_top_mfn_p[topidx] = mid_mfn;
}
if (p2m_top[topidx][mididx] == p2m_missing) {
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_wrprotect(*pte);
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_mkwrite(*pte);
return pte_flags(pte) & _PAGE_IOMAP;
}
-static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
struct multicall_space mcs;
struct mmu_update *u;
u->ptr = arbitrary_virt_to_machine(ptep).maddr;
u->val = pte_val_ma(pteval);
- MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
+ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
+EXPORT_SYMBOL_GPL(xen_set_domain_pte);
+
+static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
+{
+ xen_set_domain_pte(ptep, pteval, DOMID_IO);
+}
static void xen_extend_mmu_update(const struct mmu_update *update)
{
pteval_t xen_pte_val(pte_t pte)
{
- if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
- return pte.pte;
+ pteval_t pteval = pte.pte;
+
+ /* If this is a WC pte, convert back from Xen WC to Linux WC */
+ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
+ WARN_ON(!pat_enabled);
+ pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
+ }
+
+ if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
+ return pteval;
- return pte_mfn_to_pfn(pte.pte);
+ return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
+/*
+ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
+ * are reserved for now, to correspond to the Intel-reserved PAT
+ * types.
+ *
+ * We expect Linux's PAT set as follows:
+ *
+ * Idx PTE flags Linux Xen Default
+ * 0 WB WB WB
+ * 1 PWT WC WT WT
+ * 2 PCD UC- UC- UC-
+ * 3 PCD PWT UC UC UC
+ * 4 PAT WB WC WB
+ * 5 PAT PWT WC WP WT
+ * 6 PAT PCD UC- UC UC-
+ * 7 PAT PCD PWT UC UC UC
+ */
+
+void xen_set_pat(u64 pat)
+{
+ /* We expect Linux to use a PAT setting of
+ * UC UC- WC WB (ignoring the PAT flag) */
+ WARN_ON(pat != 0x0007010600070106ull);
+}
+
pte_t xen_make_pte(pteval_t pte)
{
phys_addr_t addr = (pte & PTE_PFN_MASK);
+ /* If Linux is trying to set a WC pte, then map to the Xen WC.
+ * If _PAGE_PAT is set, then it probably means it is really
+ * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
+ * things work out OK...
+ *
+ * (We should never see kernel mappings with _PAGE_PSE set,
+ * but we could see hugetlbfs mappings, I think.).
+ */
+ if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
+ if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
+ pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
+ }
+
/*
* Unprivileged domains are allowed to do IOMAPpings for
* PCI passthrough, but not map ISA space. The ISA
#endif
}
-#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
+ unsigned long pfn = pte_pfn(pte);
+
+#ifdef CONFIG_X86_32
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
if (pte_val_ma(*ptep) & _PAGE_PRESENT)
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
pte_val_ma(pte));
+#endif
+
+ /*
+ * If the new pfn is within the range of the newly allocated
+ * kernel pagetable, and it isn't being mapped into an
+ * early_ioremap fixmap slot, make sure it is RO.
+ */
+ if (!is_early_ioremap_ptep(ptep) &&
+ pfn >= e820_table_start && pfn < e820_table_end)
+ pte = pte_wrprotect(pte);
return pte;
}
xen_set_pte(ptep, pte);
}
-#endif
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
return __ka(m2p(maddr));
}
+/* Set the page permissions on an identity-mapped pages */
static void set_page_prot(void *addr, pgprot_t prot)
{
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
__xen_write_cr3(true, __pa(pgd));
xen_mc_issue(PARAVIRT_LAZY_CPU);
- reserve_early(__pa(xen_start_info->pt_base),
+ memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
- reserve_early(__pa(xen_start_info->pt_base),
+ memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
}
#endif /* CONFIG_X86_64 */
+static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
+
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
pte_t pte;
# endif
#else
case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- case FIX_APIC_BASE: /* maps dummy local APIC */
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
pte = pfn_pte(phys, prot);
break;
+#ifdef CONFIG_X86_LOCAL_APIC
+ case FIX_APIC_BASE: /* maps dummy local APIC */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
+ /*
+ * We just don't map the IO APIC - all access is via
+ * hypercalls. Keep the address in the pte for reference.
+ */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
case FIX_PARAVIRT_BOOTMAP:
/* This is an MFN, but it isn't an IO mapping from the
IO domain */
#endif
}
+__init void xen_ident_map_ISA(void)
+{
+ unsigned long pa;
+
+ /*
+ * If we're dom0, then linear map the ISA machine addresses into
+ * the kernel's address space.
+ */
+ if (!xen_initial_domain())
+ return;
+
+ xen_raw_printk("Xen: setup ISA identity maps\n");
+
+ for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
+ pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
+
+ if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
+ BUG();
+ }
+
+ xen_flush_tlb();
+}
+
static __init void xen_post_allocator_init(void)
{
pv_mmu_ops.set_pte = xen_set_pte;
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
.alloc_pmd = xen_alloc_pmd_init,
- .alloc_pmd_clone = paravirt_nop,
.release_pmd = xen_release_pmd_init,
-#ifdef CONFIG_X86_64
- .set_pte = xen_set_pte,
-#else
.set_pte = xen_set_pte_init,
-#endif
.set_pte_at = xen_set_pte_at,
.set_pmd = xen_set_pmd_hyper,
pv_mmu_ops = xen_mmu_ops;
vmap_lazy_unmap = false;
+
+ memset(dummy_mapping, 0xff, PAGE_SIZE);
}
/* Protected by xen_reservation_lock. */
}
#endif
+#define REMAP_BATCH_SIZE 16
+
+struct remap_data {
+ unsigned long mfn;
+ pgprot_t prot;
+ struct mmu_update *mmu_update;
+};
+
+static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct remap_data *rmd = data;
+ pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
+
+ rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ rmd->mmu_update->val = pte_val_ma(pte);
+ rmd->mmu_update++;
+
+ return 0;
+}
+
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long mfn, int nr,
+ pgprot_t prot, unsigned domid)
+{
+ struct remap_data rmd;
+ struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+ int batch;
+ unsigned long range;
+ int err = 0;
+
+ prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
+ rmd.mfn = mfn;
+ rmd.prot = prot;
+
+ while (nr) {
+ batch = min(REMAP_BATCH_SIZE, nr);
+ range = (unsigned long)batch << PAGE_SHIFT;
+
+ rmd.mmu_update = mmu_update;
+ err = apply_to_page_range(vma->vm_mm, addr, range,
+ remap_area_mfn_pte_fn, &rmd);
+ if (err)
+ goto out;
+
+ err = -EFAULT;
+ if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
+ goto out;
+
+ nr -= batch;
+ addr += range;
+ }
+
+ err = 0;
+out:
+
+ flush_tlb_all();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_mmu_debug;