#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
+typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
#endif
}
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
+#ifdef CONFIG_X86_64
+ return xchg(sptep, new_spte);
+#else
+ u64 old_spte;
+
+ do {
+ old_spte = *sptep;
+ } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+ return old_spte;
+#endif
+}
+
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte;
+
+ if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) {
+ __set_spte(sptep, new_spte);
+ } else {
+ old_spte = __xchg_spte(sptep, new_spte);
+ if (old_spte & shadow_accessed_mask)
+ mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+ }
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
{
unsigned long idx;
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].write_count;
}
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
-
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
struct kvm_memory_slot *slot;
int *largepage_idx;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
if (slot) {
largepage_idx = slot_largepage_idx(gfn, slot, level);
return *largepage_idx;
/*
* Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
*/
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].rmap_pde;
}
if (!is_rmap_spte(*spte))
return count;
- gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
- pfn_t pfn;
gfn_t gfn;
unsigned long *rmapp;
int i;
- if (!is_rmap_spte(*spte))
- return;
sp = page_header(__pa(spte));
- pfn = spte_to_pfn(*spte);
- if (*spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(pfn);
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
if (!*rmapp) {
}
}
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+ pfn_t pfn;
+ u64 old_spte;
+
+ old_spte = __xchg_spte(sptep, new_spte);
+ if (!is_rmap_spte(old_spte))
+ return;
+ pfn = spte_to_pfn(old_spte);
+ if (old_spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (is_writable_pte(old_spte))
+ kvm_set_pfn_dirty(pfn);
+ rmap_remove(kvm, sptep);
+}
+
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
struct kvm_rmap_desc *desc;
u64 *spte;
int i, write_protected = 0;
- gfn = unalias_gfn(kvm, gfn);
rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
spte = rmap_next(kvm, rmapp, NULL);
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ update_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- rmap_remove(kvm, spte);
+ drop_spte(kvm, spte,
+ shadow_trap_nonpresent_pte);
--kvm->stat.lpages;
- __set_spte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
unsigned long data)
{
int need_flush = 0;
- u64 *spte, new_spte;
+ u64 *spte, new_spte, old_spte;
pte_t *ptep = (pte_t *)data;
pfn_t new_pfn;
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
+ new_spte &= ~shadow_accessed_mask;
if (is_writable_pte(*spte))
kvm_set_pfn_dirty(spte_to_pfn(*spte));
- __set_spte(spte, new_spte);
+ old_spte = __xchg_spte(spte, new_spte);
+ if (is_shadow_present_pte(old_spte)
+ && (old_spte & shadow_accessed_mask))
+ mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
spte = rmap_next(kvm, rmapp, spte);
}
}
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- int idx = gfn_offset;
- idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+ unsigned long idx;
+ int sh;
+
+ sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
+ idx = ((memslot->base_gfn+gfn_offset) >> sh) -
+ (memslot->base_gfn >> sh);
ret |= handler(kvm,
&memslot->lpage_info[j][idx].rmap_pde,
data);
sp = page_header(__pa(spte));
- gfn = unalias_gfn(vcpu->kvm, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
BUG();
}
-
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
if (!sp->multimapped && sp->parent_pte) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ fn(parent_sp, sp->parent_pte);
return;
}
+
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
+ u64 *spte = pte_chain->parent_ptes[i];
+
+ if (!spte)
break;
- parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ parent_sp = page_header(__pa(spte));
+ fn(parent_sp, spte);
}
}
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- unsigned int index;
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- index = spte - sp->spt;
- if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
- sp->unsync_children++;
- WARN_ON(!sp->unsync_children);
+ mmu_parent_walk(sp, mark_unsync);
}
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
+ unsigned int index;
- if (!sp->parent_pte)
+ index = spte - sp->spt;
+ if (__test_and_set_bit(index, sp->unsync_child_bitmap))
return;
-
- if (!sp->multimapped) {
- kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ if (sp->unsync_children++)
return;
- }
-
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
- }
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
- kvm_mmu_update_parents_unsync(sp);
- return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
- mmu_parent_walk(sp, unsync_walk_fn);
- kvm_mmu_update_parents_unsync(sp);
+ kvm_mmu_mark_parents_unsync(sp);
}
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
if (sp->unsync_children) {
- set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
kvm_mmu_mark_parents_unsync(sp);
} else if (sp->unsync)
kvm_mmu_mark_parents_unsync(sp);
--iterator->level;
}
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+ u64 spte;
+
+ spte = __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ __set_spte(sptep, spte);
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For the direct sp, if the guest pte's dirty bit
+ * changed form clean to dirty, it will corrupt the
+ * sp's access: allow writable in the read-only sp,
+ * so we should update the spte at this point to get
+ * a new sp with the correct access.
+ */
+ child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ if (child->role.access == direct_access)
+ return;
+
+ mmu_page_remove_parent_pte(child, sptep);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
} else {
if (is_large_pte(ent))
--kvm->stat.lpages;
- rmap_remove(kvm, &pt[i]);
+ drop_spte(kvm, &pt[i],
+ shadow_trap_nonpresent_pte);
}
}
pt[i] = shadow_trap_nonpresent_pte;
bool need_unsync = false;
for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!can_unsync)
+ return 1;
+
if (s->role.level != PT_PAGE_TABLE_LEVEL)
return 1;
if (!need_unsync && !s->unsync) {
- if (!can_unsync || !oos_shadow)
+ if (!oos_shadow)
return 1;
need_unsync = true;
}
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- rmap_remove(vcpu->kvm, sptep);
- spte = shadow_trap_nonpresent_pte;
- goto set_pte;
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ goto done;
}
spte |= PT_WRITABLE_MASK;
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- __set_spte(sptep, spte);
+ update_spte(sptep, spte);
+done:
return ret;
}
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %lx new %lx\n",
spte_to_pfn(*sptep), pfn);
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
if (is_hwpoison_pfn(pfn)) {
kvm_send_hwpoison_signal(kvm, gfn);
return 0;
- }
+ } else if (is_fault_pfn(pfn))
+ return -EFAULT;
+
return 1;
}
int ret = 0;
if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
ret = 1;
}
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level))
- rmap_remove(vcpu->kvm, spte);
+ drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
if (sp->unsync)
continue;
- gfn = unalias_gfn(vcpu->kvm, sp->gfn);
- slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
+ slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
spte = rmap_next(vcpu->kvm, rmapp, NULL);