arch/x86/kvm/mmu.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * This module enables machines with Intel VT-x extensions to run virtual
   5  * machines without emulation or binary translation.
   6  *
   7  * MMU support
   8  *
   9  * Copyright (C) 2006 Qumranet, Inc.
  10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  11  *
  12  * Authors:
  13  *   Yaniv Kamay  <yaniv@qumranet.com>
  14  *   Avi Kivity   <avi@qumranet.com>
  15  *
  16  * This work is licensed under the terms of the GNU GPL, version 2.  See
  17  * the COPYING file in the top-level directory.
  18  *
  19  */
  20
  21 #include "mmu.h"
  22 #include "x86.h"
  23 #include "kvm_cache_regs.h"
  24
  25 #include <linux/kvm_host.h>
  26 #include <linux/types.h>
  27 #include <linux/string.h>
  28 #include <linux/mm.h>
  29 #include <linux/highmem.h>
  30 #include <linux/module.h>
  31 #include <linux/swap.h>
  32 #include <linux/hugetlb.h>
  33 #include <linux/compiler.h>
  34 #include <linux/srcu.h>
  35 #include <linux/slab.h>
  36 #include <linux/uaccess.h>
  37
  38 #include <asm/page.h>
  39 #include <asm/cmpxchg.h>
  40 #include <asm/io.h>
  41 #include <asm/vmx.h>
  42
  43 /*
  44  * When setting this variable to true it enables Two-Dimensional-Paging
  45  * where the hardware walks 2 page tables:
  46  * 1. the guest-virtual to guest-physical
  47  * 2. while doing 1. it walks guest-physical to host-physical
  48  * If the hardware supports that we don't need to do shadow paging.
  49  */
  50 bool tdp_enabled = false;
  51
  52 #undef MMU_DEBUG
  53
  54 #undef AUDIT
  55
  56 #ifdef AUDIT
  57 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
  58 #else
  59 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
  60 #endif
  61
  62 #ifdef MMU_DEBUG
  63
  64 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
  65 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
  66
  67 #else
  68
  69 #define pgprintk(x...) do { } while (0)
  70 #define rmap_printk(x...) do { } while (0)
  71
  72 #endif
  73
  74 #if defined(MMU_DEBUG) || defined(AUDIT)
  75 static int dbg = 0;
  76 module_param(dbg, bool, 0644);
  77 #endif
  78
  79 static int oos_shadow = 1;
  80 module_param(oos_shadow, bool, 0644);
  81
  82 #ifndef MMU_DEBUG
  83 #define ASSERT(x) do { } while (0)
  84 #else
  85 #define ASSERT(x)                                                       \
  86         if (!(x)) {                                                     \
  87                 printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
  88                        __FILE__, __LINE__, #x);                         \
  89         }
  90 #endif
  91
  92 #define PT_FIRST_AVAIL_BITS_SHIFT 9
  93 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
  94
  95 #define PT64_LEVEL_BITS 9
  96
  97 #define PT64_LEVEL_SHIFT(level) \
  98                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
  99
 100 #define PT64_LEVEL_MASK(level) \
 101                 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
 102
 103 #define PT64_INDEX(address, level)\
 104         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 105
 106
 107 #define PT32_LEVEL_BITS 10
 108
 109 #define PT32_LEVEL_SHIFT(level) \
 110                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
 111
 112 #define PT32_LEVEL_MASK(level) \
 113                 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
 114 #define PT32_LVL_OFFSET_MASK(level) \
 115         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 116                                                 * PT32_LEVEL_BITS))) - 1))
 117
 118 #define PT32_INDEX(address, level)\
 119         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 120
 121
 122 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 123 #define PT64_DIR_BASE_ADDR_MASK \
 124         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
 125 #define PT64_LVL_ADDR_MASK(level) \
 126         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 127                                                 * PT64_LEVEL_BITS))) - 1))
 128 #define PT64_LVL_OFFSET_MASK(level) \
 129         (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 130                                                 * PT64_LEVEL_BITS))) - 1))
 131
 132 #define PT32_BASE_ADDR_MASK PAGE_MASK
 133 #define PT32_DIR_BASE_ADDR_MASK \
 134         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
 135 #define PT32_LVL_ADDR_MASK(level) \
 136         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 137                                             * PT32_LEVEL_BITS))) - 1))
 138
 139 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
 140                         | PT64_NX_MASK)
 141
 142 #define RMAP_EXT 4
 143
 144 #define ACC_EXEC_MASK    1
 145 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
 146 #define ACC_USER_MASK    PT_USER_MASK
 147 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 148
 149 #include <trace/events/kvm.h>
 150
 151 #define CREATE_TRACE_POINTS
 152 #include "mmutrace.h"
 153
 154 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 155
 156 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 157
 158 struct kvm_rmap_desc {
 159         u64 *sptes[RMAP_EXT];
 160         struct kvm_rmap_desc *more;
 161 };
 162
 163 struct kvm_shadow_walk_iterator {
 164         u64 addr;
 165         hpa_t shadow_addr;
 166         int level;
 167         u64 *sptep;
 168         unsigned index;
 169 };
 170
 171 #define for_each_shadow_entry(_vcpu, _addr, _walker)    \
 172         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
 173              shadow_walk_okay(&(_walker));                      \
 174              shadow_walk_next(&(_walker)))
 175
 176 typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 177
 178 static struct kmem_cache *pte_chain_cache;
 179 static struct kmem_cache *rmap_desc_cache;
 180 static struct kmem_cache *mmu_page_header_cache;
 181
 182 static u64 __read_mostly shadow_trap_nonpresent_pte;
 183 static u64 __read_mostly shadow_notrap_nonpresent_pte;
 184 static u64 __read_mostly shadow_base_present_pte;
 185 static u64 __read_mostly shadow_nx_mask;
 186 static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 187 static u64 __read_mostly shadow_user_mask;
 188 static u64 __read_mostly shadow_accessed_mask;
 189 static u64 __read_mostly shadow_dirty_mask;
 190
 191 static inline u64 rsvd_bits(int s, int e)
 192 {
 193         return ((1ULL << (e - s + 1)) - 1) << s;
 194 }
 195
 196 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 197 {
 198         shadow_trap_nonpresent_pte = trap_pte;
 199         shadow_notrap_nonpresent_pte = notrap_pte;
 200 }
 201 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 202
 203 void kvm_mmu_set_base_ptes(u64 base_pte)
 204 {
 205         shadow_base_present_pte = base_pte;
 206 }
 207 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 208
 209 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 210                 u64 dirty_mask, u64 nx_mask, u64 x_mask)
 211 {
 212         shadow_user_mask = user_mask;
 213         shadow_accessed_mask = accessed_mask;
 214         shadow_dirty_mask = dirty_mask;
 215         shadow_nx_mask = nx_mask;
 216         shadow_x_mask = x_mask;
 217 }
 218 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 219
 220 static bool is_write_protection(struct kvm_vcpu *vcpu)
 221 {
 222         return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 223 }
 224
 225 static int is_cpuid_PSE36(void)
 226 {
 227         return 1;
 228 }
 229
 230 static int is_nx(struct kvm_vcpu *vcpu)
 231 {
 232         return vcpu->arch.efer & EFER_NX;
 233 }
 234
 235 static int is_shadow_present_pte(u64 pte)
 236 {
 237         return pte != shadow_trap_nonpresent_pte
 238                 && pte != shadow_notrap_nonpresent_pte;
 239 }
 240
 241 static int is_large_pte(u64 pte)
 242 {
 243         return pte & PT_PAGE_SIZE_MASK;
 244 }
 245
 246 static int is_writable_pte(unsigned long pte)
 247 {
 248         return pte & PT_WRITABLE_MASK;
 249 }
 250
 251 static int is_dirty_gpte(unsigned long pte)
 252 {
 253         return pte & PT_DIRTY_MASK;
 254 }
 255
 256 static int is_rmap_spte(u64 pte)
 257 {
 258         return is_shadow_present_pte(pte);
 259 }
 260
 261 static int is_last_spte(u64 pte, int level)
 262 {
 263         if (level == PT_PAGE_TABLE_LEVEL)
 264                 return 1;
 265         if (is_large_pte(pte))
 266                 return 1;
 267         return 0;
 268 }
 269
 270 static pfn_t spte_to_pfn(u64 pte)
 271 {
 272         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 273 }
 274
 275 static gfn_t pse36_gfn_delta(u32 gpte)
 276 {
 277         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
 278
 279         return (gpte & PT32_DIR_PSE36_MASK) << shift;
 280 }
 281
 282 static void __set_spte(u64 *sptep, u64 spte)
 283 {
 284 #ifdef CONFIG_X86_64
 285         set_64bit((unsigned long *)sptep, spte);
 286 #else
 287         set_64bit((unsigned long long *)sptep, spte);
 288 #endif
 289 }
 290
 291 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 292                                   struct kmem_cache *base_cache, int min)
 293 {
 294         void *obj;
 295
 296         if (cache->nobjs >= min)
 297                 return 0;
 298         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 299                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
 300                 if (!obj)
 301                         return -ENOMEM;
 302                 cache->objects[cache->nobjs++] = obj;
 303         }
 304         return 0;
 305 }
 306
 307 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
 308                                   struct kmem_cache *cache)
 309 {
 310         while (mc->nobjs)
 311                 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
 312 }
 313
 314 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 315                                        int min)
 316 {
 317         struct page *page;
 318
 319         if (cache->nobjs >= min)
 320                 return 0;
 321         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 322                 page = alloc_page(GFP_KERNEL);
 323                 if (!page)
 324                         return -ENOMEM;
 325                 cache->objects[cache->nobjs++] = page_address(page);
 326         }
 327         return 0;
 328 }
 329
 330 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
 331 {
 332         while (mc->nobjs)
 333                 free_page((unsigned long)mc->objects[--mc->nobjs]);
 334 }
 335
 336 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 337 {
 338         int r;
 339
 340         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
 341                                    pte_chain_cache, 4);
 342         if (r)
 343                 goto out;
 344         r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
 345                                    rmap_desc_cache, 4);
 346         if (r)
 347                 goto out;
 348         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
 349         if (r)
 350                 goto out;
 351         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
 352                                    mmu_page_header_cache, 4);
 353 out:
 354         return r;
 355 }
 356
 357 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 358 {
 359         mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
 360         mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
 361         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
 362         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
 363                                 mmu_page_header_cache);
 364 }
 365
 366 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 367                                     size_t size)
 368 {
 369         void *p;
 370
 371         BUG_ON(!mc->nobjs);
 372         p = mc->objects[--mc->nobjs];
 373         return p;
 374 }
 375
 376 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
 377 {
 378         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
 379                                       sizeof(struct kvm_pte_chain));
 380 }
 381
 382 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
 383 {
 384         kmem_cache_free(pte_chain_cache, pc);
 385 }
 386
 387 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
 388 {
 389         return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
 390                                       sizeof(struct kvm_rmap_desc));
 391 }
 392
 393 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 394 {
 395         kmem_cache_free(rmap_desc_cache, rd);
 396 }
 397
 398 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 399 {
 400         if (!sp->role.direct)
 401                 return sp->gfns[index];
 402
 403         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
 404 }
 405
 406 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 407 {
 408         if (sp->role.direct)
 409                 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
 410         else
 411                 sp->gfns[index] = gfn;
 412 }
 413
 414 /*
 415  * Return the pointer to the largepage write count for a given
 416  * gfn, handling slots that are not large page aligned.
 417  */
 418 static int *slot_largepage_idx(gfn_t gfn,
 419                                struct kvm_memory_slot *slot,
 420                                int level)
 421 {
 422         unsigned long idx;
 423
 424         idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
 425               (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 426         return &slot->lpage_info[level - 2][idx].write_count;
 427 }
 428
 429 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 430 {
 431         struct kvm_memory_slot *slot;
 432         int *write_count;
 433         int i;
 434
 435         slot = gfn_to_memslot(kvm, gfn);
 436         for (i = PT_DIRECTORY_LEVEL;
 437              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 438                 write_count   = slot_largepage_idx(gfn, slot, i);
 439                 *write_count += 1;
 440         }
 441 }
 442
 443 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 444 {
 445         struct kvm_memory_slot *slot;
 446         int *write_count;
 447         int i;
 448
 449         slot = gfn_to_memslot(kvm, gfn);
 450         for (i = PT_DIRECTORY_LEVEL;
 451              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 452                 write_count   = slot_largepage_idx(gfn, slot, i);
 453                 *write_count -= 1;
 454                 WARN_ON(*write_count < 0);
 455         }
 456 }
 457
 458 static int has_wrprotected_page(struct kvm *kvm,
 459                                 gfn_t gfn,
 460                                 int level)
 461 {
 462         struct kvm_memory_slot *slot;
 463         int *largepage_idx;
 464
 465         slot = gfn_to_memslot(kvm, gfn);
 466         if (slot) {
 467                 largepage_idx = slot_largepage_idx(gfn, slot, level);
 468                 return *largepage_idx;
 469         }
 470
 471         return 1;
 472 }
 473
 474 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 475 {
 476         unsigned long page_size;
 477         int i, ret = 0;
 478
 479         page_size = kvm_host_page_size(kvm, gfn);
 480
 481         for (i = PT_PAGE_TABLE_LEVEL;
 482              i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
 483                 if (page_size >= KVM_HPAGE_SIZE(i))
 484                         ret = i;
 485                 else
 486                         break;
 487         }
 488
 489         return ret;
 490 }
 491
 492 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 493 {
 494         struct kvm_memory_slot *slot;
 495         int host_level, level, max_level;
 496
 497         slot = gfn_to_memslot(vcpu->kvm, large_gfn);
 498         if (slot && slot->dirty_bitmap)
 499                 return PT_PAGE_TABLE_LEVEL;
 500
 501         host_level = host_mapping_level(vcpu->kvm, large_gfn);
 502
 503         if (host_level == PT_PAGE_TABLE_LEVEL)
 504                 return host_level;
 505
 506         max_level = kvm_x86_ops->get_lpage_level() < host_level ?
 507                 kvm_x86_ops->get_lpage_level() : host_level;
 508
 509         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
 510                 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
 511                         break;
 512
 513         return level - 1;
 514 }
 515
 516 /*
 517  * Take gfn and return the reverse mapping to it.
 518  */
 519
 520 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 521 {
 522         struct kvm_memory_slot *slot;
 523         unsigned long idx;
 524
 525         slot = gfn_to_memslot(kvm, gfn);
 526         if (likely(level == PT_PAGE_TABLE_LEVEL))
 527                 return &slot->rmap[gfn - slot->base_gfn];
 528
 529         idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
 530                 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 531
 532         return &slot->lpage_info[level - 2][idx].rmap_pde;
 533 }
 534
 535 /*
 536  * Reverse mapping data structures:
 537  *
 538  * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
 539  * that points to page_address(page).
 540  *
 541  * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
 542  * containing more mappings.
 543  *
 544  * Returns the number of rmap entries before the spte was added or zero if
 545  * the spte was not added.
 546  *
 547  */
 548 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 549 {
 550         struct kvm_mmu_page *sp;
 551         struct kvm_rmap_desc *desc;
 552         unsigned long *rmapp;
 553         int i, count = 0;
 554
 555         if (!is_rmap_spte(*spte))
 556                 return count;
 557         sp = page_header(__pa(spte));
 558         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
 559         rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 560         if (!*rmapp) {
 561                 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
 562                 *rmapp = (unsigned long)spte;
 563         } else if (!(*rmapp & 1)) {
 564                 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
 565                 desc = mmu_alloc_rmap_desc(vcpu);
 566                 desc->sptes[0] = (u64 *)*rmapp;
 567                 desc->sptes[1] = spte;
 568                 *rmapp = (unsigned long)desc | 1;
 569         } else {
 570                 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
 571                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 572                 while (desc->sptes[RMAP_EXT-1] && desc->more) {
 573                         desc = desc->more;
 574                         count += RMAP_EXT;
 575                 }
 576                 if (desc->sptes[RMAP_EXT-1]) {
 577                         desc->more = mmu_alloc_rmap_desc(vcpu);
 578                         desc = desc->more;
 579                 }
 580                 for (i = 0; desc->sptes[i]; ++i)
 581                         ;
 582                 desc->sptes[i] = spte;
 583         }
 584         return count;
 585 }
 586
 587 static void rmap_desc_remove_entry(unsigned long *rmapp,
 588                                    struct kvm_rmap_desc *desc,
 589                                    int i,
 590                                    struct kvm_rmap_desc *prev_desc)
 591 {
 592         int j;
 593
 594         for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
 595                 ;
 596         desc->sptes[i] = desc->sptes[j];
 597         desc->sptes[j] = NULL;
 598         if (j != 0)
 599                 return;
 600         if (!prev_desc && !desc->more)
 601                 *rmapp = (unsigned long)desc->sptes[0];
 602         else
 603                 if (prev_desc)
 604                         prev_desc->more = desc->more;
 605                 else
 606                         *rmapp = (unsigned long)desc->more | 1;
 607         mmu_free_rmap_desc(desc);
 608 }
 609
 610 static void rmap_remove(struct kvm *kvm, u64 *spte)
 611 {
 612         struct kvm_rmap_desc *desc;
 613         struct kvm_rmap_desc *prev_desc;
 614         struct kvm_mmu_page *sp;
 615         pfn_t pfn;
 616         gfn_t gfn;
 617         unsigned long *rmapp;
 618         int i;
 619
 620         if (!is_rmap_spte(*spte))
 621                 return;
 622         sp = page_header(__pa(spte));
 623         pfn = spte_to_pfn(*spte);
 624         if (*spte & shadow_accessed_mask)
 625                 kvm_set_pfn_accessed(pfn);
 626         if (is_writable_pte(*spte))
 627                 kvm_set_pfn_dirty(pfn);
 628         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 629         rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 630         if (!*rmapp) {
 631                 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 632                 BUG();
 633         } else if (!(*rmapp & 1)) {
 634                 rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
 635                 if ((u64 *)*rmapp != spte) {
 636                         printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
 637                                spte, *spte);
 638                         BUG();
 639                 }
 640                 *rmapp = 0;
 641         } else {
 642                 rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
 643                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 644                 prev_desc = NULL;
 645                 while (desc) {
 646                         for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
 647                                 if (desc->sptes[i] == spte) {
 648                                         rmap_desc_remove_entry(rmapp,
 649                                                                desc, i,
 650                                                                prev_desc);
 651                                         return;
 652                                 }
 653                         prev_desc = desc;
 654                         desc = desc->more;
 655                 }
 656                 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
 657                 BUG();
 658         }
 659 }
 660
 661 static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 662 {
 663         rmap_remove(kvm, sptep);
 664         __set_spte(sptep, new_spte);
 665 }
 666
 667 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 668 {
 669         struct kvm_rmap_desc *desc;
 670         u64 *prev_spte;
 671         int i;
 672
 673         if (!*rmapp)
 674                 return NULL;
 675         else if (!(*rmapp & 1)) {
 676                 if (!spte)
 677                         return (u64 *)*rmapp;
 678                 return NULL;
 679         }
 680         desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 681         prev_spte = NULL;
 682         while (desc) {
 683                 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
 684                         if (prev_spte == spte)
 685                                 return desc->sptes[i];
 686                         prev_spte = desc->sptes[i];
 687                 }
 688                 desc = desc->more;
 689         }
 690         return NULL;
 691 }
 692
 693 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 694 {
 695         unsigned long *rmapp;
 696         u64 *spte;
 697         int i, write_protected = 0;
 698
 699         rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
 700
 701         spte = rmap_next(kvm, rmapp, NULL);
 702         while (spte) {
 703                 BUG_ON(!spte);
 704                 BUG_ON(!(*spte & PT_PRESENT_MASK));
 705                 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 706                 if (is_writable_pte(*spte)) {
 707                         __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
 708                         write_protected = 1;
 709                 }
 710                 spte = rmap_next(kvm, rmapp, spte);
 711         }
 712         if (write_protected) {
 713                 pfn_t pfn;
 714
 715                 spte = rmap_next(kvm, rmapp, NULL);
 716                 pfn = spte_to_pfn(*spte);
 717                 kvm_set_pfn_dirty(pfn);
 718         }
 719
 720         /* check for huge page mappings */
 721         for (i = PT_DIRECTORY_LEVEL;
 722              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 723                 rmapp = gfn_to_rmap(kvm, gfn, i);
 724                 spte = rmap_next(kvm, rmapp, NULL);
 725                 while (spte) {
 726                         BUG_ON(!spte);
 727                         BUG_ON(!(*spte & PT_PRESENT_MASK));
 728                         BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 729                         pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 730                         if (is_writable_pte(*spte)) {
 731                                 drop_spte(kvm, spte,
 732                                           shadow_trap_nonpresent_pte);
 733                                 --kvm->stat.lpages;
 734                                 spte = NULL;
 735                                 write_protected = 1;
 736                         }
 737                         spte = rmap_next(kvm, rmapp, spte);
 738                 }
 739         }
 740
 741         return write_protected;
 742 }
 743
 744 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 745                            unsigned long data)
 746 {
 747         u64 *spte;
 748         int need_tlb_flush = 0;
 749
 750         while ((spte = rmap_next(kvm, rmapp, NULL))) {
 751                 BUG_ON(!(*spte & PT_PRESENT_MASK));
 752                 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 753                 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 754                 need_tlb_flush = 1;
 755         }
 756         return need_tlb_flush;
 757 }
 758
 759 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 760                              unsigned long data)
 761 {
 762         int need_flush = 0;
 763         u64 *spte, new_spte;
 764         pte_t *ptep = (pte_t *)data;
 765         pfn_t new_pfn;
 766
 767         WARN_ON(pte_huge(*ptep));
 768         new_pfn = pte_pfn(*ptep);
 769         spte = rmap_next(kvm, rmapp, NULL);
 770         while (spte) {
 771                 BUG_ON(!is_shadow_present_pte(*spte));
 772                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 773                 need_flush = 1;
 774                 if (pte_write(*ptep)) {
 775                         drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 776                         spte = rmap_next(kvm, rmapp, NULL);
 777                 } else {
 778                         new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 779                         new_spte |= (u64)new_pfn << PAGE_SHIFT;
 780
 781                         new_spte &= ~PT_WRITABLE_MASK;
 782                         new_spte &= ~SPTE_HOST_WRITEABLE;
 783                         if (is_writable_pte(*spte))
 784                                 kvm_set_pfn_dirty(spte_to_pfn(*spte));
 785                         __set_spte(spte, new_spte);
 786                         spte = rmap_next(kvm, rmapp, spte);
 787                 }
 788         }
 789         if (need_flush)
 790                 kvm_flush_remote_tlbs(kvm);
 791
 792         return 0;
 793 }
 794
 795 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 796                           unsigned long data,
 797                           int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 798                                          unsigned long data))
 799 {
 800         int i, j;
 801         int ret;
 802         int retval = 0;
 803         struct kvm_memslots *slots;
 804
 805         slots = kvm_memslots(kvm);
 806
 807         for (i = 0; i < slots->nmemslots; i++) {
 808                 struct kvm_memory_slot *memslot = &slots->memslots[i];
 809                 unsigned long start = memslot->userspace_addr;
 810                 unsigned long end;
 811
 812                 end = start + (memslot->npages << PAGE_SHIFT);
 813                 if (hva >= start && hva < end) {
 814                         gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 815
 816                         ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 817
 818                         for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
 819                                 int idx = gfn_offset;
 820                                 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
 821                                 ret |= handler(kvm,
 822                                         &memslot->lpage_info[j][idx].rmap_pde,
 823                                         data);
 824                         }
 825                         trace_kvm_age_page(hva, memslot, ret);
 826                         retval |= ret;
 827                 }
 828         }
 829
 830         return retval;
 831 }
 832
 833 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 834 {
 835         return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
 836 }
 837
 838 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 839 {
 840         kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
 841 }
 842
 843 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 844                          unsigned long data)
 845 {
 846         u64 *spte;
 847         int young = 0;
 848
 849         /*
 850          * Emulate the accessed bit for EPT, by checking if this page has
 851          * an EPT mapping, and clearing it if it does. On the next access,
 852          * a new EPT mapping will be established.
 853          * This has some overhead, but not as much as the cost of swapping
 854          * out actively used pages or breaking up actively used hugepages.
 855          */
 856         if (!shadow_accessed_mask)
 857                 return kvm_unmap_rmapp(kvm, rmapp, data);
 858
 859         spte = rmap_next(kvm, rmapp, NULL);
 860         while (spte) {
 861                 int _young;
 862                 u64 _spte = *spte;
 863                 BUG_ON(!(_spte & PT_PRESENT_MASK));
 864                 _young = _spte & PT_ACCESSED_MASK;
 865                 if (_young) {
 866                         young = 1;
 867                         clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 868                 }
 869                 spte = rmap_next(kvm, rmapp, spte);
 870         }
 871         return young;
 872 }
 873
 874 #define RMAP_RECYCLE_THRESHOLD 1000
 875
 876 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 877 {
 878         unsigned long *rmapp;
 879         struct kvm_mmu_page *sp;
 880
 881         sp = page_header(__pa(spte));
 882
 883         rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 884
 885         kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
 886         kvm_flush_remote_tlbs(vcpu->kvm);
 887 }
 888
 /*
  * Age the mappings of the page backing @hva: run kvm_age_rmapp through
  * kvm_handle_hva() with a data argument of 0 and return its result.
  */
 889 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 890 {
 891         return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
 892 }
 893
 #ifdef MMU_DEBUG
 /*
  * Debug helper: check that no entry of the shadow page table @spt is
  * shadow-present.
  *
  * Returns 1 if the whole page is empty.  On the first present entry its
  * address and value are logged and 0 is returned.
  */
 static int is_empty_shadow_page(u64 *spt)
 {
         unsigned int i;

         for (i = 0; i < PAGE_SIZE / sizeof(u64); i++) {
                 u64 *entry = &spt[i];

                 if (!is_shadow_present_pte(*entry))
                         continue;

                 printk(KERN_ERR "%s: %p %llx\n", __func__,
                        entry, *entry);
                 return 0;
         }

         return 1;
 }
 #endif
 909
 /*
  * Release a shadow page and everything allocated for it: unlink @sp from
  * its hash chain and from the list it is on, free the spt page, free the
  * gfns page (only for non-direct pages) and the header, then credit one
  * page back to kvm->arch.n_free_mmu_pages.
  */
 910 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 911 {
      /*
       * is_empty_shadow_page() is only defined under MMU_DEBUG; presumably
       * ASSERT() compiles away otherwise -- confirm.
       */
 912         ASSERT(is_empty_shadow_page(sp->spt));
 913         hlist_del(&sp->hash_link);
 914         list_del(&sp->link);
 915         __free_page(virt_to_page(sp->spt));
      /* Mirrors kvm_mmu_alloc_page(): gfns is only allocated when !direct. */
 916         if (!sp->role.direct)
 917                 __free_page(virt_to_page(sp->gfns));
 918         kmem_cache_free(mmu_page_header_cache, sp);
 919         ++kvm->arch.n_free_mmu_pages;
 920 }
 921
 922 static unsigned kvm_page_table_hashfn(gfn_t gfn)
 923 {
 924         return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
 925 }
 926
 /*
  * Allocate and initialise a shadow page header plus its backing pages.
  *
  * All memory comes from the vcpu's mmu memory caches.  The gfns page is only
  * allocated when @direct is zero.  The new page is added to
  * kvm->arch.active_mmu_pages, starts with @parent_pte as its single
  * recorded parent (multimapped == 0) and is charged against
  * kvm->arch.n_free_mmu_pages.
  *
  * None of the cache allocations is checked for failure here.
  */
 927 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 928                                                u64 *parent_pte, int direct)
 929 {
 930         struct kvm_mmu_page *sp;
 931
 932         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
 933         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 934         if (!direct)
 935                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
 936                                                   PAGE_SIZE);
      /*
       * Store a back-pointer to the header in the spt page's private field;
       * presumably this is what page_header() reads -- confirm.
       */
 937         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 938         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 939         bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 940         sp->multimapped = 0;
 941         sp->parent_pte = parent_pte;
 942         --vcpu->kvm->arch.n_free_mmu_pages;
 943         return sp;
 944 }
 945
 946 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
 947                                     struct kvm_mmu_page *sp, u64 *parent_pte)
 948 {
 949         struct kvm_pte_chain *pte_chain;
 950         struct hlist_node *node;
 951         int i;
 952
 953         if (!parent_pte)
 954                 return;
 955         if (!sp->multimapped) {
 956                 u64 *old = sp->parent_pte;
 957
 958                 if (!old) {
 959                         sp->parent_pte = parent_pte;
 960                         return;
 961                 }
 962                 sp->multimapped = 1;
 963                 pte_chain = mmu_alloc_pte_chain(vcpu);
 964                 INIT_HLIST_HEAD(&sp->parent_ptes);
 965                 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
 966                 pte_chain->parent_ptes[0] = old;
 967         }
 968         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
 969                 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
 970                         continue;
 971                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
 972                         if (!pte_chain->parent_ptes[i]) {
 973                                 pte_chain->parent_ptes[i] = parent_pte;
 974                                 return;
 975                         }
 976         }
 977         pte_chain = mmu_alloc_pte_chain(vcpu);
 978         BUG_ON(!pte_chain);
 979         hlist_add_head(&pte_chain->link, &sp->parent_ptes);
 980         pte_chain->parent_ptes[0] = parent_pte;
 981 }
 982
 /*
  * Remove @parent_pte from the parent sptes recorded for @sp.
  *
  * BUGs if @parent_pte is not recorded (single-parent mismatch, or not found
  * in any chain).
  */
 983 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 984                                        u64 *parent_pte)
 985 {
 986         struct kvm_pte_chain *pte_chain;
 987         struct hlist_node *node;
 988         int i;
 989
      /* Single-parent case: the one recorded pointer must be @parent_pte. */
 990         if (!sp->multimapped) {
 991                 BUG_ON(sp->parent_pte != parent_pte);
 992                 sp->parent_pte = NULL;
 993                 return;
 994         }
 995         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
 996                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
                      /* A NULL slot ends the used part of this chain. */
 997                         if (!pte_chain->parent_ptes[i])
 998                                 break;
 999                         if (pte_chain->parent_ptes[i] != parent_pte)
1000                                 continue;
                      /*
                       * Found it: shift the following used entries down by
                       * one so the used slots stay contiguous, then clear
                       * the last used slot.
                       */
1001                         while (i + 1 < NR_PTE_CHAIN_ENTRIES
1002                                 && pte_chain->parent_ptes[i + 1]) {
1003                                 pte_chain->parent_ptes[i]
1004                                         = pte_chain->parent_ptes[i + 1];
1005                                 ++i;
1006                         }
1007                         pte_chain->parent_ptes[i] = NULL;
                      /*
                       * i == 0 here means slot 0 was just cleared, i.e. the
                       * chain is now empty: free it, and fall back to
                       * single-parent mode once no chain is left.
                       */
1008                         if (i == 0) {
1009                                 hlist_del(&pte_chain->link);
1010                                 mmu_free_pte_chain(pte_chain);
1011                                 if (hlist_empty(&sp->parent_ptes)) {
1012                                         sp->multimapped = 0;
1013                                         sp->parent_pte = NULL;
1014                                 }
1015                         }
1016                         return;
1017                 }
1018         BUG();
1019 }
1020
1021 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1022 {
1023         struct kvm_pte_chain *pte_chain;
1024         struct hlist_node *node;
1025         struct kvm_mmu_page *parent_sp;
1026         int i;
1027
1028         if (!sp->multimapped && sp->parent_pte) {
1029                 parent_sp = page_header(__pa(sp->parent_pte));
1030                 fn(parent_sp, sp->parent_pte);
1031                 return;
1032         }
1033
1034         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1035                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1036                         u64 *spte = pte_chain->parent_ptes[i];
1037
1038                         if (!spte)
1039                                 break;
1040                         parent_sp = page_header(__pa(spte));
1041                         fn(parent_sp, spte);
1042                 }
1043 }
1044
/*
 * mark_unsync() and kvm_mmu_mark_parents_unsync() call each other, hence the
 * forward declaration.
 *
 * kvm_mmu_mark_parents_unsync() applies mark_unsync() to every parent spte
 * of @sp via mmu_parent_walk().
 */
1045 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1046 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1047 {
1048         mmu_parent_walk(sp, mark_unsync);
1049 }
1050
1051 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1052 {
1053         unsigned int index;
1054
1055         index = spte - sp->spt;
1056         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1057                 return;
1058         if (sp->unsync_children++)
1059                 return;
1060         kvm_mmu_mark_parents_unsync(sp);
1061 }
1062
1063 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1064                                     struct kvm_mmu_page *sp)
1065 {
1066         int i;
1067
1068         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1069                 sp->spt[i] = shadow_trap_nonpresent_pte;
1070 }
1071
/*
 * sync_page implementation that does nothing and always returns 1.
 *
 * In the callers visible in this file (__kvm_sync_page(), kvm_sync_pages())
 * a nonzero return from mmu.sync_page causes the page to be zapped.
 * NOTE(review): presumably installed as vcpu->arch.mmu.sync_page for
 * non-paging guests; the assignment is not visible here.
 */
1072 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1073                                struct kvm_mmu_page *sp, bool clear_unsync)
1074 {
1075         return 1;
1076 }
1077
/* invlpg implementation that intentionally does nothing. */
1078 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1079 {
1080 }
1081
/* Capacity of struct kvm_mmu_pages: pages gathered per unsync walk batch. */
1082 #define KVM_PAGE_ARRAY_NR 16
1083
/*
 * Fixed-size vector of shadow pages gathered by mmu_unsync_walk().
 *
 * page[].sp is the collected page and page[].idx the index passed to
 * mmu_pages_add() along with it (in __mmu_unsync_walk() that is the slot in
 * the parent's table through which the page was reached).  nr is the number
 * of valid entries.
 */
1084 struct kvm_mmu_pages {
1085         struct mmu_page_and_offset {
1086                 struct kvm_mmu_page *sp;
1087                 unsigned int idx;
1088         } page[KVM_PAGE_ARRAY_NR];
1089         unsigned int nr;
1090 };
1091
/*
 * Iterate @idx over the bits set in @bitmap, looking at the first 512 bits
 * only.  NOTE(review): 512 presumably equals the number of entries in a
 * shadow page table (PT64_ENT_PER_PAGE) -- confirm.
 */
1092 #define for_each_unsync_children(bitmap, idx)           \
1093         for (idx = find_first_bit(bitmap, 512);         \
1094              idx < 512;                                 \
1095              idx = find_next_bit(bitmap, 512, idx+1))
1096
1097 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1098                          int idx)
1099 {
1100         int i;
1101
1102         if (sp->unsync)
1103                 for (i=0; i < pvec->nr; i++)
1104                         if (pvec->page[i].sp == sp)
1105                                 return 0;
1106
1107         pvec->page[pvec->nr].sp = sp;
1108         pvec->page[pvec->nr].idx = idx;
1109         pvec->nr++;
1110         return (pvec->nr == KVM_PAGE_ARRAY_NR);
1111 }
1112
/*
 * Recursively collect into @pvec the pages below @sp that are unsync or have
 * unsync children, following the bits set in sp->unsync_child_bitmap.
 *
 * Returns the number of unsync leaf pages counted, -ENOSPC as soon as
 * mmu_pages_add() reports that @pvec is full, or a negative value propagated
 * from a recursive call.
 *
 * Bits whose entry turns out to have nothing unsync behind it are cleared on
 * the way and sp->unsync_children is decremented accordingly.
 */
1113 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1114                            struct kvm_mmu_pages *pvec)
1115 {
1116         int i, ret, nr_unsync_leaf = 0;
1117
1118         for_each_unsync_children(sp->unsync_child_bitmap, i) {
1119                 struct kvm_mmu_page *child;
1120                 u64 ent = sp->spt[i];
1121
                     /* Stale bit: the entry no longer points to a lower table. */
1122                 if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1123                         goto clear_child_bitmap;
1124
1125                 child = page_header(ent & PT64_BASE_ADDR_MASK);
1126
1127                 if (child->unsync_children) {
                             /* Intermediate page: record it, then descend. */
1128                         if (mmu_pages_add(pvec, child, i))
1129                                 return -ENOSPC;
1130
1131                         ret = __mmu_unsync_walk(child, pvec);
                             /* 0: nothing unsync was found below the child. */
1132                         if (!ret)
1133                                 goto clear_child_bitmap;
1134                         else if (ret > 0)
1135                                 nr_unsync_leaf += ret;
1136                         else
1137                                 return ret;
1138                 } else if (child->unsync) {
1139                         nr_unsync_leaf++;
1140                         if (mmu_pages_add(pvec, child, i))
1141                                 return -ENOSPC;
1142                 } else
1143                          goto clear_child_bitmap;
1144
1145                 continue;
1146
1147 clear_child_bitmap:
1148                 __clear_bit(i, sp->unsync_child_bitmap);
1149                 sp->unsync_children--;
1150                 WARN_ON((int)sp->unsync_children < 0);
1151         }
1152
1153
1154         return nr_unsync_leaf;
1155 }
1156
/*
 * Entry point of the unsync walk below @sp.
 *
 * Returns 0 immediately when @sp has no unsync children.  Otherwise @sp
 * itself is put into @pvec first (with index 0, return value ignored) and
 * the result of __mmu_unsync_walk() is returned.
 */
1157 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1158                            struct kvm_mmu_pages *pvec)
1159 {
1160         if (!sp->unsync_children)
1161                 return 0;
1162
1163         mmu_pages_add(pvec, sp, 0);
1164         return __mmu_unsync_walk(sp, pvec);
1165 }
1166
/*
 * Clear the unsync state of @sp: emit the sync tracepoint, reset sp->unsync
 * and decrement the mmu_unsync statistic.  Warns if @sp was not unsync.
 */
1167 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1168 {
1169         WARN_ON(!sp->unsync);
1170         trace_kvm_mmu_sync_page(sp);
1171         sp->unsync = 0;
1172         --kvm->stat.mmu_unsync;
1173 }
1174
/*
 * Forward declarations: the sync helpers below zap pages, while the zap
 * functions themselves are defined further down in this file.
 */
1175 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1176                                     struct list_head *invalid_list);
1177 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1178                                     struct list_head *invalid_list);
1179
/*
 * Iterate @sp over the shadow pages in the hash bucket of @gfn whose gfn
 * equals @gfn; @pos is the struct hlist_node cursor.
 *
 * The macro ends in a dangling "if (...) {} else", so the statement that
 * follows it becomes the else branch, i.e. the loop body for matching pages.
 *
 * The parameter named gfn is also substituted into the member access
 * "(sp)->gfn", so the @gfn argument has to be an identifier spelled gfn.
 */
1180 #define for_each_gfn_sp(kvm, sp, gfn, pos)                              \
1181   hlist_for_each_entry(sp, pos,                                         \
1182    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
1183         if ((sp)->gfn != (gfn)) {} else
1184
/*
 * Like for_each_gfn_sp(), but additionally skips pages that are direct
 * (role.direct) or already invalid (role.invalid).
 *
 * The same caveats apply: the trailing "if (...) {} else" turns the following
 * statement into the loop body, and the @gfn argument has to be spelled gfn
 * because the parameter is also substituted into "(sp)->gfn".
 */
1185 #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)               \
1186   hlist_for_each_entry(sp, pos,                                         \
1187    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
1188                 if ((sp)->gfn != (gfn) || (sp)->role.direct ||          \
1189                         (sp)->role.invalid) {} else
1190
1191 /* @sp->gfn should be write-protected at the call site */
1192 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1193                            struct list_head *invalid_list, bool clear_unsync)
1194 {
1195         if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1196                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1197                 return 1;
1198         }
1199
1200         if (clear_unsync)
1201                 kvm_unlink_unsync_page(vcpu->kvm, sp);
1202
1203         if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1204                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1205                 return 1;
1206         }
1207
1208         kvm_mmu_flush_tlb(vcpu);
1209         return 0;
1210 }
1211
1212 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1213                                    struct kvm_mmu_page *sp)
1214 {
1215         LIST_HEAD(invalid_list);
1216         int ret;
1217
1218         ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1219         if (ret)
1220                 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1221
1222         return ret;
1223 }
1224
/*
 * Sync @sp and clear its unsync state (__kvm_sync_page() with
 * clear_unsync == true).  A page that fails to sync is queued on
 * @invalid_list; the caller commits the zap.  Returns 0 on success,
 * 1 if the page was queued for zapping.
 */
1225 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1226                          struct list_head *invalid_list)
1227 {
1228         return __kvm_sync_page(vcpu, sp, invalid_list, true);
1229 }
1230
1231 /* @gfn should be write-protected at the call site */
1232 static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1233 {
1234         struct kvm_mmu_page *s;
1235         struct hlist_node *node;
1236         LIST_HEAD(invalid_list);
1237         bool flush = false;
1238
1239         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1240                 if (!s->unsync)
1241                         continue;
1242
1243                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1244                 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1245                         (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1246                         kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1247                         continue;
1248                 }
1249                 kvm_unlink_unsync_page(vcpu->kvm, s);
1250                 flush = true;
1251         }
1252
1253         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1254         if (flush)
1255                 kvm_mmu_flush_tlb(vcpu);
1256 }
1257
1258 struct mmu_page_path {
1259         struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1260         unsigned int idx[PT64_ROOT_LEVEL-1];
1261 };
1262
/*
 * Iterate over the entries of @pvec that mmu_pages_next() stops at (pages at
 * PT_PAGE_TABLE_LEVEL), assigning each to @sp while @parents is updated to
 * describe the path leading to it.  @i is the position in @pvec.
 *
 * @pvec and @parents are passed as objects, not pointers: the macro takes
 * their address and uses '.' on them.  Because the parameter named sp is
 * also substituted into the member access ".sp", the @sp argument has to be
 * an identifier literally called sp.
 *
 * Note that the initialisation clause reads pvec.page[i].sp before i has
 * been compared against pvec.nr.
 */
1263 #define for_each_sp(pvec, sp, parents, i)                       \
1264                 for (i = mmu_pages_next(&pvec, &parents, -1),   \
1265                         sp = pvec.page[i].sp;                   \
1266                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
1267                         i = mmu_pages_next(&pvec, &parents, i))
1268
1269 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1270                           struct mmu_page_path *parents,
1271                           int i)
1272 {
1273         int n;
1274
1275         for (n = i+1; n < pvec->nr; n++) {
1276                 struct kvm_mmu_page *sp = pvec->page[n].sp;
1277
1278                 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1279                         parents->idx[0] = pvec->page[n].idx;
1280                         return n;
1281                 }
1282
1283                 parents->parent[sp->role.level-2] = sp;
1284                 parents->idx[sp->role.level-1] = pvec->page[n].idx;
1285         }
1286
1287         return n;
1288 }
1289
1290 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1291 {
1292         struct kvm_mmu_page *sp;
1293         unsigned int level = 0;
1294
1295         do {
1296                 unsigned int idx = parents->idx[level];
1297
1298                 sp = parents->parent[level];
1299                 if (!sp)
1300                         return;
1301
1302                 --sp->unsync_children;
1303                 WARN_ON((int)sp->unsync_children < 0);
1304                 __clear_bit(idx, sp->unsync_child_bitmap);
1305                 level++;
1306         } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1307 }
1308
/*
 * Reset the walk state before (re)starting an unsync walk from @parent.
 *
 * A NULL terminator is written to parents->parent[parent->role.level - 1];
 * mmu_pages_clear_parents() stops when it reads a NULL parent.  This
 * requires parents->parent[] to have at least parent->role.level elements.
 * @pvec is emptied.
 */
1309 static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1310                                struct mmu_page_path *parents,
1311                                struct kvm_mmu_pages *pvec)
1312 {
1313         parents->parent[parent->role.level-1] = NULL;
1314         pvec->nr = 0;
1315 }
1316
/*
 * Sync all unsync pages below @parent.
 *
 * Works in batches of at most KVM_PAGE_ARRAY_NR collected pages: each batch
 * is first write-protected (with one remote TLB flush if anything changed),
 * then every leaf is synced and its ancestors' unsync bits are cleared.
 * The loop repeats until mmu_unsync_walk() returns 0.
 */
1317 static void mmu_sync_children(struct kvm_vcpu *vcpu,
1318                               struct kvm_mmu_page *parent)
1319 {
1320         int i;
     /* Must be named sp: for_each_sp() also substitutes it into ".sp". */
1321         struct kvm_mmu_page *sp;
1322         struct mmu_page_path parents;
1323         struct kvm_mmu_pages pages;
1324         LIST_HEAD(invalid_list);
1325
1326         kvm_mmu_pages_init(parent, &parents, &pages);
1327         while (mmu_unsync_walk(parent, &pages)) {
1328                 int protected = 0;
1329
1330                 for_each_sp(pages, sp, parents, i)
1331                         protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1332
1333                 if (protected)
1334                         kvm_flush_remote_tlbs(vcpu->kvm);
1335
1336                 for_each_sp(pages, sp, parents, i) {
1337                         kvm_sync_page(vcpu, sp, &invalid_list);
1338                         mmu_pages_clear_parents(&parents);
1339                 }
                     /* Pages that failed to sync are freed before rescheduling. */
1340                 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1341                 cond_resched_lock(&vcpu->kvm->mmu_lock);
1342                 kvm_mmu_pages_init(parent, &parents, &pages);
1343         }
1344 }
1345
/*
 * Look up the shadow page for @gfn with the role derived from the arguments,
 * or create it if none exists.
 *
 * The role starts from vcpu->arch.mmu.base_role and is refined with @level,
 * @direct and @access.  cr4_pae is cleared for direct pages.  For shadow
 * paging of guests whose root level is at most PT32_ROOT_LEVEL, the quadrant
 * is computed from @gaddr.
 *
 * On a hit, @parent_pte is added to the page's parents and unsync state is
 * propagated upwards as needed.  On a miss a new page is allocated, hashed,
 * and (for non-direct pages) the gfn is write-protected and accounted as
 * shadowed.
 */
1346 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1347                                              gfn_t gfn,
1348                                              gva_t gaddr,
1349                                              unsigned level,
1350                                              int direct,
1351                                              unsigned access,
1352                                              u64 *parent_pte)
1353 {
1354         union kvm_mmu_page_role role;
1355         unsigned quadrant;
1356         struct kvm_mmu_page *sp;
1357         struct hlist_node *node;
1358         bool need_sync = false;
1359
1360         role = vcpu->arch.mmu.base_role;
1361         role.level = level;
1362         role.direct = direct;
1363         if (role.direct)
1364                 role.cr4_pae = 0;
1365         role.access = access;
1366         if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1367                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1368                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1369                 role.quadrant = quadrant;
1370         }
1371         for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
                     /* Remember whether any page shadowing this gfn is unsync. */
1372                 if (!need_sync && sp->unsync)
1373                         need_sync = true;
1374
1375                 if (sp->role.word != role.word)
1376                         continue;
1377
                     /*
                      * A matching unsync page is synced first; if that fails
                      * it has been zapped, so stop searching and allocate a
                      * new page below.
                      */
1378                 if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1379                         break;
1380
1381                 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1382                 if (sp->unsync_children) {
1383                         kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1384                         kvm_mmu_mark_parents_unsync(sp);
1385                 } else if (sp->unsync)
1386                         kvm_mmu_mark_parents_unsync(sp);
1387
1388                 trace_kvm_mmu_get_page(sp, false);
1389                 return sp;
1390         }
1391         ++vcpu->kvm->stat.mmu_cache_miss;
1392         sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1393         if (!sp)
1394                 return sp;
1395         sp->gfn = gfn;
1396         sp->role = role;
1397         hlist_add_head(&sp->hash_link,
1398                 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1399         if (!direct) {
1400                 if (rmap_write_protect(vcpu->kvm, gfn))
1401                         kvm_flush_remote_tlbs(vcpu->kvm);
                     /*
                      * A new non-leaf page for a gfn that also has unsync
                      * pages: sync those now.
                      */
1402                 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1403                         kvm_sync_pages(vcpu, gfn);
1404
1405                 account_shadowed(vcpu->kvm, gfn);
1406         }
             /*
              * Use the mode-specific prefetch only when trapping and
              * non-trapping non-present sptes are distinct values; otherwise
              * fill the page with the trapping value.
              */
1407         if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1408                 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1409         else
1410                 nonpaging_prefetch_page(vcpu, sp);
1411         trace_kvm_mmu_get_page(sp, true);
1412         return sp;
1413 }
1414
1415 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1416                              struct kvm_vcpu *vcpu, u64 addr)
1417 {
1418         iterator->addr = addr;
1419         iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1420         iterator->level = vcpu->arch.mmu.shadow_root_level;
1421         if (iterator->level == PT32E_ROOT_LEVEL) {
1422                 iterator->shadow_addr
1423                         = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1424                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1425                 --iterator->level;
1426                 if (!iterator->shadow_addr)
1427                         iterator->level = 0;
1428         }
1429 }
1430
1431 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1432 {
1433         if (iterator->level < PT_PAGE_TABLE_LEVEL)
1434                 return false;
1435
1436         if (iterator->level == PT_PAGE_TABLE_LEVEL)
1437                 if (is_large_pte(*iterator->sptep))
1438                         return false;
1439
1440         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1441         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1442         return true;
1443 }
1444
/*
 * Descend one level: the table address is taken from the current spte
 * (masked with PT64_BASE_ADDR_MASK) and the level is decremented.  The spte
 * is not checked for presence here.
 */
1445 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1446 {
1447         iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1448         --iterator->level;
1449 }
1450
1451 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1452                                          struct kvm_mmu_page *sp)
1453 {
1454         unsigned i;
1455         u64 *pt;
1456         u64 ent;
1457
1458         pt = sp->spt;
1459
1460         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1461                 ent = pt[i];
1462
1463                 if (is_shadow_present_pte(ent)) {
1464                         if (!is_last_spte(ent, sp->role.level)) {
1465                                 ent &= PT64_BASE_ADDR_MASK;
1466                                 mmu_page_remove_parent_pte(page_header(ent),
1467                                                            &pt[i]);
1468                         } else {
1469                                 if (is_large_pte(ent))
1470                                         --kvm->stat.lpages;
1471                                 drop_spte(kvm, &pt[i],
1472                                           shadow_trap_nonpresent_pte);
1473                         }
1474                 }
1475                 pt[i] = shadow_trap_nonpresent_pte;
1476         }
1477 }
1478
/*
 * Drop the reference @parent_pte holds on @sp by removing it from @sp's
 * recorded parents.  The spte itself is not modified here.
 */
1479 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1480 {
1481         mmu_page_remove_parent_pte(sp, parent_pte);
1482 }
1483
/* Reset arch.last_pte_updated to NULL on every vcpu of @kvm. */
1484 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1485 {
1486         int i;
1487         struct kvm_vcpu *vcpu;
1488
1489         kvm_for_each_vcpu(i, vcpu, kvm)
1490                 vcpu->arch.last_pte_updated = NULL;
1491 }
1492
1493 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1494 {
1495         u64 *parent_pte;
1496
1497         while (sp->multimapped || sp->parent_pte) {
1498                 if (!sp->multimapped)
1499                         parent_pte = sp->parent_pte;
1500                 else {
1501                         struct kvm_pte_chain *chain;
1502
1503                         chain = container_of(sp->parent_ptes.first,
1504                                              struct kvm_pte_chain, link);
1505                         parent_pte = chain->parent_ptes[0];
1506                 }
1507                 BUG_ON(!parent_pte);
1508                 kvm_mmu_put_page(sp, parent_pte);
1509                 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1510         }
1511 }
1512
/*
 * Zap every unsync leaf page found below @parent, queueing the pages on
 * @invalid_list.
 *
 * Returns the number of kvm_mmu_prepare_zap_page() calls made here; a
 * leaf-level @parent has no children and yields 0.  Committing the zap is
 * left to the caller.
 */
1513 static int mmu_zap_unsync_children(struct kvm *kvm,
1514                                    struct kvm_mmu_page *parent,
1515                                    struct list_head *invalid_list)
1516 {
1517         int i, zapped = 0;
1518         struct mmu_page_path parents;
1519         struct kvm_mmu_pages pages;
1520
1521         if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1522                 return 0;
1523
1524         kvm_mmu_pages_init(parent, &parents, &pages);
             /* Batches of up to KVM_PAGE_ARRAY_NR pages until nothing is left. */
1525         while (mmu_unsync_walk(parent, &pages)) {
                     /* Must be named sp: for_each_sp() also substitutes it into ".sp". */
1526                 struct kvm_mmu_page *sp;
1527
1528                 for_each_sp(pages, sp, parents, i) {
1529                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1530                         mmu_pages_clear_parents(&parents);
1531                         zapped++;
1532                 }
1533                 kvm_mmu_pages_init(parent, &parents, &pages);
1534         }
1535
1536         return zapped;
1537 }
1538
/*
 * First half of zapping a shadow page: detach @sp from the shadow page
 * hierarchy and mark it invalid.  The page is only freed later, by
 * kvm_mmu_commit_zap_page().
 *
 * Returns the count reported by mmu_zap_unsync_children(), plus one if @sp
 * itself was moved to @invalid_list.
 */
1539 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1540                                     struct list_head *invalid_list)
1541 {
1542         int ret;
1543
1544         trace_kvm_mmu_prepare_zap_page(sp);
1545         ++kvm->stat.mmu_shadow_zapped;
1546         ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1547         kvm_mmu_page_unlink_children(kvm, sp);
1548         kvm_mmu_unlink_parents(kvm, sp);
             /*
              * role.invalid is only set at the end of this function, so a
              * page that is zapped a second time is not unaccounted twice.
              */
1549         if (!sp->role.invalid && !sp->role.direct)
1550                 unaccount_shadowed(kvm, sp->gfn);
1551         if (sp->unsync)
1552                 kvm_unlink_unsync_page(kvm, sp);
1553         if (!sp->root_count) {
1554                 /* Count self */
1555                 ret++;
1556                 list_move(&sp->link, invalid_list);
1557         } else {
                     /*
                      * Still in use as a root: keep it on the active list
                      * and make all vcpus reload their mmu.
                      */
1558                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1559                 kvm_reload_remote_mmus(kvm);
1560         }
1561
1562         sp->role.invalid = 1;
1563         kvm_mmu_reset_last_pte_updated(kvm);
1564         return ret;
1565 }
1566
1567 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1568                                     struct list_head *invalid_list)
1569 {
1570         struct kvm_mmu_page *sp;
1571
1572         if (list_empty(invalid_list))
1573                 return;
1574
1575         kvm_flush_remote_tlbs(kvm);
1576
1577         do {
1578                 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1579                 WARN_ON(!sp->role.invalid || sp->root_count);
1580                 kvm_mmu_free_page(kvm, sp);
1581         } while (!list_empty(invalid_list));
1582
1583 }
1584
1585 /*
1586  * Changing the number of mmu pages allocated to the vm
1587  * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
1588  */
1589 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1590 {
1591         int used_pages;
1592         LIST_HEAD(invalid_list);
1593
1594         used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1595         used_pages = max(0, used_pages);
1596
1597         /*
1598          * If we set the number of mmu pages to be smaller be than the
1599          * number of actived pages , we must to free some mmu pages before we
1600          * change the value
1601          */
1602
1603         if (used_pages > kvm_nr_mmu_pages) {
1604                 while (used_pages > kvm_nr_mmu_pages &&
1605                         !list_empty(&kvm->arch.active_mmu_pages)) {
1606                         struct kvm_mmu_page *page;
1607
1608                         page = container_of(kvm->arch.active_mmu_pages.prev,
1609                                             struct kvm_mmu_page, link);
1610                         used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
1611                                                                &invalid_list);
1612                 }
1613                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1614                 kvm_nr_mmu_pages = used_pages;
1615                 kvm->arch.n_free_mmu_pages = 0;
1616         }
1617         else
1618                 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1619                                          - kvm->arch.n_alloc_mmu_pages;
1620
1621         kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1622 }
1623
1624 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1625 {
1626         struct kvm_mmu_page *sp;
1627         struct hlist_node *node;
1628         LIST_HEAD(invalid_list);
1629         int r;
1630
1631         pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1632         r = 0;
1633
1634         for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1635                 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1636                          sp->role.word);
1637                 r = 1;
1638                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1639         }
1640         kvm_mmu_commit_zap_page(kvm, &invalid_list);
1641         return r;
1642 }
1643
/*
 * Zap every indirect, valid shadow page that shadows @gfn and commit the
 * zap.  Does the same work as kvm_mmu_unprotect_page() but reports nothing
 * back to the caller.
 */
1644 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1645 {
1646         struct kvm_mmu_page *sp;
1647         struct hlist_node *node;
1648         LIST_HEAD(invalid_list);
1649
1650         for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1651                 pgprintk("%s: zap %lx %x\n",
1652                          __func__, gfn, sp->role.word);
1653                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1654         }
1655         kvm_mmu_commit_zap_page(kvm, &invalid_list);
1656 }
1657
/*
 * Record in the shadow page containing @pte that it maps memory from the
 * memslot @gfn belongs to, by setting that slot's bit in sp->slot_bitmap.
 */
1658 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1659 {
1660         int slot = memslot_id(kvm, gfn);
1661         struct kvm_mmu_page *sp = page_header(__pa(pte));
1662
1663         __set_bit(slot, sp->slot_bitmap);
1664 }
1665
1666 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1667 {
1668         int i;
1669         u64 *pt = sp->spt;
1670
1671         if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1672                 return;
1673
1674         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1675                 if (pt[i] == shadow_notrap_nonpresent_pte)
1676                         __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1677         }
1678 }
1679
1680 /*
1681  * The function is based on mtrr_type_lookup() in
1682  * arch/x86/kernel/cpu/mtrr/generic.c
1683  */
1684 static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1685                          u64 start, u64 end)
1686 {
1687         int i;
1688         u64 base, mask;
1689         u8 prev_match, curr_match;
1690         int num_var_ranges = KVM_NR_VAR_MTRR;
1691
1692         if (!mtrr_state->enabled)
1693                 return 0xFF;
1694
1695         /* Make end inclusive end, instead of exclusive */
1696         end--;
1697
1698         /* Look in fixed ranges. Just return the type as per start */
1699         if (mtrr_state->have_fixed && (start < 0x100000)) {
1700                 int idx;
1701
1702                 if (start < 0x80000) {
1703                         idx = 0;
1704                         idx += (start >> 16);
1705                         return mtrr_state->fixed_ranges[idx];
1706                 } else if (start < 0xC0000) {
1707                         idx = 1 * 8;
1708                         idx += ((start - 0x80000) >> 14);
1709                         return mtrr_state->fixed_ranges[idx];
1710                 } else if (start < 0x1000000) {
1711                         idx = 3 * 8;
1712                         idx += ((start - 0xC0000) >> 12);
1713                         return mtrr_state->fixed_ranges[idx];
1714                 }
1715         }
1716
1717         /*
1718          * Look in variable ranges
1719          * Look of multiple ranges matching this address and pick type
1720          * as per MTRR precedence
1721          */
1722         if (!(mtrr_state->enabled & 2))
1723                 return mtrr_state->def_type;
1724
1725         prev_match = 0xFF;
1726         for (i = 0; i < num_var_ranges; ++i) {
1727                 unsigned short start_state, end_state;
1728
1729                 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1730                         continue;
1731
1732                 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1733                        (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1734                 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1735                        (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1736
1737                 start_state = ((start & mask) == (base & mask));
1738                 end_state = ((end & mask) == (base & mask));
1739                 if (start_state != end_state)
1740                         return 0xFE;
1741
1742                 if ((start & mask) != (base & mask))
1743                         continue;
1744
1745                 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1746                 if (prev_match == 0xFF) {
1747                         prev_match = curr_match;
1748                         continue;
1749                 }
1750
1751                 if (prev_match == MTRR_TYPE_UNCACHABLE ||
1752                     curr_match == MTRR_TYPE_UNCACHABLE)
1753                         return MTRR_TYPE_UNCACHABLE;
1754
1755                 if ((prev_match == MTRR_TYPE_WRBACK &&
1756                      curr_match == MTRR_TYPE_WRTHROUGH) ||
1757                     (prev_match == MTRR_TYPE_WRTHROUGH &&
1758                      curr_match == MTRR_TYPE_WRBACK)) {
1759                         prev_match = MTRR_TYPE_WRTHROUGH;
1760                         curr_match = MTRR_TYPE_WRTHROUGH;
1761                 }
1762
1763                 if (prev_match != curr_match)
1764                         return MTRR_TYPE_UNCACHABLE;
1765         }
1766
1767         if (prev_match != 0xFF)
1768                 return prev_match;
1769
1770         return mtrr_state->def_type;
1771 }
1772
1773 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1774 {
1775         u8 mtrr;
1776
1777         mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1778                              (gfn << PAGE_SHIFT) + PAGE_SIZE);
1779         if (mtrr == 0xfe || mtrr == 0xff)
1780                 mtrr = MTRR_TYPE_WRBACK;
1781         return mtrr;
1782 }
1783 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1784
1785 static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1786 {
1787         trace_kvm_mmu_unsync_page(sp);
1788         ++vcpu->kvm->stat.mmu_unsync;
1789         sp->unsync = 1;
1790
1791         kvm_mmu_mark_parents_unsync(sp);
1792         mmu_convert_notrap(sp);
1793 }
1794
1795 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1796 {
1797         struct kvm_mmu_page *s;
1798         struct hlist_node *node;
1799
1800         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1801                 if (s->unsync)
1802                         continue;
1803                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1804                 __kvm_unsync_page(vcpu, s);
1805         }
1806 }
1807
1808 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1809                                   bool can_unsync)
1810 {
1811         struct kvm_mmu_page *s;
1812         struct hlist_node *node;
1813         bool need_unsync = false;
1814
1815         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1816                 if (!can_unsync)
1817                         return 1;
1818
1819                 if (s->role.level != PT_PAGE_TABLE_LEVEL)
1820                         return 1;
1821
1822                 if (!need_unsync && !s->unsync) {
1823                         if (!oos_shadow)
1824                                 return 1;
1825                         need_unsync = true;
1826                 }
1827         }
1828         if (need_unsync)
1829                 kvm_unsync_pages(vcpu, gfn);
1830         return 0;
1831 }
1832
1833 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1834                     unsigned pte_access, int user_fault,
1835                     int write_fault, int dirty, int level,
1836                     gfn_t gfn, pfn_t pfn, bool speculative,
1837                     bool can_unsync, bool reset_host_protection)
1838 {
1839         u64 spte;
1840         int ret = 0;
1841
1842         /*
1843          * We don't set the accessed bit, since we sometimes want to see
1844          * whether the guest actually used the pte (in order to detect
1845          * demand paging).
1846          */
1847         spte = shadow_base_present_pte | shadow_dirty_mask;
1848         if (!speculative)
1849                 spte |= shadow_accessed_mask;
1850         if (!dirty)
1851                 pte_access &= ~ACC_WRITE_MASK;
1852         if (pte_access & ACC_EXEC_MASK)
1853                 spte |= shadow_x_mask;
1854         else
1855                 spte |= shadow_nx_mask;
1856         if (pte_access & ACC_USER_MASK)
1857                 spte |= shadow_user_mask;
1858         if (level > PT_PAGE_TABLE_LEVEL)
1859                 spte |= PT_PAGE_SIZE_MASK;
1860         if (tdp_enabled)
1861                 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1862                         kvm_is_mmio_pfn(pfn));
1863
1864         if (reset_host_protection)
1865                 spte |= SPTE_HOST_WRITEABLE;
1866
1867         spte |= (u64)pfn << PAGE_SHIFT;
1868
1869         if ((pte_access & ACC_WRITE_MASK)
1870             || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
1871                 && !user_fault)) {
1872
1873                 if (level > PT_PAGE_TABLE_LEVEL &&
1874                     has_wrprotected_page(vcpu->kvm, gfn, level)) {
1875                         ret = 1;
1876                         drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1877                         goto done;
1878                 }
1879
1880                 spte |= PT_WRITABLE_MASK;
1881
1882                 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
1883                         spte &= ~PT_USER_MASK;
1884
1885                 /*
1886                  * Optimization: for pte sync, if spte was writable the hash
1887                  * lookup is unnecessary (and expensive). Write protection
1888                  * is responsibility of mmu_get_page / kvm_sync_page.
1889                  * Same reasoning can be applied to dirty page accounting.
1890                  */
1891                 if (!can_unsync && is_writable_pte(*sptep))
1892                         goto set_pte;
1893
1894                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1895                         pgprintk("%s: found shadow page for %lx, marking ro\n",
1896                                  __func__, gfn);
1897                         ret = 1;
1898                         pte_access &= ~ACC_WRITE_MASK;
1899                         if (is_writable_pte(spte))
1900                                 spte &= ~PT_WRITABLE_MASK;
1901                 }
1902         }
1903
1904         if (pte_access & ACC_WRITE_MASK)
1905                 mark_page_dirty(vcpu->kvm, gfn);
1906
1907 set_pte:
1908         __set_spte(sptep, spte);
1909 done:
1910         return ret;
1911 }
1912
1913 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1914                          unsigned pt_access, unsigned pte_access,
1915                          int user_fault, int write_fault, int dirty,
1916                          int *ptwrite, int level, gfn_t gfn,
1917                          pfn_t pfn, bool speculative,
1918                          bool reset_host_protection)
1919 {
1920         int was_rmapped = 0;
1921         int was_writable = is_writable_pte(*sptep);
1922         int rmap_count;
1923
1924         pgprintk("%s: spte %llx access %x write_fault %d"
1925                  " user_fault %d gfn %lx\n",
1926                  __func__, *sptep, pt_access,
1927                  write_fault, user_fault, gfn);
1928
1929         if (is_rmap_spte(*sptep)) {
1930                 /*
1931                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1932                  * the parent of the now unreachable PTE.
1933                  */
1934                 if (level > PT_PAGE_TABLE_LEVEL &&
1935                     !is_large_pte(*sptep)) {
1936                         struct kvm_mmu_page *child;
1937                         u64 pte = *sptep;
1938
1939                         child = page_header(pte & PT64_BASE_ADDR_MASK);
1940                         mmu_page_remove_parent_pte(child, sptep);
1941                         __set_spte(sptep, shadow_trap_nonpresent_pte);
1942                         kvm_flush_remote_tlbs(vcpu->kvm);
1943                 } else if (pfn != spte_to_pfn(*sptep)) {
1944                         pgprintk("hfn old %lx new %lx\n",
1945                                  spte_to_pfn(*sptep), pfn);
1946                         drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1947                         kvm_flush_remote_tlbs(vcpu->kvm);
1948                 } else
1949                         was_rmapped = 1;
1950         }
1951
1952         if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
1953                       dirty, level, gfn, pfn, speculative, true,
1954                       reset_host_protection)) {
1955                 if (write_fault)
1956                         *ptwrite = 1;
1957                 kvm_mmu_flush_tlb(vcpu);
1958         }
1959
1960         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
1961         pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1962                  is_large_pte(*sptep)? "2MB" : "4kB",
1963                  *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
1964                  *sptep, sptep);
1965         if (!was_rmapped && is_large_pte(*sptep))
1966                 ++vcpu->kvm->stat.lpages;
1967
1968         page_header_update_slot(vcpu->kvm, sptep, gfn);
1969         if (!was_rmapped) {
1970                 rmap_count = rmap_add(vcpu, sptep, gfn);
1971                 kvm_release_pfn_clean(pfn);
1972                 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1973                         rmap_recycle(vcpu, sptep, gfn);
1974         } else {
1975                 if (was_writable)
1976                         kvm_release_pfn_dirty(pfn);
1977                 else
1978                         kvm_release_pfn_clean(pfn);
1979         }
1980         if (speculative) {
1981                 vcpu->arch.last_pte_updated = sptep;
1982                 vcpu->arch.last_pte_gfn = gfn;
1983         }
1984 }
1985
/* new_cr3 callback installed by the nonpaging and TDP contexts. */
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
        /* Intentionally empty. */
}
1989
1990 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1991                         int level, gfn_t gfn, pfn_t pfn)
1992 {
1993         struct kvm_shadow_walk_iterator iterator;
1994         struct kvm_mmu_page *sp;
1995         int pt_write = 0;
1996         gfn_t pseudo_gfn;
1997
1998         for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
1999                 if (iterator.level == level) {
2000                         mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
2001                                      0, write, 1, &pt_write,
2002                                      level, gfn, pfn, false, true);
2003                         ++vcpu->stat.pf_fixed;
2004                         break;
2005                 }
2006
2007                 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
2008                         u64 base_addr = iterator.addr;
2009
2010                         base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2011                         pseudo_gfn = base_addr >> PAGE_SHIFT;
2012                         sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2013                                               iterator.level - 1,
2014                                               1, ACC_ALL, iterator.sptep);
2015                         if (!sp) {
2016                                 pgprintk("nonpaging_map: ENOMEM\n");
2017                                 kvm_release_pfn_clean(pfn);
2018                                 return -ENOMEM;
2019                         }
2020
2021                         __set_spte(iterator.sptep,
2022                                    __pa(sp->spt)
2023                                    | PT_PRESENT_MASK | PT_WRITABLE_MASK
2024                                    | shadow_user_mask | shadow_x_mask);
2025                 }
2026         }
2027         return pt_write;
2028 }
2029
2030 static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
2031 {
2032         char buf[1];
2033         void __user *hva;
2034         int r;
2035
2036         /* Touch the page, so send SIGBUS */
2037         hva = (void __user *)gfn_to_hva(kvm, gfn);
2038         r = copy_from_user(buf, hva, 1);
2039 }
2040
2041 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2042 {
2043         kvm_release_pfn_clean(pfn);
2044         if (is_hwpoison_pfn(pfn)) {
2045                 kvm_send_hwpoison_signal(kvm, gfn);
2046                 return 0;
2047         }
2048         return 1;
2049 }
2050
2051 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2052 {
2053         int r;
2054         int level;
2055         pfn_t pfn;
2056         unsigned long mmu_seq;
2057
2058         level = mapping_level(vcpu, gfn);
2059
2060         /*
2061          * This path builds a PAE pagetable - so we can map 2mb pages at
2062          * maximum. Therefore check if the level is larger than that.
2063          */
2064         if (level > PT_DIRECTORY_LEVEL)
2065                 level = PT_DIRECTORY_LEVEL;
2066
2067         gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2068
2069         mmu_seq = vcpu->kvm->mmu_notifier_seq;
2070         smp_rmb();
2071         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2072
2073         /* mmio */
2074         if (is_error_pfn(pfn))
2075                 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2076
2077         spin_lock(&vcpu->kvm->mmu_lock);
2078         if (mmu_notifier_retry(vcpu, mmu_seq))
2079                 goto out_unlock;
2080         kvm_mmu_free_some_pages(vcpu);
2081         r = __direct_map(vcpu, v, write, level, gfn, pfn);
2082         spin_unlock(&vcpu->kvm->mmu_lock);
2083
2084
2085         return r;
2086
2087 out_unlock:
2088         spin_unlock(&vcpu->kvm->mmu_lock);
2089         kvm_release_pfn_clean(pfn);
2090         return 0;
2091 }
2092
2093
2094 static void mmu_free_roots(struct kvm_vcpu *vcpu)
2095 {
2096         int i;
2097         struct kvm_mmu_page *sp;
2098         LIST_HEAD(invalid_list);
2099
2100         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2101                 return;
2102         spin_lock(&vcpu->kvm->mmu_lock);
2103         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2104                 hpa_t root = vcpu->arch.mmu.root_hpa;
2105
2106                 sp = page_header(root);
2107                 --sp->root_count;
2108                 if (!sp->root_count && sp->role.invalid) {
2109                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2110                         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2111                 }
2112                 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2113                 spin_unlock(&vcpu->kvm->mmu_lock);
2114                 return;
2115         }
2116         for (i = 0; i < 4; ++i) {
2117                 hpa_t root = vcpu->arch.mmu.pae_root[i];
2118
2119                 if (root) {
2120                         root &= PT64_BASE_ADDR_MASK;
2121                         sp = page_header(root);
2122                         --sp->root_count;
2123                         if (!sp->root_count && sp->role.invalid)
2124                                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2125                                                          &invalid_list);
2126                 }
2127                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2128         }
2129         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2130         spin_unlock(&vcpu->kvm->mmu_lock);
2131         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2132 }
2133
2134 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2135 {
2136         int ret = 0;
2137
2138         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2139                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2140                 ret = 1;
2141         }
2142
2143         return ret;
2144 }
2145
2146 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2147 {
2148         int i;
2149         gfn_t root_gfn;
2150         struct kvm_mmu_page *sp;
2151         int direct = 0;
2152         u64 pdptr;
2153
2154         root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2155
2156         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2157                 hpa_t root = vcpu->arch.mmu.root_hpa;
2158
2159                 ASSERT(!VALID_PAGE(root));
2160                 if (mmu_check_root(vcpu, root_gfn))
2161                         return 1;
2162                 if (tdp_enabled) {
2163                         direct = 1;
2164                         root_gfn = 0;
2165                 }
2166                 spin_lock(&vcpu->kvm->mmu_lock);
2167                 kvm_mmu_free_some_pages(vcpu);
2168                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2169                                       PT64_ROOT_LEVEL, direct,
2170                                       ACC_ALL, NULL);
2171                 root = __pa(sp->spt);
2172                 ++sp->root_count;
2173                 spin_unlock(&vcpu->kvm->mmu_lock);
2174                 vcpu->arch.mmu.root_hpa = root;
2175                 return 0;
2176         }
2177         direct = !is_paging(vcpu);
2178         for (i = 0; i < 4; ++i) {
2179                 hpa_t root = vcpu->arch.mmu.pae_root[i];
2180
2181                 ASSERT(!VALID_PAGE(root));
2182                 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2183                         pdptr = kvm_pdptr_read(vcpu, i);
2184                         if (!is_present_gpte(pdptr)) {
2185                                 vcpu->arch.mmu.pae_root[i] = 0;
2186                                 continue;
2187                         }
2188                         root_gfn = pdptr >> PAGE_SHIFT;
2189                 } else if (vcpu->arch.mmu.root_level == 0)
2190                         root_gfn = 0;
2191                 if (mmu_check_root(vcpu, root_gfn))
2192                         return 1;
2193                 if (tdp_enabled) {
2194                         direct = 1;
2195                         root_gfn = i << 30;
2196                 }
2197                 spin_lock(&vcpu->kvm->mmu_lock);
2198                 kvm_mmu_free_some_pages(vcpu);
2199                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2200                                       PT32_ROOT_LEVEL, direct,
2201                                       ACC_ALL, NULL);
2202                 root = __pa(sp->spt);
2203                 ++sp->root_count;
2204                 spin_unlock(&vcpu->kvm->mmu_lock);
2205
2206                 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2207         }
2208         vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2209         return 0;
2210 }
2211
2212 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2213 {
2214         int i;
2215         struct kvm_mmu_page *sp;
2216
2217         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2218                 return;
2219         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2220                 hpa_t root = vcpu->arch.mmu.root_hpa;
2221                 sp = page_header(root);
2222                 mmu_sync_children(vcpu, sp);
2223                 return;
2224         }
2225         for (i = 0; i < 4; ++i) {
2226                 hpa_t root = vcpu->arch.mmu.pae_root[i];
2227
2228                 if (root && VALID_PAGE(root)) {
2229                         root &= PT64_BASE_ADDR_MASK;
2230                         sp = page_header(root);
2231                         mmu_sync_children(vcpu, sp);
2232                 }
2233         }
2234 }
2235
2236 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2237 {
2238         spin_lock(&vcpu->kvm->mmu_lock);
2239         mmu_sync_roots(vcpu);
2240         spin_unlock(&vcpu->kvm->mmu_lock);
2241 }
2242
2243 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2244                                   u32 access, u32 *error)
2245 {
2246         if (error)
2247                 *error = 0;
2248         return vaddr;
2249 }
2250
2251 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2252                                 u32 error_code)
2253 {
2254         gfn_t gfn;
2255         int r;
2256
2257         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2258         r = mmu_topup_memory_caches(vcpu);
2259         if (r)
2260                 return r;
2261
2262         ASSERT(vcpu);
2263         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2264
2265         gfn = gva >> PAGE_SHIFT;
2266
2267         return nonpaging_map(vcpu, gva & PAGE_MASK,
2268                              error_code & PFERR_WRITE_MASK, gfn);
2269 }
2270
2271 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2272                                 u32 error_code)
2273 {
2274         pfn_t pfn;
2275         int r;
2276         int level;
2277         gfn_t gfn = gpa >> PAGE_SHIFT;
2278         unsigned long mmu_seq;
2279
2280         ASSERT(vcpu);
2281         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2282
2283         r = mmu_topup_memory_caches(vcpu);
2284         if (r)
2285                 return r;
2286
2287         level = mapping_level(vcpu, gfn);
2288
2289         gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2290
2291         mmu_seq = vcpu->kvm->mmu_notifier_seq;
2292         smp_rmb();
2293         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2294         if (is_error_pfn(pfn))
2295                 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2296         spin_lock(&vcpu->kvm->mmu_lock);
2297         if (mmu_notifier_retry(vcpu, mmu_seq))
2298                 goto out_unlock;
2299         kvm_mmu_free_some_pages(vcpu);
2300         r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
2301                          level, gfn, pfn);
2302         spin_unlock(&vcpu->kvm->mmu_lock);
2303
2304         return r;
2305
2306 out_unlock:
2307         spin_unlock(&vcpu->kvm->mmu_lock);
2308         kvm_release_pfn_clean(pfn);
2309         return 0;
2310 }
2311
/* free callback for the nonpaging and TDP contexts: release the roots. */
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
        mmu_free_roots(vcpu);
}
2316
2317 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2318 {
2319         struct kvm_mmu *context = &vcpu->arch.mmu;
2320
2321         context->new_cr3 = nonpaging_new_cr3;
2322         context->page_fault = nonpaging_page_fault;
2323         context->gva_to_gpa = nonpaging_gva_to_gpa;
2324         context->free = nonpaging_free;
2325         context->prefetch_page = nonpaging_prefetch_page;
2326         context->sync_page = nonpaging_sync_page;
2327         context->invlpg = nonpaging_invlpg;
2328         context->root_level = 0;
2329         context->shadow_root_level = PT32E_ROOT_LEVEL;
2330         context->root_hpa = INVALID_PAGE;
2331         return 0;
2332 }
2333
2334 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2335 {
2336         ++vcpu->stat.tlb_flush;
2337         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2338 }
2339
2340 static void paging_new_cr3(struct kvm_vcpu *vcpu)
2341 {
2342         pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
2343         mmu_free_roots(vcpu);
2344 }
2345
2346 static void inject_page_fault(struct kvm_vcpu *vcpu,
2347                               u64 addr,
2348                               u32 err_code)
2349 {
2350         kvm_inject_page_fault(vcpu, addr, err_code);
2351 }
2352
/*
 * free callback for the paging contexts.  Freeing works exactly as in
 * the nonpaging case: the shadow roots are released.
 */
static void paging_free(struct kvm_vcpu *vcpu)
{
        mmu_free_roots(vcpu);
}
2357
2358 static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2359 {
2360         int bit7;
2361
2362         bit7 = (gpte >> 7) & 1;
2363         return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
2364 }
2365
2366 #define PTTYPE 64
2367 #include "paging_tmpl.h"
2368 #undef PTTYPE
2369
2370 #define PTTYPE 32
2371 #include "paging_tmpl.h"
2372 #undef PTTYPE
2373
2374 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2375 {
2376         struct kvm_mmu *context = &vcpu->arch.mmu;
2377         int maxphyaddr = cpuid_maxphyaddr(vcpu);
2378         u64 exb_bit_rsvd = 0;
2379
2380         if (!is_nx(vcpu))
2381                 exb_bit_rsvd = rsvd_bits(63, 63);
2382         switch (level) {
2383         case PT32_ROOT_LEVEL:
2384                 /* no rsvd bits for 2 level 4K page table entries */
2385                 context->rsvd_bits_mask[0][1] = 0;
2386                 context->rsvd_bits_mask[0][0] = 0;
2387                 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2388
2389                 if (!is_pse(vcpu)) {
2390                         context->rsvd_bits_mask[1][1] = 0;
2391                         break;
2392                 }
2393
2394                 if (is_cpuid_PSE36())
2395                         /* 36bits PSE 4MB page */
2396                         context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2397                 else
2398                         /* 32 bits PSE 4MB page */
2399                         context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2400                 break;
2401         case PT32E_ROOT_LEVEL:
2402                 context->rsvd_bits_mask[0][2] =
2403                         rsvd_bits(maxphyaddr, 63) |
2404                         rsvd_bits(7, 8) | rsvd_bits(1, 2);      /* PDPTE */
2405                 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2406                         rsvd_bits(maxphyaddr, 62);      /* PDE */
2407                 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2408                         rsvd_bits(maxphyaddr, 62);      /* PTE */
2409                 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2410                         rsvd_bits(maxphyaddr, 62) |
2411                         rsvd_bits(13, 20);              /* large page */
2412                 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2413                 break;
2414         case PT64_ROOT_LEVEL:
2415                 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2416                         rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2417                 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2418                         rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2419                 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2420                         rsvd_bits(maxphyaddr, 51);
2421                 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2422                         rsvd_bits(maxphyaddr, 51);
2423                 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2424                 context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2425                         rsvd_bits(maxphyaddr, 51) |
2426                         rsvd_bits(13, 29);
2427                 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2428                         rsvd_bits(maxphyaddr, 51) |
2429                         rsvd_bits(13, 20);              /* large page */
2430                 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2431                 break;
2432         }
2433 }
2434
2435 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2436 {
2437         struct kvm_mmu *context = &vcpu->arch.mmu;
2438
2439         ASSERT(is_pae(vcpu));
2440         context->new_cr3 = paging_new_cr3;
2441         context->page_fault = paging64_page_fault;
2442         context->gva_to_gpa = paging64_gva_to_gpa;
2443         context->prefetch_page = paging64_prefetch_page;
2444         context->sync_page = paging64_sync_page;
2445         context->invlpg = paging64_invlpg;
2446         context->free = paging_free;
2447         context->root_level = level;
2448         context->shadow_root_level = level;
2449         context->root_hpa = INVALID_PAGE;
2450         return 0;
2451 }
2452
2453 static int paging64_init_context(struct kvm_vcpu *vcpu)
2454 {
2455         reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2456         return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2457 }
2458
2459 static int paging32_init_context(struct kvm_vcpu *vcpu)
2460 {
2461         struct kvm_mmu *context = &vcpu->arch.mmu;
2462
2463         reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2464         context->new_cr3 = paging_new_cr3;
2465         context->page_fault = paging32_page_fault;
2466         context->gva_to_gpa = paging32_gva_to_gpa;
2467         context->free = paging_free;
2468         context->prefetch_page = paging32_prefetch_page;
2469         context->sync_page = paging32_sync_page;
2470         context->invlpg = paging32_invlpg;
2471         context->root_level = PT32_ROOT_LEVEL;
2472         context->shadow_root_level = PT32E_ROOT_LEVEL;
2473         context->root_hpa = INVALID_PAGE;
2474         return 0;
2475 }
2476
2477 static int paging32E_init_context(struct kvm_vcpu *vcpu)
2478 {
2479         reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2480         return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2481 }
2482
2483 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2484 {
2485         struct kvm_mmu *context = &vcpu->arch.mmu;
2486
2487         context->new_cr3 = nonpaging_new_cr3;
2488         context->page_fault = tdp_page_fault;
2489         context->free = nonpaging_free;
2490         context->prefetch_page = nonpaging_prefetch_page;
2491         context->sync_page = nonpaging_sync_page;
2492         context->invlpg = nonpaging_invlpg;
2493         context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2494         context->root_hpa = INVALID_PAGE;
2495
2496         if (!is_paging(vcpu)) {
2497                 context->gva_to_gpa = nonpaging_gva_to_gpa;
2498                 context->root_level = 0;
2499         } else if (is_long_mode(vcpu)) {
2500                 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2501                 context->gva_to_gpa = paging64_gva_to_gpa;
2502                 context->root_level = PT64_ROOT_LEVEL;
2503         } else if (is_pae(vcpu)) {
2504                 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2505                 context->gva_to_gpa = paging64_gva_to_gpa;
2506                 context->root_level = PT32E_ROOT_LEVEL;
2507         } else {
2508                 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2509                 context->gva_to_gpa = paging32_gva_to_gpa;
2510                 context->root_level = PT32_ROOT_LEVEL;
2511         }
2512
2513         return 0;
2514 }
2515
2516 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2517 {
2518         int r;
2519
2520         ASSERT(vcpu);
2521         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2522
2523         if (!is_paging(vcpu))
2524                 r = nonpaging_init_context(vcpu);
2525         else if (is_long_mode(vcpu))
2526                 r = paging64_init_context(vcpu);
2527         else if (is_pae(vcpu))
2528                 r = paging32E_init_context(vcpu);
2529         else
2530                 r = paging32_init_context(vcpu);
2531
2532         vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2533         vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2534
2535         return r;
2536 }
2537
2538 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2539 {
2540         vcpu->arch.update_pte.pfn = bad_pfn;
2541
2542         if (tdp_enabled)
2543                 return init_kvm_tdp_mmu(vcpu);
2544         else
2545                 return init_kvm_softmmu(vcpu);
2546 }
2547
2548 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2549 {
2550         ASSERT(vcpu);
2551         if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
2552                 /* mmu.free() should set root_hpa = INVALID_PAGE */
2553                 vcpu->arch.mmu.free(vcpu);
2554 }
2555
/*
 * Destroy the current MMU context and build a fresh one.
 *
 * Returns the result of init_kvm_mmu().
 */
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
        int r;

        destroy_kvm_mmu(vcpu);
        r = init_kvm_mmu(vcpu);

        return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
2562
2563 int kvm_mmu_load(struct kvm_vcpu *vcpu)
2564 {
2565         int r;
2566
2567         r = mmu_topup_memory_caches(vcpu);
2568         if (r)
2569                 goto out;
2570         r = mmu_alloc_roots(vcpu);
2571         spin_lock(&vcpu->kvm->mmu_lock);
2572         mmu_sync_roots(vcpu);
2573         spin_unlock(&vcpu->kvm->mmu_lock);
2574         if (r)
2575                 goto out;
2576         /* set_cr3() should ensure TLB has been flushed */
2577         kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2578 out:
2579         return r;
2580 }
2581 EXPORT_SYMBOL_GPL(kvm_mmu_load);
2582
/* Counterpart of kvm_mmu_load(): release the vcpu's MMU roots. */
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
        mmu_free_roots(vcpu);
}
2587
2588 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2589                                   struct kvm_mmu_page *sp,
2590                                   u64 *spte)
2591 {
2592         u64 pte;
2593         struct kvm_mmu_page *child;
2594
2595         pte = *spte;
2596         if (is_shadow_present_pte(pte)) {
2597                 if (is_last_spte(pte, sp->role.level))
2598                         drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
2599                 else {
2600                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2601                         mmu_page_remove_parent_pte(child, spte);
2602                 }
2603         }
2604         __set_spte(spte, shadow_trap_nonpresent_pte);
2605         if (is_large_pte(pte))
2606                 --vcpu->kvm->stat.lpages;
2607 }
2608
2609 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2610                                   struct kvm_mmu_page *sp,
2611                                   u64 *spte,
2612                                   const void *new)
2613 {
2614         if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2615                 ++vcpu->kvm->stat.mmu_pde_zapped;
2616                 return;
2617         }
2618
2619         ++vcpu->kvm->stat.mmu_pte_updated;
2620         if (!sp->role.cr4_pae)
2621                 paging32_update_pte(vcpu, sp, spte, new);
2622         else
2623                 paging64_update_pte(vcpu, sp, spte, new);
2624 }
2625
2626 static bool need_remote_flush(u64 old, u64 new)
2627 {
2628         if (!is_shadow_present_pte(old))
2629                 return false;
2630         if (!is_shadow_present_pte(new))
2631                 return true;
2632         if ((old ^ new) & PT64_BASE_ADDR_MASK)
2633                 return true;
2634         old ^= PT64_NX_MASK;
2635         new ^= PT64_NX_MASK;
2636         return (old & ~new & PT64_PERM_MASK) != 0;
2637 }
2638
2639 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
2640                                     bool remote_flush, bool local_flush)
2641 {
2642         if (zap_page)
2643                 return;
2644
2645         if (remote_flush)
2646                 kvm_flush_remote_tlbs(vcpu->kvm);
2647         else if (local_flush)
2648                 kvm_mmu_flush_tlb(vcpu);
2649 }
2650
2651 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2652 {
2653         u64 *spte = vcpu->arch.last_pte_updated;
2654
2655         return !!(spte && (*spte & shadow_accessed_mask));
2656 }
2657
2658 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2659                                           u64 gpte)
2660 {
2661         gfn_t gfn;
2662         pfn_t pfn;
2663
2664         if (!is_present_gpte(gpte))
2665                 return;
2666         gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2667
2668         vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2669         smp_rmb();
2670         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2671
2672         if (is_error_pfn(pfn)) {
2673                 kvm_release_pfn_clean(pfn);
2674                 return;
2675         }
2676         vcpu->arch.update_pte.gfn = gfn;
2677         vcpu->arch.update_pte.pfn = pfn;
2678 }
2679
2680 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2681 {
2682         u64 *spte = vcpu->arch.last_pte_updated;
2683
2684         if (spte
2685             && vcpu->arch.last_pte_gfn == gfn
2686             && shadow_accessed_mask
2687             && !(*spte & shadow_accessed_mask)
2688             && is_shadow_present_pte(*spte))
2689                 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
2690 }
2691
2692 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2693                        const u8 *new, int bytes,
2694                        bool guest_initiated)
2695 {
2696         gfn_t gfn = gpa >> PAGE_SHIFT;
2697         struct kvm_mmu_page *sp;
2698         struct hlist_node *node;
2699         LIST_HEAD(invalid_list);
2700         u64 entry, gentry;
2701         u64 *spte;
2702         unsigned offset = offset_in_page(gpa);
2703         unsigned pte_size;
2704         unsigned page_offset;
2705         unsigned misaligned;
2706         unsigned quadrant;
2707         int level;
2708         int flooded = 0;
2709         int npte;
2710         int r;
2711         int invlpg_counter;
2712         bool remote_flush, local_flush, zap_page;
2713
2714         zap_page = remote_flush = local_flush = false;
2715
2716         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2717
2718         invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2719
2720         /*
2721          * Assume that the pte write on a page table of the same type
2722          * as the current vcpu paging mode.  This is nearly always true
2723          * (might be false while changing modes).  Note it is verified later
2724          * by update_pte().
2725          */
2726         if ((is_pae(vcpu) && bytes == 4) || !new) {
2727                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2728                 if (is_pae(vcpu)) {
2729                         gpa &= ~(gpa_t)7;
2730                         bytes = 8;
2731                 }
2732                 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2733                 if (r)
2734                         gentry = 0;
2735                 new = (const u8 *)&gentry;
2736         }
2737
2738         switch (bytes) {
2739         case 4:
2740                 gentry = *(const u32 *)new;
2741                 break;
2742         case 8:
2743                 gentry = *(const u64 *)new;
2744                 break;
2745         default:
2746                 gentry = 0;
2747                 break;
2748         }
2749
2750         mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2751         spin_lock(&vcpu->kvm->mmu_lock);
2752         if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2753                 gentry = 0;
2754         kvm_mmu_access_page(vcpu, gfn);
2755         kvm_mmu_free_some_pages(vcpu);
2756         ++vcpu->kvm->stat.mmu_pte_write;
2757         kvm_mmu_audit(vcpu, "pre pte write");
2758         if (guest_initiated) {
2759                 if (gfn == vcpu->arch.last_pt_write_gfn
2760                     && !last_updated_pte_accessed(vcpu)) {
2761                         ++vcpu->arch.last_pt_write_count;
2762                         if (vcpu->arch.last_pt_write_count >= 3)
2763                                 flooded = 1;
2764                 } else {
2765                         vcpu->arch.last_pt_write_gfn = gfn;
2766                         vcpu->arch.last_pt_write_count = 1;
2767                         vcpu->arch.last_pte_updated = NULL;
2768                 }
2769         }
2770
2771         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
2772                 pte_size = sp->role.cr4_pae ? 8 : 4;
2773                 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2774                 misaligned |= bytes < 4;
2775                 if (misaligned || flooded) {
2776                         /*
2777                          * Misaligned accesses are too much trouble to fix
2778                          * up; also, they usually indicate a page is not used
2779                          * as a page table.
2780                          *
2781                          * If we're seeing too many writes to a page,
2782                          * it may no longer be a page table, or we may be
2783                          * forking, in which case it is better to unmap the
2784                          * page.
2785                          */
2786                         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2787                                  gpa, bytes, sp->role.word);
2788                         zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2789                                                      &invalid_list);
2790                         ++vcpu->kvm->stat.mmu_flooded;
2791                         continue;
2792                 }
2793                 page_offset = offset;
2794                 level = sp->role.level;
2795                 npte = 1;
2796                 if (!sp->role.cr4_pae) {
2797                         page_offset <<= 1;      /* 32->64 */
2798                         /*
2799                          * A 32-bit pde maps 4MB while the shadow pdes map
2800                          * only 2MB.  So we need to double the offset again
2801                          * and zap two pdes instead of one.
2802                          */
2803                         if (level == PT32_ROOT_LEVEL) {
2804                                 page_offset &= ~7; /* kill rounding error */
2805                                 page_offset <<= 1;
2806                                 npte = 2;
2807                         }
2808                         quadrant = page_offset >> PAGE_SHIFT;
2809                         page_offset &= ~PAGE_MASK;
2810                         if (quadrant != sp->role.quadrant)
2811                                 continue;
2812                 }
2813                 local_flush = true;
2814                 spte = &sp->spt[page_offset / sizeof(*spte)];
2815                 while (npte--) {
2816                         entry = *spte;
2817                         mmu_pte_write_zap_pte(vcpu, sp, spte);
2818                         if (gentry)
2819                                 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2820                         if (!remote_flush && need_remote_flush(entry, *spte))
2821                                 remote_flush = true;
2822                         ++spte;
2823                 }
2824         }
2825         mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2826         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2827         kvm_mmu_audit(vcpu, "post pte write");
2828         spin_unlock(&vcpu->kvm->mmu_lock);
2829         if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2830                 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2831                 vcpu->arch.update_pte.pfn = bad_pfn;
2832         }
2833 }
2834
2835 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2836 {
2837         gpa_t gpa;
2838         int r;
2839
2840         if (tdp_enabled)
2841                 return 0;
2842
2843         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2844
2845         spin_lock(&vcpu->kvm->mmu_lock);
2846         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2847         spin_unlock(&vcpu->kvm->mmu_lock);
2848         return r;
2849 }
2850 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2851
2852 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2853 {
2854         int free_pages;
2855         LIST_HEAD(invalid_list);
2856
2857         free_pages = vcpu->kvm->arch.n_free_mmu_pages;
2858         while (free_pages < KVM_REFILL_PAGES &&
2859                !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2860                 struct kvm_mmu_page *sp;
2861
2862                 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2863                                   struct kvm_mmu_page, link);
2864                 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2865                                                        &invalid_list);
2866                 ++vcpu->kvm->stat.mmu_recycled;
2867         }
2868         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2869 }
2870
2871 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2872 {
2873         int r;
2874         enum emulation_result er;
2875
2876         r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
2877         if (r < 0)
2878                 goto out;
2879
2880         if (!r) {
2881                 r = 1;
2882                 goto out;
2883         }
2884
2885         r = mmu_topup_memory_caches(vcpu);
2886         if (r)
2887                 goto out;
2888
2889         er = emulate_instruction(vcpu, cr2, error_code, 0);
2890
2891         switch (er) {
2892         case EMULATE_DONE:
2893                 return 1;
2894         case EMULATE_DO_MMIO:
2895                 ++vcpu->stat.mmio_exits;
2896                 /* fall through */
2897         case EMULATE_FAIL:
2898                 return 0;
2899         default:
2900                 BUG();
2901         }
2902 out:
2903         return r;
2904 }
2905 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
2906
2907 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2908 {
2909         vcpu->arch.mmu.invlpg(vcpu, gva);
2910         kvm_mmu_flush_tlb(vcpu);
2911         ++vcpu->stat.invlpg;
2912 }
2913 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2914
2915 void kvm_enable_tdp(void)
2916 {
2917         tdp_enabled = true;
2918 }
2919 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
2920
2921 void kvm_disable_tdp(void)
2922 {
2923         tdp_enabled = false;
2924 }
2925 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2926
2927 static void free_mmu_pages(struct kvm_vcpu *vcpu)
2928 {
2929         free_page((unsigned long)vcpu->arch.mmu.pae_root);
2930 }
2931
2932 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2933 {
2934         struct page *page;
2935         int i;
2936
2937         ASSERT(vcpu);
2938
2939         /*
2940          * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2941          * Therefore we need to allocate shadow page tables in the first
2942          * 4GB of memory, which happens to fit the DMA32 zone.
2943          */
2944         page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2945         if (!page)
2946                 return -ENOMEM;
2947
2948         vcpu->arch.mmu.pae_root = page_address(page);
2949         for (i = 0; i < 4; ++i)
2950                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2951
2952         return 0;
2953 }
2954
2955 int kvm_mmu_create(struct kvm_vcpu *vcpu)
2956 {
2957         ASSERT(vcpu);
2958         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2959
2960         return alloc_mmu_pages(vcpu);
2961 }
2962
2963 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
2964 {
2965         ASSERT(vcpu);
2966         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2967
2968         return init_kvm_mmu(vcpu);
2969 }
2970
/*
 * Release everything the MMU holds for this vcpu: the current context,
 * the PAE root page and the memory caches, in that order.
 */
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);

        destroy_kvm_mmu(vcpu);

        free_mmu_pages(vcpu);

        mmu_free_memory_caches(vcpu);
}
2979
2980 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2981 {
2982         struct kvm_mmu_page *sp;
2983
2984         list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2985                 int i;
2986                 u64 *pt;
2987
2988                 if (!test_bit(slot, sp->slot_bitmap))
2989                         continue;
2990
2991                 pt = sp->spt;
2992                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2993                         /* avoid RMW */
2994                         if (is_writable_pte(pt[i]))
2995                                 pt[i] &= ~PT_WRITABLE_MASK;
2996         }
2997         kvm_flush_remote_tlbs(kvm);
2998 }
2999
3000 void kvm_mmu_zap_all(struct kvm *kvm)
3001 {
3002         struct kvm_mmu_page *sp, *node;
3003         LIST_HEAD(invalid_list);
3004
3005         spin_lock(&kvm->mmu_lock);
3006 restart:
3007         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
3008                 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
3009                         goto restart;
3010
3011         kvm_mmu_commit_zap_page(kvm, &invalid_list);
3012         spin_unlock(&kvm->mmu_lock);
3013 }
3014
3015 static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3016                                                struct list_head *invalid_list)
3017 {
3018         struct kvm_mmu_page *page;
3019
3020         page = container_of(kvm->arch.active_mmu_pages.prev,
3021                             struct kvm_mmu_page, link);
3022         return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3023 }
3024
3025 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3026 {
3027         struct kvm *kvm;
3028         struct kvm *kvm_freed = NULL;
3029         int cache_count = 0;
3030
3031         spin_lock(&kvm_lock);
3032
3033         list_for_each_entry(kvm, &vm_list, vm_list) {
3034                 int npages, idx, freed_pages;
3035                 LIST_HEAD(invalid_list);
3036
3037                 idx = srcu_read_lock(&kvm->srcu);
3038                 spin_lock(&kvm->mmu_lock);
3039                 npages = kvm->arch.n_alloc_mmu_pages -
3040                          kvm->arch.n_free_mmu_pages;
3041                 cache_count += npages;
3042                 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3043                         freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3044                                                           &invalid_list);
3045                         cache_count -= freed_pages;
3046                         kvm_freed = kvm;
3047                 }
3048                 nr_to_scan--;
3049
3050                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3051                 spin_unlock(&kvm->mmu_lock);
3052                 srcu_read_unlock(&kvm->srcu, idx);
3053         }
3054         if (kvm_freed)
3055                 list_move_tail(&kvm_freed->vm_list, &vm_list);
3056
3057         spin_unlock(&kvm_lock);
3058
3059         return cache_count;
3060 }
3061
3062 static struct shrinker mmu_shrinker = {
3063         .shrink = mmu_shrink,
3064         .seeks = DEFAULT_SEEKS * 10,
3065 };
3066
3067 static void mmu_destroy_caches(void)
3068 {
3069         if (pte_chain_cache)
3070                 kmem_cache_destroy(pte_chain_cache);
3071         if (rmap_desc_cache)
3072                 kmem_cache_destroy(rmap_desc_cache);
3073         if (mmu_page_header_cache)
3074                 kmem_cache_destroy(mmu_page_header_cache);
3075 }
3076
3077 void kvm_mmu_module_exit(void)
3078 {
3079         mmu_destroy_caches();
3080         unregister_shrinker(&mmu_shrinker);
3081 }
3082
3083 int kvm_mmu_module_init(void)
3084 {
3085         pte_chain_cache = kmem_cache_create("kvm_pte_chain",
3086                                             sizeof(struct kvm_pte_chain),
3087                                             0, 0, NULL);
3088         if (!pte_chain_cache)
3089                 goto nomem;
3090         rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3091                                             sizeof(struct kvm_rmap_desc),
3092                                             0, 0, NULL);
3093         if (!rmap_desc_cache)
3094                 goto nomem;
3095
3096         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
3097                                                   sizeof(struct kvm_mmu_page),
3098                                                   0, 0, NULL);
3099         if (!mmu_page_header_cache)
3100                 goto nomem;
3101
3102         register_shrinker(&mmu_shrinker);
3103
3104         return 0;
3105
3106 nomem:
3107         mmu_destroy_caches();
3108         return -ENOMEM;
3109 }
3110
3111 /*
3112  * Caculate mmu pages needed for kvm.
3113  */
3114 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3115 {
3116         int i;
3117         unsigned int nr_mmu_pages;
3118         unsigned int  nr_pages = 0;
3119         struct kvm_memslots *slots;
3120
3121         slots = kvm_memslots(kvm);
3122
3123         for (i = 0; i < slots->nmemslots; i++)
3124                 nr_pages += slots->memslots[i].npages;
3125
3126         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3127         nr_mmu_pages = max(nr_mmu_pages,
3128                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3129
3130         return nr_mmu_pages;
3131 }
3132
3133 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3134                                 unsigned len)
3135 {
3136         if (len > buffer->len)
3137                 return NULL;
3138         return buffer->ptr;
3139 }
3140
3141 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3142                                 unsigned len)
3143 {
3144         void *ret;
3145
3146         ret = pv_mmu_peek_buffer(buffer, len);
3147         if (!ret)
3148                 return ret;
3149         buffer->ptr += len;
3150         buffer->len -= len;
3151         buffer->processed += len;
3152         return ret;
3153 }
3154
3155 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3156                              gpa_t addr, gpa_t value)
3157 {
3158         int bytes = 8;
3159         int r;
3160
3161         if (!is_long_mode(vcpu) && !is_pae(vcpu))
3162                 bytes = 4;
3163
3164         r = mmu_topup_memory_caches(vcpu);
3165         if (r)
3166                 return r;
3167
3168         if (!emulator_write_phys(vcpu, addr, &value, bytes))
3169                 return -EFAULT;
3170
3171         return 1;
3172 }
3173
3174 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3175 {
3176         (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
3177         return 1;
3178 }
3179
3180 static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
3181 {
3182         spin_lock(&vcpu->kvm->mmu_lock);
3183         mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
3184         spin_unlock(&vcpu->kvm->mmu_lock);
3185         return 1;
3186 }
3187
3188 static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
3189                              struct kvm_pv_mmu_op_buffer *buffer)
3190 {
3191         struct kvm_mmu_op_header *header;
3192
3193         header = pv_mmu_peek_buffer(buffer, sizeof *header);
3194         if (!header)
3195                 return 0;
3196         switch (header->op) {
3197         case KVM_MMU_OP_WRITE_PTE: {
3198                 struct kvm_mmu_op_write_pte *wpte;
3199
3200                 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
3201                 if (!wpte)
3202                         return 0;
3203                 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
3204                                         wpte->pte_val);
3205         }
3206         case KVM_MMU_OP_FLUSH_TLB: {
3207                 struct kvm_mmu_op_flush_tlb *ftlb;
3208
3209                 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
3210                 if (!ftlb)
3211                         return 0;
3212                 return kvm_pv_mmu_flush_tlb(vcpu);
3213         }
3214         case KVM_MMU_OP_RELEASE_PT: {
3215                 struct kvm_mmu_op_release_pt *rpt;
3216
3217                 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
3218                 if (!rpt)
3219                         return 0;
3220                 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
3221         }
3222         default: return 0;
3223         }
3224 }
3225
3226 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
3227                   gpa_t addr, unsigned long *ret)
3228 {
3229         int r;
3230         struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
3231
3232         buffer->ptr = buffer->buf;
3233         buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
3234         buffer->processed = 0;
3235
3236         r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
3237         if (r)
3238                 goto out;
3239
3240         while (buffer->len) {
3241                 r = kvm_pv_mmu_op_one(vcpu, buffer);
3242                 if (r < 0)
3243                         goto out;
3244                 if (r == 0)
3245                         break;
3246         }
3247
3248         r = 1;
3249 out:
3250         *ret = buffer->processed;
3251         return r;
3252 }
3253
3254 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3255 {
3256         struct kvm_shadow_walk_iterator iterator;
3257         int nr_sptes = 0;
3258
3259         spin_lock(&vcpu->kvm->mmu_lock);
3260         for_each_shadow_entry(vcpu, addr, iterator) {
3261                 sptes[iterator.level-1] = *iterator.sptep;
3262                 nr_sptes++;
3263                 if (!is_shadow_present_pte(*iterator.sptep))
3264                         break;
3265         }
3266         spin_unlock(&vcpu->kvm->mmu_lock);
3267
3268         return nr_sptes;
3269 }
3270 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3271
3272 #ifdef AUDIT
3273
3274 static const char *audit_msg;
3275
3276 static gva_t canonicalize(gva_t gva)
3277 {
3278 #ifdef CONFIG_X86_64
3279         gva = (long long)(gva << 16) >> 16;
3280 #endif
3281         return gva;
3282 }
3283
3284
3285 typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3286
3287 static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3288                             inspect_spte_fn fn)
3289 {
3290         int i;
3291
3292         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3293                 u64 ent = sp->spt[i];
3294
3295                 if (is_shadow_present_pte(ent)) {
3296                         if (!is_last_spte(ent, sp->role.level)) {
3297                                 struct kvm_mmu_page *child;
3298                                 child = page_header(ent & PT64_BASE_ADDR_MASK);
3299                                 __mmu_spte_walk(kvm, child, fn);
3300                         } else
3301                                 fn(kvm, &sp->spt[i]);
3302                 }
3303         }
3304 }
3305
3306 static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3307 {
3308         int i;
3309         struct kvm_mmu_page *sp;
3310
3311         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3312                 return;
3313         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3314                 hpa_t root = vcpu->arch.mmu.root_hpa;
3315                 sp = page_header(root);
3316                 __mmu_spte_walk(vcpu->kvm, sp, fn);
3317                 return;
3318         }
3319         for (i = 0; i < 4; ++i) {
3320                 hpa_t root = vcpu->arch.mmu.pae_root[i];
3321
3322                 if (root && VALID_PAGE(root)) {
3323                         root &= PT64_BASE_ADDR_MASK;
3324                         sp = page_header(root);
3325                         __mmu_spte_walk(vcpu->kvm, sp, fn);
3326                 }
3327         }
3328         return;
3329 }
3330
3331 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3332                                 gva_t va, int level)
3333 {
3334         u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3335         int i;
3336         gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3337
3338         for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3339                 u64 ent = pt[i];
3340
3341                 if (ent == shadow_trap_nonpresent_pte)
3342                         continue;
3343
3344                 va = canonicalize(va);
3345                 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3346                         audit_mappings_page(vcpu, ent, va, level - 1);
3347                 else {
3348                         gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3349                         gfn_t gfn = gpa >> PAGE_SHIFT;
3350                         pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3351                         hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3352
3353                         if (is_error_pfn(pfn)) {
3354                                 kvm_release_pfn_clean(pfn);
3355                                 continue;
3356                         }
3357
3358                         if (is_shadow_present_pte(ent)
3359                             && (ent & PT64_BASE_ADDR_MASK) != hpa)
3360                                 printk(KERN_ERR "xx audit error: (%s) levels %d"
3361                                        " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3362                                        audit_msg, vcpu->arch.mmu.root_level,
3363                                        va, gpa, hpa, ent,
3364                                        is_shadow_present_pte(ent));
3365                         else if (ent == shadow_notrap_nonpresent_pte
3366                                  && !is_error_hpa(hpa))
3367                                 printk(KERN_ERR "audit: (%s) notrap shadow,"
3368                                        " valid guest gva %lx\n", audit_msg, va);
3369                         kvm_release_pfn_clean(pfn);
3370
3371                 }
3372         }
3373 }
3374
3375 static void audit_mappings(struct kvm_vcpu *vcpu)
3376 {
3377         unsigned i;
3378
3379         if (vcpu->arch.mmu.root_level == 4)
3380                 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3381         else
3382                 for (i = 0; i < 4; ++i)
3383                         if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3384                                 audit_mappings_page(vcpu,
3385                                                     vcpu->arch.mmu.pae_root[i],
3386                                                     i << 30,
3387                                                     2);
3388 }
3389
3390 static int count_rmaps(struct kvm_vcpu *vcpu)
3391 {
3392         struct kvm *kvm = vcpu->kvm;
3393         struct kvm_memslots *slots;
3394         int nmaps = 0;
3395         int i, j, k, idx;
3396
3397         idx = srcu_read_lock(&kvm->srcu);
3398         slots = kvm_memslots(kvm);
3399         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3400                 struct kvm_memory_slot *m = &slots->memslots[i];
3401                 struct kvm_rmap_desc *d;
3402
3403                 for (j = 0; j < m->npages; ++j) {
3404                         unsigned long *rmapp = &m->rmap[j];
3405
3406                         if (!*rmapp)
3407                                 continue;
3408                         if (!(*rmapp & 1)) {
3409                                 ++nmaps;
3410                                 continue;
3411                         }
3412                         d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3413                         while (d) {
3414                                 for (k = 0; k < RMAP_EXT; ++k)
3415                                         if (d->sptes[k])
3416                                                 ++nmaps;
3417                                         else
3418                                                 break;
3419                                 d = d->more;
3420                         }
3421                 }
3422         }
3423         srcu_read_unlock(&kvm->srcu, idx);
3424         return nmaps;
3425 }
3426
3427 void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3428 {
3429         unsigned long *rmapp;
3430         struct kvm_mmu_page *rev_sp;
3431         gfn_t gfn;
3432
3433         if (is_writable_pte(*sptep)) {
3434                 rev_sp = page_header(__pa(sptep));
3435                 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3436
3437                 if (!gfn_to_memslot(kvm, gfn)) {
3438                         if (!printk_ratelimit())
3439                                 return;
3440                         printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3441                                          audit_msg, gfn);
3442                         printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3443                                audit_msg, (long int)(sptep - rev_sp->spt),
3444                                         rev_sp->gfn);
3445                         dump_stack();
3446                         return;
3447                 }
3448
3449                 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3450                 if (!*rmapp) {
3451                         if (!printk_ratelimit())
3452                                 return;
3453                         printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3454                                          audit_msg, *sptep);
3455                         dump_stack();
3456                 }
3457         }
3458
3459 }
3460
3461 void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3462 {
3463         mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3464 }
3465
3466 static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3467 {
3468         struct kvm_mmu_page *sp;
3469         int i;
3470
3471         list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3472                 u64 *pt = sp->spt;
3473
3474                 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3475                         continue;
3476
3477                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3478                         u64 ent = pt[i];
3479
3480                         if (!(ent & PT_PRESENT_MASK))
3481                                 continue;
3482                         if (!is_writable_pte(ent))
3483                                 continue;
3484                         inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3485                 }
3486         }
3487         return;
3488 }
3489
/*
 * rmap part of the audit: check the writable last-level mappings, then
 * walk all rmaps via count_rmaps().
 */
static void audit_rmap(struct kvm_vcpu *vcpu)
{
        check_writable_mappings_rmap(vcpu);

        /* The returned count is not compared against anything here. */
        (void)count_rmaps(vcpu);
}
3495
3496 static void audit_write_protection(struct kvm_vcpu *vcpu)
3497 {
3498         struct kvm_mmu_page *sp;
3499         struct kvm_memory_slot *slot;
3500         unsigned long *rmapp;
3501         u64 *spte;
3502         gfn_t gfn;
3503
3504         list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3505                 if (sp->role.direct)
3506                         continue;
3507                 if (sp->unsync)
3508                         continue;
3509
3510                 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3511                 rmapp = &slot->rmap[gfn - slot->base_gfn];
3512
3513                 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3514                 while (spte) {
3515                         if (is_writable_pte(*spte))
3516                                 printk(KERN_ERR "%s: (%s) shadow page has "
3517                                 "writable mappings: gfn %lx role %x\n",
3518                                __func__, audit_msg, sp->gfn,
3519                                sp->role.word);
3520                         spte = rmap_next(vcpu->kvm, rmapp, spte);
3521                 }
3522         }
3523 }
3524
3525 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3526 {
3527         int olddbg = dbg;
3528
3529         dbg = 0;
3530         audit_msg = msg;
3531         audit_rmap(vcpu);
3532         audit_write_protection(vcpu);
3533         if (strcmp("pre pte write", audit_msg) != 0)
3534                 audit_mappings(vcpu);
3535         audit_writable_sptes_have_rmaps(vcpu);
3536         dbg = olddbg;
3537 }
3538
3539 #endif