Merge tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 4 Oct 2012 16:30:33 +0000 (09:30 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 4 Oct 2012 16:30:33 +0000 (09:30 -0700)
Pull KVM updates from Avi Kivity:
 "Highlights of the changes for this release include support for vfio
  level triggered interrupts, improved big real mode support on older
  Intels, a streamlined guest page table walker, guest APIC speedups,
  PIO optimizations, better overcommit handling, and read-only memory."

* tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits)
  KVM: s390: Fix vcpu_load handling in interrupt code
  KVM: x86: Fix guest debug across vcpu INIT reset
  KVM: Add resampling irqfds for level triggered interrupts
  KVM: optimize apic interrupt delivery
  KVM: MMU: Eliminate pointless temporary 'ac'
  KVM: MMU: Avoid access/dirty update loop if all is well
  KVM: MMU: Eliminate eperm temporary
  KVM: MMU: Optimize is_last_gpte()
  KVM: MMU: Simplify walk_addr_generic() loop
  KVM: MMU: Optimize pte permission checks
  KVM: MMU: Update accessed and dirty bits after guest pagetable walk
  KVM: MMU: Move gpte_access() out of paging_tmpl.h
  KVM: MMU: Optimize gpte_access() slightly
  KVM: MMU: Push clean gpte write protection out of gpte_access()
  KVM: clarify kvmclock documentation
  KVM: make processes waiting on vcpu mutex killable
  KVM: SVM: Make use of asm.h
  KVM: VMX: Make use of asm.h
  KVM: VMX: Make lto-friendly
  KVM: x86: lapic: Clean up find_highest_vector() and count_vectors()
  ...

Conflicts:
arch/s390/include/asm/processor.h
arch/x86/kvm/i8259.c

62 files changed:
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/hypercalls.txt [new file with mode: 0644]
Documentation/virtual/kvm/msr.txt
Documentation/virtual/kvm/ppc-pv.txt
arch/ia64/kvm/kvm-ia64.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kvm/44x_tlb.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/e500_tlb.c
arch/powerpc/kvm/powerpc.c
arch/s390/include/asm/processor.h
arch/s390/kernel/dis.c
arch/s390/kvm/Kconfig
arch/s390/kvm/diag.c
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/priv.c
arch/s390/kvm/sigp.c
arch/s390/kvm/trace-s390.h [new file with mode: 0644]
arch/s390/kvm/trace.h [new file with mode: 0644]
arch/x86/Kconfig
arch/x86/include/asm/kvm.h
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_para.h
arch/x86/kernel/Makefile
arch/x86/kernel/kvm.c
arch/x86/kernel/setup.c
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/emulate.c
arch/x86/kvm/i8254.c
arch/x86/kvm/i8254.h
arch/x86/kvm/i8259.c
arch/x86/kvm/irq.h
arch/x86/kvm/kvm_timer.h [deleted file]
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/pmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/timer.c [deleted file]
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm.h
include/linux/kvm_host.h
kernel/jump_label.c
virt/kvm/Kconfig
virt/kvm/async_pf.c
virt/kvm/eventfd.c
virt/kvm/ioapic.c
virt/kvm/iommu.c
virt/kvm/irq_comm.c
virt/kvm/kvm_main.c

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index bf33aaa4c59f8f2e507fdd3e9c32528b16e831d1..f6ec3a92e62148087b8ac0764e8f916e175d8eb8 100644 (file)
@@ -857,7 +857,8 @@ struct kvm_userspace_memory_region {
 };
 
 /* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES  1UL
+#define KVM_MEM_LOG_DIRTY_PAGES        (1UL << 0)
+#define KVM_MEM_READONLY       (1UL << 1)
 
 This ioctl allows the user to create or modify a guest physical memory
 slot.  When changing an existing slot, it may be moved in the guest
@@ -873,14 +874,17 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
 be identical.  This allows large pages in the guest to be backed by large
 pages in the host.
 
-The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
-instructs kvm to keep track of writes to memory within the slot.  See
-the KVM_GET_DIRTY_LOG ioctl.
+The flags field supports two flags.  KVM_MEM_LOG_DIRTY_PAGES instructs
+kvm to keep track of writes to memory within the slot; see the
+KVM_GET_DIRTY_LOG ioctl.  The KVM_CAP_READONLY_MEM capability indicates the
+availability of the KVM_MEM_READONLY flag.  When this flag is set for a
+memory region, KVM only allows read accesses.  Writes will be posted to
+userspace as KVM_EXIT_MMIO exits.
 
-When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
-region are automatically reflected into the guest.  For example, an mmap()
-that affects the region will be made visible immediately.  Another example
-is madvise(MADV_DROP).
+When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
+the memory region are automatically reflected into the guest.  For example, an
+mmap() that affects the region will be made visible immediately.  Another
+example is madvise(MADV_DROP).
 
 It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
 The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
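As a rough userspace illustration of the two flags documented above (a
sketch, not part of this commit; error handling is omitted, and it assumes
the host advertises KVM_CAP_READONLY_MEM):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);
		int vm = ioctl(kvm, KVM_CREATE_VM, 0);
		void *rom = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		struct kvm_userspace_memory_region region = {
			.slot = 0,
			.flags = KVM_MEM_READONLY, /* guest writes become KVM_EXIT_MMIO */
			.guest_phys_addr = 0x100000,
			.memory_size = 0x1000,
			.userspace_addr = (unsigned long)rom,
		};

		if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_READONLY_MEM) > 0)
			ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region);
		return 0;
	}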
@@ -1946,6 +1950,19 @@ the guest using the specified gsi pin.  The irqfd is removed using
 the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
 and kvm_irqfd.gsi.
 
+With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify
+mechanism allowing emulation of level-triggered, irqfd-based
+interrupts.  When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an
+additional eventfd in the kvm_irqfd.resamplefd field.  When operating
+in resample mode, posting of an interrupt through kvm_irqfd.fd asserts
+the specified gsi in the irqchip.  When the irqchip is resampled, such
+as from an EOI, the gsi is de-asserted and the user is notified via
+kvm_irqfd.resamplefd.  It is the user's responsibility to re-queue
+the interrupt if the device making use of it still requires service.
+Note that closing the resamplefd is not sufficient to disable the
+irqfd.  The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
+and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
+
 4.76 KVM_PPC_ALLOCATE_HTAB
 
 Capability: KVM_CAP_PPC_ALLOC_HTAB
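A sketch of the resampling irqfd setup described above (hypothetical
helper, not part of this commit; the vm fd is assumed to exist and error
handling is omitted):

	#include <sys/eventfd.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int assign_resample_irqfd(int vm_fd, unsigned int gsi)
	{
		struct kvm_irqfd irqfd = {
			.fd = eventfd(0, 0),         /* signaled by userspace to assert the gsi */
			.resamplefd = eventfd(0, 0), /* notified when the gsi is de-asserted */
			.gsi = gsi,
			.flags = KVM_IRQFD_FLAG_RESAMPLE,
		};

		return ioctl(vm_fd, KVM_IRQFD, &irqfd);
	}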
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
new file mode 100644 (file)
index 0000000..ea113b5
--- /dev/null
@@ -0,0 +1,66 @@
+Linux KVM Hypercall:
+====================
+X86:
+ A KVM hypercall is the three-byte sequence of either the vmcall or the
+ vmmcall instruction. The hypervisor can replace it with instructions that
+ are guaranteed to be supported.
+
+ Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ The hypercall number should be placed in rax and the return value will be
+ placed in rax.  No other registers will be clobbered unless explicitly stated
+ by the particular hypercall.
+
+S390:
+  R2-R7 are used for parameters 1-6. In addition, R1 is used for the
+  hypercall number. The return value is written to R2.
+
+  S390 uses the diagnose instruction 0x500 as its hypercall, with the
+  hypercall number in R1.
+
+PowerPC:
+  Parameters are passed in R3-R10 and the hypercall number in R11. R4-R11
+  are used as output registers. The return value is placed in R3.
+
+  KVM hypercalls use a 4-byte opcode that is patched via the
+  'hypercall-instructions' property inside the device tree's /hypervisor node.
+  For more information refer to Documentation/virtual/kvm/ppc-pv.txt.
+
+KVM Hypercalls Documentation
+============================
+The template for each hypercall is:
+1. Hypercall name.
+2. Architecture(s)
+3. Status (deprecated, obsolete, active)
+4. Purpose
+
+1. KVM_HC_VAPIC_POLL_IRQ
+------------------------
+Architecture: x86
+Status: active
+Purpose: Trigger guest exit so that the host can check for pending
+interrupts on reentry.
+
+2. KVM_HC_MMU_OP
+------------------------
+Architecture: x86
+Status: deprecated
+Purpose: Support MMU operations such as writing to a PTE, flushing the
+TLB, and releasing a page table.
+
+3. KVM_HC_FEATURES
+------------------------
+Architecture: PPC
+Status: active
+Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid
+is used to enumerate which hypercalls are available. On PPC, either a device
+tree based lookup (which is also what ePAPR dictates) or a KVM-specific
+enumeration mechanism (this hypercall) can be used.
+
+4. KVM_HC_PPC_MAP_MAGIC_PAGE
+------------------------
+Architecture: PPC
+Status: active
+Purpose: To enable communication between the hypervisor and guest there is a
+shared page that contains parts of supervisor-visible register state.
+The guest can use this hypercall to map the shared page and access its
+supervisor registers through memory.
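The x86 convention above maps directly onto a short inline-assembly helper.
A sketch in the style of the kvm_hypercall*() helpers in
arch/x86/include/asm/kvm_para.h (two-argument variant shown):

	/* vmcall encoded as raw bytes so the hypervisor can patch it
	 * (e.g. to vmmcall on AMD); nr in rax, args in rbx and rcx */
	static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
					  unsigned long p2)
	{
		long ret;

		asm volatile(".byte 0x0f,0x01,0xc1"
			     : "=a"(ret)
			     : "a"(nr), "b"(p1), "c"(p2)
			     : "memory");
		return ret;
	}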
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index 730471048583b9a034a78eb1da4b805373f29c25..6d470ae7b073a2fa13e2f28d741166cbc346e42f 100644 (file)
@@ -34,9 +34,12 @@ MSR_KVM_WALL_CLOCK_NEW:   0x4b564d00
                time information and check that they are both equal and even.
                An odd version indicates an in-progress update.
 
-               sec: number of seconds for wallclock.
+               sec: number of seconds for wallclock at time of boot.
 
-               nsec: number of nanoseconds for wallclock.
+               nsec: number of nanoseconds for wallclock at time of boot.
+
+       In order to get the current wallclock time, the system_time from
+       MSR_KVM_SYSTEM_TIME_NEW needs to be added.
 
        Note that although MSRs are per-CPU entities, the effect of this
        particular MSR is global.
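In code, the derivation reads roughly as follows (a sketch; sec and nsec
come from the wallclock structure above, system_time_ns from the
MSR_KVM_SYSTEM_TIME_NEW structure described below):

	#include <stdint.h>

	static inline uint64_t kvm_wallclock_ns(uint32_t sec, uint32_t nsec,
						uint64_t system_time_ns)
	{
		return (uint64_t)sec * 1000000000ULL + nsec + system_time_ns;
	}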
@@ -82,20 +85,25 @@ MSR_KVM_SYSTEM_TIME_NEW:  0x4b564d01
                time at the time this structure was last updated. Unit is
                nanoseconds.
 
-               tsc_to_system_mul: a function of the tsc frequency. One has
-               to multiply any tsc-related quantity by this value to get
-               a value in nanoseconds, besides dividing by 2^tsc_shift
+               tsc_to_system_mul: multiplier to be used when converting a
+               tsc-related quantity to nanoseconds.
 
-               tsc_shift: cycle to nanosecond divider, as a power of two, to
-               allow for shift rights. One has to shift right any tsc-related
-               quantity by this value to get a value in nanoseconds, besides
-               multiplying by tsc_to_system_mul.
+               tsc_shift: shift to be used when converting a tsc-related
+               quantity to nanoseconds. This shift will ensure that
+               multiplication with tsc_to_system_mul does not overflow.
+               A positive value denotes a left shift, a negative value
+               a right shift.
 
-               With this information, guests can derive per-CPU time by
-               doing:
+               The conversion from tsc to nanoseconds involves an additional
+               right shift by 32 bits. With this information, guests can
+               derive per-CPU time by doing:
 
                        time = (current_tsc - tsc_timestamp)
-                       time = (time * tsc_to_system_mul) >> tsc_shift
+                       if (tsc_shift >= 0)
+                               time <<= tsc_shift;
+                       else
+                               time >>= -tsc_shift;
+                       time = (time * tsc_to_system_mul) >> 32
                        time = time + system_time
 
                flags: bits in this field indicate extended capabilities
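A C rendering of the pseudocode above (a sketch; the 64x32-bit multiply
needs a wider intermediate, expressed here with GCC's unsigned __int128):

	#include <stdint.h>

	static inline uint64_t pvclock_delta_to_ns(uint64_t tsc,
						   uint64_t tsc_timestamp,
						   uint32_t tsc_to_system_mul,
						   int8_t tsc_shift)
	{
		uint64_t delta = tsc - tsc_timestamp;

		if (tsc_shift >= 0)
			delta <<= tsc_shift;
		else
			delta >>= -tsc_shift;

		/* 64x32 -> 96-bit product, then the implicit >> 32 */
		return (uint64_t)(((unsigned __int128)delta *
				   tsc_to_system_mul) >> 32);
	}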
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt
index 4911cf95c67e51507118139a7d20897f67fbd888..4cd076febb0239452db01b8e127d6f810081efbb 100644 (file)
@@ -174,3 +174,25 @@ following:
 That way we can inject an arbitrary amount of code as replacement for a single
 instruction. This allows us to check for pending interrupts when setting EE=1
 for example.
+
+Hypercall ABIs in KVM on PowerPC
+=================================
+1) KVM hypercalls (ePAPR)
+
+This is the ePAPR-compliant hypercall implementation (mentioned above). Even
+generic hypercalls are implemented here, like the ePAPR idle hcall. These are
+available on all targets.
+
+2) PAPR hypercalls
+
+PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in
+QEMU). These are the same hypercalls that pHyp, the POWER hypervisor,
+implements. Some of them are handled in the kernel, some are handled in user
+space. This is only available on book3s_64.
+
+3) OSI hypercalls
+
+Mac-on-Linux is another user of KVM on PowerPC, and it has had its own
+hypercall interface since long before KVM. It is supported to maintain
+compatibility. All these hypercalls get forwarded to user space. This is
+only useful on book3s_32, but can be used with book3s_64 as well.
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index bd77cb507c1c7401124dbd3dc86fe663c17e6737..8b3a9c0e771dc3926131b428ba2472a337d41845 100644 (file)
@@ -924,6 +924,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        return 0;
 }
 
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+       if (!irqchip_in_kernel(kvm))
+               return -ENXIO;
+
+       irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+                                       irq_event->irq, irq_event->level);
+       return 0;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                unsigned int ioctl, unsigned long arg)
 {
@@ -963,29 +973,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        goto out;
                }
                break;
-       case KVM_IRQ_LINE_STATUS:
-       case KVM_IRQ_LINE: {
-               struct kvm_irq_level irq_event;
-
-               r = -EFAULT;
-               if (copy_from_user(&irq_event, argp, sizeof irq_event))
-                       goto out;
-               r = -ENXIO;
-               if (irqchip_in_kernel(kvm)) {
-                       __s32 status;
-                       status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-                                   irq_event.irq, irq_event.level);
-                       if (ioctl == KVM_IRQ_LINE_STATUS) {
-                               r = -EFAULT;
-                               irq_event.status = status;
-                               if (copy_to_user(argp, &irq_event,
-                                                       sizeof irq_event))
-                                       goto out;
-                       }
-                       r = 0;
-               }
-               break;
-               }
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
                struct kvm_irqchip chip;
@@ -1626,11 +1613,17 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
        return;
 }
 
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
        kvm_flush_remote_tlbs(kvm);
 }
 
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot)
+{
+       kvm_arch_flush_shadow_all(kvm);
+}
+
 long kvm_arch_dev_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
 {
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a8bf5c673a3c430c7aa6f45cbfd43e290dd3a95c..28e8f5e5c63e7e1ac87247edaf2a0052e6c128a8 100644 (file)
@@ -53,6 +53,8 @@
 
 struct kvm;
 extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_unmap_hva_range(struct kvm *kvm,
+                              unsigned long start, unsigned long end);
 extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
@@ -220,6 +222,7 @@ struct revmap_entry {
 #define KVMPPC_GOT_PAGE                0x80
 
 struct kvm_arch_memory_slot {
+       unsigned long *rmap;
 };
 
 struct kvm_arch {
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 33aa715dab28b776deeea328271e5eb89abd37b8..5dd3ab46997603e6f55c681781b773e50a0ff557 100644 (file)
@@ -319,7 +319,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
        if (is_error_page(new_page)) {
                printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
                        (unsigned long long)gfn);
-               kvm_release_page_clean(new_page);
                return;
        }
        hpaddr = page_to_phys(new_page);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d03eb6f7b0584e368bdd96e0f1969af58e885371..d95d11322a159912230832371eefbe74c61c8e62 100644 (file)
@@ -705,7 +705,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                goto out_unlock;
        hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 
-       rmap = &memslot->rmap[gfn - memslot->base_gfn];
+       rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
        lock_rmap(rmap);
 
        /* Check if we might have been invalidated; let the guest retry if so */
@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        goto out_put;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-                         int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-                                        unsigned long gfn))
+static int kvm_handle_hva_range(struct kvm *kvm,
+                               unsigned long start,
+                               unsigned long end,
+                               int (*handler)(struct kvm *kvm,
+                                              unsigned long *rmapp,
+                                              unsigned long gfn))
 {
        int ret;
        int retval = 0;
@@ -767,15 +770,25 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 
        slots = kvm_memslots(kvm);
        kvm_for_each_memslot(memslot, slots) {
-               unsigned long start = memslot->userspace_addr;
-               unsigned long end;
+               unsigned long hva_start, hva_end;
+               gfn_t gfn, gfn_end;
 
-               end = start + (memslot->npages << PAGE_SHIFT);
-               if (hva >= start && hva < end) {
-                       gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+               hva_start = max(start, memslot->userspace_addr);
+               hva_end = min(end, memslot->userspace_addr +
+                                       (memslot->npages << PAGE_SHIFT));
+               if (hva_start >= hva_end)
+                       continue;
+               /*
+                * {gfn(page) | page intersects with [hva_start, hva_end)} =
+                * {gfn, gfn+1, ..., gfn_end-1}.
+                */
+               gfn = hva_to_gfn_memslot(hva_start, memslot);
+               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+               for (; gfn < gfn_end; ++gfn) {
+                       gfn_t gfn_offset = gfn - memslot->base_gfn;
 
-                       ret = handler(kvm, &memslot->rmap[gfn_offset],
-                                     memslot->base_gfn + gfn_offset);
+                       ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
                        retval |= ret;
                }
        }
@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
        return retval;
 }
 
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+                         int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+                                        unsigned long gfn))
+{
+       return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
+}
+
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                           unsigned long gfn)
 {
@@ -850,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
        return 0;
 }
 
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+       if (kvm->arch.using_mmu_notifiers)
+               kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+       return 0;
+}
+
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                         unsigned long gfn)
 {
@@ -1009,7 +1036,7 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
        unsigned long *rmapp, *map;
 
        preempt_disable();
-       rmapp = memslot->rmap;
+       rmapp = memslot->arch.rmap;
        map = memslot->dirty_bitmap;
        for (i = 0; i < memslot->npages; ++i) {
                if (kvm_test_clear_dirty(kvm, rmapp))
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 5c70d19494f9251bee3968ba5f23a7c56cd6c926..fb0e821622d4fc97465288cd57f6c12978bdc381 100644 (file)
@@ -84,7 +84,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
                return;
 
-       rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]);
+       rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
        lock_rmap(rmap);
 
        head = *rmap & KVMPPC_RMAP_INDEX;
@@ -180,7 +180,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
        if (!slot_is_aligned(memslot, psize))
                return H_PARAMETER;
        slot_fn = gfn - memslot->base_gfn;
-       rmap = &memslot->rmap[slot_fn];
+       rmap = &memslot->arch.rmap[slot_fn];
 
        if (!kvm->arch.using_mmu_notifiers) {
                physp = kvm->arch.slot_phys[memslot->id];
@@ -197,7 +197,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                pa &= PAGE_MASK;
        } else {
                /* Translate to host virtual address */
-               hva = gfn_to_hva_memslot(memslot, gfn);
+               hva = __gfn_to_hva_memslot(memslot, gfn);
 
                /* Look up the Linux PTE for the backing page */
                pte_size = psize;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index a1baec340f7ee3e0492d3fcd86e14e41a8811ccf..05c28f59f77f4a4bc46c196addf12747cbb97e4f 100644 (file)
@@ -242,10 +242,8 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
        int i;
 
        hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
-       if (is_error_page(hpage)) {
-               kvm_release_page_clean(hpage);
+       if (is_error_page(hpage))
                return;
-       }
 
        hpage_offset = pte->raddr & ~PAGE_MASK;
        hpage_offset &= ~0xFFFULL;
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index a2b66717813dfef6c43e9bf23864f8766266f0a5..ff38b664195d6dcac44f1e24b6e15c6a431abd7b 100644 (file)
@@ -520,11 +520,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
        if (likely(!pfnmap)) {
                unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
-               pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
+               pfn = gfn_to_pfn_memslot(slot, gfn);
                if (is_error_pfn(pfn)) {
                        printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
                                        (long)gfn);
-                       kvm_release_pfn_clean(pfn);
                        return;
                }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc886076d3ad2d6ecede5dfbe1990cd568e0..4d213b8b0fb55eeb1eff3ae1ab591ab91d00dba9 100644 (file)
@@ -302,10 +302,18 @@ long kvm_arch_dev_ioctl(struct file *filp,
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont)
 {
+       if (!dont || free->arch.rmap != dont->arch.rmap) {
+               vfree(free->arch.rmap);
+               free->arch.rmap = NULL;
+       }
 }
 
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 {
+       slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
+       if (!slot->arch.rmap)
+               return -ENOMEM;
+
        return 0;
 }
 
@@ -326,8 +334,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
        kvmppc_core_commit_memory_region(kvm, mem);
 }
 
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+}
 
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot)
 {
 }
 
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index f3e0aabfc6bcb003a367b9b0e5d9d2087989e28e..56831dfa9198c6e2192605fd5ee9d68c538d8b17 100644 (file)
@@ -159,6 +159,7 @@ extern unsigned long thread_saved_pc(struct task_struct *t);
 
 extern void show_code(struct pt_regs *regs);
 extern void print_fn_code(unsigned char *code, unsigned long len);
+extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]);
 
 unsigned long get_wchan(struct task_struct *p);
 #define task_pt_regs(tsk) ((struct pt_regs *) \
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index cc84a24c023ff8b2de506e4957861dbd2c9a3e71..f00286bd2ef9050ad0cb166415272915eb7c5f5f 100644 (file)
@@ -1501,6 +1501,33 @@ static struct insn *find_insn(unsigned char *code)
        return NULL;
 }
 
+/**
+ * insn_to_mnemonic - decode an s390 instruction
+ * @instruction: instruction to decode
+ * @buf: buffer to fill with mnemonic
+ *
+ * Decode the instruction at @instruction and store the corresponding
+ * mnemonic into @buf.
+ * @buf is left unchanged if the instruction could not be decoded.
+ * Returns:
+ *  %0 on success, %-ENOENT if the instruction was not found.
+ */
+int insn_to_mnemonic(unsigned char *instruction, char buf[8])
+{
+       struct insn *insn;
+
+       insn = find_insn(instruction);
+       if (!insn)
+               return -ENOENT;
+       if (insn->name[0] == '\0')
+               snprintf(buf, sizeof(buf), "%s",
+                        long_insn_name[(int) insn->name[1]]);
+       else
+               snprintf(buf, sizeof(buf), "%.5s", insn->name);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(insn_to_mnemonic);
+
 static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
 {
        struct insn *insn;
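The in-tree consumer of this helper is the kvm_s390_intercept_instruction
tracepoint added later in this commit; a minimal hypothetical kernel-side
caller would look like:

	/* hypothetical helper: log the mnemonic of an intercepted instruction */
	static void log_insn(unsigned char *code)
	{
		char mnemonic[8];

		if (insn_to_mnemonic(code, mnemonic) == 0)
			pr_info("intercepted insn: %s\n", mnemonic);
		else
			pr_info("intercepted insn: unknown\n");
	}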
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 9b04a32e56958f2cf5d4397a3721e94ccb2e44d5..b58dd869cb320ffeedafe7bcb6489db59bbb2cba 100644 (file)
@@ -21,6 +21,7 @@ config KVM
        depends on HAVE_KVM && EXPERIMENTAL
        select PREEMPT_NOTIFIERS
        select ANON_INODES
+       select HAVE_KVM_CPU_RELAX_INTERCEPT
        ---help---
          Support hosting paravirtualized guest machines using the SIE
          virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index c88bb7793390d7a1e26d0982529b32d4747ca67f..a390687feb1359d6b579024d51787e4eeff0207e 100644 (file)
@@ -14,6 +14,8 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include "kvm-s390.h"
+#include "trace.h"
+#include "trace-s390.h"
 
 static int diag_release_pages(struct kvm_vcpu *vcpu)
 {
@@ -98,6 +100,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
        vcpu->run->exit_reason = KVM_EXIT_S390_RESET;
        VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx",
          vcpu->run->s390_reset_flags);
+       trace_kvm_s390_request_resets(vcpu->run->s390_reset_flags);
        return -EREMOTE;
 }
 
@@ -105,6 +108,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
 {
        int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
 
+       trace_kvm_s390_handle_diag(vcpu, code);
        switch (code) {
        case 0x10:
                return diag_release_pages(vcpu);
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index adae539f12e2fbaeb49f41b462f95bc4354ce3f9..22798ec33fd16bd58e5a9726a6f5eec41d798d6a 100644 (file)
@@ -19,6 +19,8 @@
 
 #include "kvm-s390.h"
 #include "gaccess.h"
+#include "trace.h"
+#include "trace-s390.h"
 
 static int handle_lctlg(struct kvm_vcpu *vcpu)
 {
@@ -45,6 +47,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
 
        VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
                   disp2);
+       trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
 
        do {
                rc = get_guest_u64(vcpu, useraddr,
@@ -82,6 +85,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
 
        VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
                   disp2);
+       trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
 
        reg = reg1;
        do {
@@ -135,6 +139,8 @@ static int handle_stop(struct kvm_vcpu *vcpu)
        vcpu->stat.exit_stop_request++;
        spin_lock_bh(&vcpu->arch.local_int.lock);
 
+       trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
+
        if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
                vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
                rc = SIE_INTERCEPT_RERUNVCPU;
@@ -171,6 +177,7 @@ static int handle_validity(struct kvm_vcpu *vcpu)
        int rc;
 
        vcpu->stat.exit_validity++;
+       trace_kvm_s390_intercept_validity(vcpu, viwhy);
        if (viwhy == 0x37) {
                vmaddr = gmap_fault(vcpu->arch.sie_block->prefix,
                                    vcpu->arch.gmap);
@@ -213,6 +220,9 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
        intercept_handler_t handler;
 
        vcpu->stat.exit_instruction++;
+       trace_kvm_s390_intercept_instruction(vcpu,
+                                            vcpu->arch.sie_block->ipa,
+                                            vcpu->arch.sie_block->ipb);
        handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8];
        if (handler)
                return handler(vcpu);
@@ -222,6 +232,7 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
 static int handle_prog(struct kvm_vcpu *vcpu)
 {
        vcpu->stat.exit_program_interruption++;
+       trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc);
        return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc);
 }
 
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index b7bc1aac8ed2dc3611c50f24c7c4b53989642661..ff1e2f8ef94a02146e9b90e56c7105f9e9b140f3 100644 (file)
@@ -19,6 +19,7 @@
 #include <asm/uaccess.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
+#include "trace-s390.h"
 
 static int psw_extint_disabled(struct kvm_vcpu *vcpu)
 {
@@ -130,6 +131,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
        case KVM_S390_INT_EMERGENCY:
                VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg");
                vcpu->stat.deliver_emergency_signal++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->emerg.code, 0);
                rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201);
                if (rc == -EFAULT)
                        exception = 1;
@@ -152,6 +155,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
        case KVM_S390_INT_EXTERNAL_CALL:
                VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
                vcpu->stat.deliver_external_call++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->extcall.code, 0);
                rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
                if (rc == -EFAULT)
                        exception = 1;
@@ -175,6 +180,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
                           inti->ext.ext_params);
                vcpu->stat.deliver_service_signal++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->ext.ext_params, 0);
                rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401);
                if (rc == -EFAULT)
                        exception = 1;
@@ -198,6 +205,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
                           inti->ext.ext_params, inti->ext.ext_params2);
                vcpu->stat.deliver_virtio_interrupt++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->ext.ext_params,
+                                                inti->ext.ext_params2);
                rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603);
                if (rc == -EFAULT)
                        exception = 1;
@@ -229,6 +239,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
        case KVM_S390_SIGP_STOP:
                VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
                vcpu->stat.deliver_stop_signal++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                0, 0);
                __set_intercept_indicator(vcpu, inti);
                break;
 
@@ -236,12 +248,16 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
                           inti->prefix.address);
                vcpu->stat.deliver_prefix_signal++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->prefix.address, 0);
                kvm_s390_set_prefix(vcpu, inti->prefix.address);
                break;
 
        case KVM_S390_RESTART:
                VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart");
                vcpu->stat.deliver_restart_signal++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                0, 0);
                rc = copy_to_guest(vcpu, offsetof(struct _lowcore,
                  restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
                if (rc == -EFAULT)
@@ -259,6 +275,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                           inti->pgm.code,
                           table[vcpu->arch.sie_block->ipa >> 14]);
                vcpu->stat.deliver_program_int++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->pgm.code, 0);
                rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code);
                if (rc == -EFAULT)
                        exception = 1;
@@ -405,9 +423,7 @@ no_timer:
                set_current_state(TASK_INTERRUPTIBLE);
                spin_unlock_bh(&vcpu->arch.local_int.lock);
                spin_unlock(&vcpu->arch.local_int.float_int->lock);
-               vcpu_put(vcpu);
                schedule();
-               vcpu_load(vcpu);
                spin_lock(&vcpu->arch.local_int.float_int->lock);
                spin_lock_bh(&vcpu->arch.local_int.lock);
        }
@@ -515,6 +531,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
        inti->pgm.code = code;
 
        VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code);
+       trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, inti->type, code, 0, 1);
        spin_lock_bh(&li->lock);
        list_add(&inti->list, &li->list);
        atomic_set(&li->active, 1);
@@ -556,6 +573,8 @@ int kvm_s390_inject_vm(struct kvm *kvm,
                kfree(inti);
                return -EINVAL;
        }
+       trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
+                                2);
 
        mutex_lock(&kvm->lock);
        fi = &kvm->arch.float_int;
@@ -621,6 +640,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
                kfree(inti);
                return -EINVAL;
        }
+       trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, s390int->type, s390int->parm,
+                                  s390int->parm64, 2);
 
        mutex_lock(&vcpu->kvm->lock);
        li = &vcpu->arch.local_int;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d470ccbfabae02e015e206f833a93ec0820941fa..ecced9d18986895fc8af13680a876d5f0cf532db 100644 (file)
 #include "kvm-s390.h"
 #include "gaccess.h"
 
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+#include "trace-s390.h"
+
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
@@ -242,6 +246,7 @@ out_err:
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+       trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
        if (!kvm_is_ucontrol(vcpu->kvm)) {
                clear_bit(63 - vcpu->vcpu_id,
                          (unsigned long *) &vcpu->kvm->arch.sca->mcn);
@@ -417,6 +422,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                goto out_free_sie_block;
        VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
                 vcpu->arch.sie_block);
+       trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block);
 
        return vcpu;
 out_free_sie_block:
@@ -607,18 +613,22 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
        local_irq_enable();
        VCPU_EVENT(vcpu, 6, "entering sie flags %x",
                   atomic_read(&vcpu->arch.sie_block->cpuflags));
+       trace_kvm_s390_sie_enter(vcpu,
+                                atomic_read(&vcpu->arch.sie_block->cpuflags));
        rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
        if (rc) {
                if (kvm_is_ucontrol(vcpu->kvm)) {
                        rc = SIE_INTERCEPT_UCONTROL;
                } else {
                        VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
+                       trace_kvm_s390_sie_fault(vcpu);
                        kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
                        rc = 0;
                }
        }
        VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
                   vcpu->arch.sie_block->icptcode);
+       trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
        local_irq_disable();
        kvm_guest_exit();
        local_irq_enable();
@@ -959,7 +969,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
        return;
 }
 
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot)
 {
 }
 
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 310be61bead74bd3baf5fcb1d3e50cc132604408..d768906f15c81b27b82698ae49368579a2169ef9 100644 (file)
@@ -20,6 +20,7 @@
 #include <asm/sysinfo.h>
 #include "gaccess.h"
 #include "kvm-s390.h"
+#include "trace.h"
 
 static int handle_set_prefix(struct kvm_vcpu *vcpu)
 {
@@ -59,6 +60,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
        kvm_s390_set_prefix(vcpu, address);
 
        VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
+       trace_kvm_s390_handle_prefix(vcpu, 1, address);
 out:
        return 0;
 }
@@ -91,6 +93,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
        }
 
        VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
+       trace_kvm_s390_handle_prefix(vcpu, 0, address);
 out:
        return 0;
 }
@@ -119,6 +122,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
        }
 
        VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
+       trace_kvm_s390_handle_stap(vcpu, useraddr);
 out:
        return 0;
 }
@@ -164,9 +168,11 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
                           &facility_list, sizeof(facility_list));
        if (rc == -EFAULT)
                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-       else
+       else {
                VCPU_EVENT(vcpu, 5, "store facility list value %x",
                           facility_list);
+               trace_kvm_s390_handle_stfl(vcpu, facility_list);
+       }
        return 0;
 }
 
@@ -278,6 +284,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
                goto out_mem;
        }
+       trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
        free_page(mem);
        vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
        vcpu->run->s.regs.gprs[0] = 0;
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 56f80e1f98f7b1955e23d07bb80497ac3b33dca7..566ddf6e8dfb54290afdb69d706cceadd7b78dbb 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/sigp.h>
 #include "gaccess.h"
 #include "kvm-s390.h"
+#include "trace.h"
 
 static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
                        u64 *reg)
@@ -344,6 +345,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
        else
                parameter = vcpu->run->s.regs.gprs[r1 + 1];
 
+       trace_kvm_s390_handle_sigp(vcpu, order_code, cpu_addr, parameter);
        switch (order_code) {
        case SIGP_SENSE:
                vcpu->stat.instruction_sigp_sense++;
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
new file mode 100644 (file)
index 0000000..90fdf85
--- /dev/null
@@ -0,0 +1,210 @@
+#if !defined(_TRACE_KVMS390_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMS390_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm-s390
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace-s390
+
+/*
+ * Trace point for the creation of the kvm instance.
+ */
+TRACE_EVENT(kvm_s390_create_vm,
+           TP_PROTO(unsigned long type),
+           TP_ARGS(type),
+
+           TP_STRUCT__entry(
+                   __field(unsigned long, type)
+                   ),
+
+           TP_fast_assign(
+                   __entry->type = type;
+                   ),
+
+           TP_printk("create vm%s",
+                     __entry->type & KVM_VM_S390_UCONTROL ? " (UCONTROL)" : "")
+       );
+
+/*
+ * Trace points for creation and destruction of vcpus.
+ */
+TRACE_EVENT(kvm_s390_create_vcpu,
+           TP_PROTO(unsigned int id, struct kvm_vcpu *vcpu,
+                    struct kvm_s390_sie_block *sie_block),
+           TP_ARGS(id, vcpu, sie_block),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int, id)
+                   __field(struct kvm_vcpu *, vcpu)
+                   __field(struct kvm_s390_sie_block *, sie_block)
+                   ),
+
+           TP_fast_assign(
+                   __entry->id = id;
+                   __entry->vcpu = vcpu;
+                   __entry->sie_block = sie_block;
+                   ),
+
+           TP_printk("create cpu %d at %p, sie block at %p", __entry->id,
+                     __entry->vcpu, __entry->sie_block)
+       );
+
+TRACE_EVENT(kvm_s390_destroy_vcpu,
+           TP_PROTO(unsigned int id),
+           TP_ARGS(id),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int, id)
+                   ),
+
+           TP_fast_assign(
+                   __entry->id = id;
+                   ),
+
+           TP_printk("destroy cpu %d", __entry->id)
+       );
+
+/*
+ * Trace points for injection of interrupts, either per machine or
+ * per vcpu.
+ */
+
+#define kvm_s390_int_type                                              \
+       {KVM_S390_SIGP_STOP, "sigp stop"},                              \
+       {KVM_S390_PROGRAM_INT, "program interrupt"},                    \
+       {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"},                  \
+       {KVM_S390_RESTART, "sigp restart"},                             \
+       {KVM_S390_INT_VIRTIO, "virtio interrupt"},                      \
+       {KVM_S390_INT_SERVICE, "sclp interrupt"},                       \
+       {KVM_S390_INT_EMERGENCY, "sigp emergency"},                     \
+       {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"}
+
+TRACE_EVENT(kvm_s390_inject_vm,
+           TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who),
+           TP_ARGS(type, parm, parm64, who),
+
+           TP_STRUCT__entry(
+                   __field(__u32, inttype)
+                   __field(__u32, parm)
+                   __field(__u64, parm64)
+                   __field(int, who)
+                   ),
+
+           TP_fast_assign(
+                   __entry->inttype = type & 0x00000000ffffffff;
+                   __entry->parm = parm;
+                   __entry->parm64 = parm64;
+                   __entry->who = who;
+                   ),
+
+           TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx",
+                     (__entry->who == 1) ? " (from kernel)" :
+                     (__entry->who == 2) ? " (from user)" : "",
+                     __entry->inttype,
+                     __print_symbolic(__entry->inttype, kvm_s390_int_type),
+                     __entry->parm, __entry->parm64)
+       );
+
+TRACE_EVENT(kvm_s390_inject_vcpu,
+           TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64, \
+                    int who),
+           TP_ARGS(id, type, parm, parm64, who),
+
+           TP_STRUCT__entry(
+                   __field(int, id)
+                   __field(__u32, inttype)
+                   __field(__u32, parm)
+                   __field(__u64, parm64)
+                   __field(int, who)
+                   ),
+
+           TP_fast_assign(
+                   __entry->id = id;
+                   __entry->inttype = type & 0x00000000ffffffff;
+                   __entry->parm = parm;
+                   __entry->parm64 = parm64;
+                   __entry->who = who;
+                   ),
+
+           TP_printk("inject%s (vcpu %d): type:%x (%s) parm:%x parm64:%llx",
+                     (__entry->who == 1) ? " (from kernel)" :
+                     (__entry->who == 2) ? " (from user)" : "",
+                     __entry->id, __entry->inttype,
+                     __print_symbolic(__entry->inttype, kvm_s390_int_type),
+                     __entry->parm, __entry->parm64)
+       );
+
+/*
+ * Trace point for the actual delivery of interrupts.
+ */
+TRACE_EVENT(kvm_s390_deliver_interrupt,
+           TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1),
+           TP_ARGS(id, type, data0, data1),
+
+           TP_STRUCT__entry(
+                   __field(int, id)
+                   __field(__u32, inttype)
+                   __field(__u32, data0)
+                   __field(__u64, data1)
+                   ),
+
+           TP_fast_assign(
+                   __entry->id = id;
+                   __entry->inttype = type & 0x00000000ffffffff;
+                   __entry->data0 = data0;
+                   __entry->data1 = data1;
+                   ),
+
+           TP_printk("deliver interrupt (vcpu %d): type:%x (%s) "      \
+                     "data:%08x %016llx",
+                     __entry->id, __entry->inttype,
+                     __print_symbolic(__entry->inttype, kvm_s390_int_type),
+                     __entry->data0, __entry->data1)
+       );
+
+/*
+ * Trace point for resets that may be requested from userspace.
+ */
+TRACE_EVENT(kvm_s390_request_resets,
+           TP_PROTO(__u64 resets),
+           TP_ARGS(resets),
+
+           TP_STRUCT__entry(
+                   __field(__u64, resets)
+                   ),
+
+           TP_fast_assign(
+                   __entry->resets = resets;
+                   ),
+
+           TP_printk("requesting userspace resets %llx",
+                     __entry->resets)
+       );
+
+/*
+ * Trace point for a vcpu's stop requests.
+ */
+TRACE_EVENT(kvm_s390_stop_request,
+           TP_PROTO(unsigned int action_bits),
+           TP_ARGS(action_bits),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int, action_bits)
+                   ),
+
+           TP_fast_assign(
+                   __entry->action_bits = action_bits;
+                   ),
+
+           TP_printk("stop request, action_bits = %08x",
+                     __entry->action_bits)
+       );
+
+
+#endif /* _TRACE_KVMS390_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
new file mode 100644 (file)
index 0000000..2b29e62
--- /dev/null
@@ -0,0 +1,341 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+#include <asm/sigp.h>
+#include <asm/debug.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/*
+ * Helpers for vcpu-specific tracepoints containing the same information
+ * as s390dbf VCPU_EVENTs.
+ */
+#define VCPU_PROTO_COMMON struct kvm_vcpu *vcpu
+#define VCPU_ARGS_COMMON vcpu
+#define VCPU_FIELD_COMMON __field(int, id)                     \
+       __field(unsigned long, pswmask)                         \
+       __field(unsigned long, pswaddr)
+#define VCPU_ASSIGN_COMMON do {                                                \
+       __entry->id = vcpu->vcpu_id;                                    \
+       __entry->pswmask = vcpu->arch.sie_block->gpsw.mask;             \
+       __entry->pswaddr = vcpu->arch.sie_block->gpsw.addr;             \
+       } while (0);
+#define VCPU_TP_PRINTK(p_str, p_args...)                               \
+       TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id,           \
+                 __entry->pswmask, __entry->pswaddr, p_args)
+
+/*
+ * Tracepoints for SIE entry and exit.
+ */
+TRACE_EVENT(kvm_s390_sie_enter,
+           TP_PROTO(VCPU_PROTO_COMMON, int cpuflags),
+           TP_ARGS(VCPU_ARGS_COMMON, cpuflags),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(int, cpuflags)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->cpuflags = cpuflags;
+                   ),
+
+           VCPU_TP_PRINTK("entering sie flags %x", __entry->cpuflags)
+       );
+
+TRACE_EVENT(kvm_s390_sie_fault,
+           TP_PROTO(VCPU_PROTO_COMMON),
+           TP_ARGS(VCPU_ARGS_COMMON),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   ),
+
+           VCPU_TP_PRINTK("%s", "fault in sie instruction")
+       );
+
+#define sie_intercept_code                             \
+       {0x04, "Instruction"},                          \
+       {0x08, "Program interruption"},                 \
+       {0x0C, "Instruction and program interruption"}, \
+       {0x10, "External request"},                     \
+       {0x14, "External interruption"},                \
+       {0x18, "I/O request"},                          \
+       {0x1C, "Wait state"},                           \
+       {0x20, "Validity"},                             \
+       {0x28, "Stop request"}
+
+TRACE_EVENT(kvm_s390_sie_exit,
+           TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode),
+           TP_ARGS(VCPU_ARGS_COMMON, icptcode),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(u8, icptcode)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->icptcode = icptcode;
+                   ),
+
+           VCPU_TP_PRINTK("exit sie icptcode %d (%s)", __entry->icptcode,
+                          __print_symbolic(__entry->icptcode,
+                                           sie_intercept_code))
+       );
+
+/*
+ * Trace point for intercepted instructions.
+ */
+TRACE_EVENT(kvm_s390_intercept_instruction,
+           TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+           TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(__u64, instruction)
+                   __field(char, insn[8])
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->instruction = ((__u64)ipa << 48) |
+                   ((__u64)ipb << 16);
+                   ),
+
+           VCPU_TP_PRINTK("intercepted instruction %016llx (%s)",
+                          __entry->instruction,
+                          insn_to_mnemonic((unsigned char *)
+                                           &__entry->instruction,
+                                        __entry->insn) ?
+                          "unknown" : __entry->insn)
+       );
+
+/*
+ * Trace point for intercepted program interruptions.
+ */
+TRACE_EVENT(kvm_s390_intercept_prog,
+           TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
+           TP_ARGS(VCPU_ARGS_COMMON, code),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(__u16, code)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->code = code;
+                   ),
+
+           VCPU_TP_PRINTK("intercepted program interruption %04x",
+                          __entry->code)
+       );
+
+/*
+ * Trace point for validity intercepts.
+ */
+TRACE_EVENT(kvm_s390_intercept_validity,
+           TP_PROTO(VCPU_PROTO_COMMON, __u16 viwhy),
+           TP_ARGS(VCPU_ARGS_COMMON, viwhy),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(__u16, viwhy)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->viwhy = viwhy;
+                   ),
+
+           VCPU_TP_PRINTK("got validity intercept %04x", __entry->viwhy)
+       );
+
+/*
+ * Trace points for instructions that are of special interest.
+ */
+
+#define sigp_order_codes                                       \
+       {SIGP_SENSE, "sense"},                                  \
+       {SIGP_EXTERNAL_CALL, "external call"},                  \
+       {SIGP_EMERGENCY_SIGNAL, "emergency signal"},            \
+       {SIGP_STOP, "stop"},                                    \
+       {SIGP_STOP_AND_STORE_STATUS, "stop and store status"},  \
+       {SIGP_SET_ARCHITECTURE, "set architecture"},            \
+       {SIGP_SET_PREFIX, "set prefix"},                        \
+       {SIGP_SENSE_RUNNING, "sense running"},                  \
+       {SIGP_RESTART, "restart"}
+
+TRACE_EVENT(kvm_s390_handle_sigp,
+           TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \
+                    __u32 parameter),
+           TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr, parameter),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(__u8, order_code)
+                   __field(__u16, cpu_addr)
+                   __field(__u32, parameter)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->order_code = order_code;
+                   __entry->cpu_addr = cpu_addr;
+                   __entry->parameter = parameter;
+                   ),
+
+           VCPU_TP_PRINTK("handle sigp order %02x (%s), cpu address %04x, " \
+                          "parameter %08x", __entry->order_code,
+                          __print_symbolic(__entry->order_code,
+                                           sigp_order_codes),
+                          __entry->cpu_addr, __entry->parameter)
+       );
+
+#define diagnose_codes                         \
+       {0x10, "release pages"},                \
+       {0x44, "time slice end"},               \
+       {0x308, "ipl functions"},               \
+       {0x500, "kvm hypercall"},               \
+       {0x501, "kvm breakpoint"}
+
+TRACE_EVENT(kvm_s390_handle_diag,
+           TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
+           TP_ARGS(VCPU_ARGS_COMMON, code),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(__u16, code)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->code = code;
+                   ),
+
+           VCPU_TP_PRINTK("handle diagnose call %04x (%s)", __entry->code,
+                          __print_symbolic(__entry->code, diagnose_codes))
+       );
+
+TRACE_EVENT(kvm_s390_handle_lctl,
+           TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr),
+           TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(int, g)
+                   __field(int, reg1)
+                   __field(int, reg3)
+                   __field(u64, addr)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->g = g;
+                   __entry->reg1 = reg1;
+                   __entry->reg3 = reg3;
+                   __entry->addr = addr;
+                   ),
+
+           VCPU_TP_PRINTK("%s: loading cr %x-%x from %016llx",
+                          __entry->g ? "lctlg" : "lctl",
+                          __entry->reg1, __entry->reg3, __entry->addr)
+       );
+
+TRACE_EVENT(kvm_s390_handle_prefix,
+           TP_PROTO(VCPU_PROTO_COMMON, int set, u32 address),
+           TP_ARGS(VCPU_ARGS_COMMON, set, address),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(int, set)
+                   __field(u32, address)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->set = set;
+                   __entry->address = address;
+                   ),
+
+           VCPU_TP_PRINTK("%s prefix to %08x",
+                          __entry->set ? "setting" : "storing",
+                          __entry->address)
+       );
+
+TRACE_EVENT(kvm_s390_handle_stap,
+           TP_PROTO(VCPU_PROTO_COMMON, u64 address),
+           TP_ARGS(VCPU_ARGS_COMMON, address),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(u64, address)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->address = address;
+                   ),
+
+           VCPU_TP_PRINTK("storing cpu address to %016llx",
+                          __entry->address)
+       );
+
+TRACE_EVENT(kvm_s390_handle_stfl,
+           TP_PROTO(VCPU_PROTO_COMMON, unsigned int facility_list),
+           TP_ARGS(VCPU_ARGS_COMMON, facility_list),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(unsigned int, facility_list)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->facility_list = facility_list;
+                   ),
+
+           VCPU_TP_PRINTK("store facility list value %08x",
+                          __entry->facility_list)
+       );
+
+TRACE_EVENT(kvm_s390_handle_stsi,
+           TP_PROTO(VCPU_PROTO_COMMON, int fc, int sel1, int sel2, u64 addr),
+           TP_ARGS(VCPU_ARGS_COMMON, fc, sel1, sel2, addr),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(int, fc)
+                   __field(int, sel1)
+                   __field(int, sel2)
+                   __field(u64, addr)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->fc = fc;
+                   __entry->sel1 = sel1;
+                   __entry->sel2 = sel2;
+                   __entry->addr = addr;
+                   ),
+
+           VCPU_TP_PRINTK("STSI %d.%d.%d information stored to %016llx",
+                          __entry->fc, __entry->sel1, __entry->sel2,
+                          __entry->addr)
+       );
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
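
The sigp_order_codes and diagnose_codes tables above feed __print_symbolic(), which resolves the raw code captured at trace time into a readable label when the trace buffer is formatted. A minimal userspace sketch of that table lookup (the order-code values below are placeholders, not the asm/sigp.h definitions):

    #include <stdio.h>

    struct sym { unsigned int code; const char *name; };

    /* Placeholder subset of the sigp_order_codes table. */
    static const struct sym orders[] = {
            {0x05, "stop"},
            {0x09, "stop and store status"},
            {0x0d, "set prefix"},
    };

    /* What __print_symbolic() does for one entry list. */
    static const char *symbolic(unsigned int code)
    {
            unsigned int i;

            for (i = 0; i < sizeof(orders) / sizeof(orders[0]); i++)
                    if (orders[i].code == code)
                            return orders[i].name;
            return "unknown";
    }

    int main(void)
    {
            /* Mirrors the "handle sigp order %02x (%s)" format above. */
            printf("handle sigp order %02x (%s)\n", 0x09, symbolic(0x09));
            return 0;
    }
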
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7f9a395c52548e021253dbe78511427700f05fb2..b72777ff32a9439ad401779b3bdce873f0d0b744 100644 (file)
@@ -586,23 +586,18 @@ config PARAVIRT_TIME_ACCOUNTING
 
 source "arch/x86/xen/Kconfig"
 
-config KVM_CLOCK
-       bool "KVM paravirtualized clock"
-       select PARAVIRT
-       select PARAVIRT_CLOCK
-       ---help---
-         Turning on this option will allow you to run a paravirtualized clock
-         when running over the KVM hypervisor. Instead of relying on a PIT
-         (or probably other) emulation by the underlying device model, the host
-         provides the guest with timing infrastructure such as time of day, and
-         system time
-
 config KVM_GUEST
-       bool "KVM Guest support"
+       bool "KVM Guest support (including kvmclock)"
+       select PARAVIRT
        select PARAVIRT
+       select PARAVIRT_CLOCK
+       default y if PARAVIRT_GUEST
        ---help---
          This option enables various optimizations for running under the KVM
-         hypervisor.
+         hypervisor. It includes a paravirtualized clock, so that instead
+         of relying on a PIT (or other timer) emulation by the
+         underlying device model, the host provides the guest with
+         timing infrastructure such as time of day and system time.
 
 source "arch/x86/lguest/Kconfig"
 
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 41e08cb6a0924bbbc9b92c4ea8a2de7b4af206ef..a65ec29e6ffb0e4cbc69bdbaf353f207f0495c26 100644 (file)
@@ -41,6 +41,7 @@
 #define __KVM_HAVE_DEBUGREGS
 #define __KVM_HAVE_XSAVE
 #define __KVM_HAVE_XCRS
+#define __KVM_HAVE_READONLY_MEM
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index c764f43b71c5c851f46be710023d4945f1df3c76..15f960c06ff7ca23de88f7d72c48c83dbb489810 100644 (file)
@@ -85,6 +85,19 @@ struct x86_instruction_info {
 #define X86EMUL_INTERCEPTED     6 /* Intercepted by nested VMCB/VMCS */
 
 struct x86_emulate_ops {
+       /*
+        * read_gpr: read a general purpose register (rax - r15)
+        *
+        * @reg: gpr number.
+        */
+       ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg);
+       /*
+        * write_gpr: write a general purpose register (rax - r15)
+        *
+        * @reg: gpr number.
+        * @val: value to write.
+        */
+       void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val);
        /*
         * read_std: Read bytes of standard (non-emulated/special) memory.
         *           Used for descriptor reading.
@@ -200,8 +213,9 @@ typedef u32 __attribute__((vector_size(16))) sse128_t;
 
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
-       enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
+       enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
        unsigned int bytes;
+       unsigned int count;
        union {
                unsigned long orig_val;
                u64 orig_val64;
@@ -221,6 +235,7 @@ struct operand {
                char valptr[sizeof(unsigned long) + 2];
                sse128_t vec_val;
                u64 mm_val;
+               void *data;
        };
 };
 
@@ -236,14 +251,23 @@ struct read_cache {
        unsigned long end;
 };
 
+/* Execution mode, passed to the emulator. */
+enum x86emul_mode {
+       X86EMUL_MODE_REAL,      /* Real mode.             */
+       X86EMUL_MODE_VM86,      /* Virtual 8086 mode.     */
+       X86EMUL_MODE_PROT16,    /* 16-bit protected mode. */
+       X86EMUL_MODE_PROT32,    /* 32-bit protected mode. */
+       X86EMUL_MODE_PROT64,    /* 64-bit (long) mode.    */
+};
+
 struct x86_emulate_ctxt {
-       struct x86_emulate_ops *ops;
+       const struct x86_emulate_ops *ops;
 
        /* Register state before/after emulation. */
        unsigned long eflags;
        unsigned long eip; /* eip before instruction emulation */
        /* Emulated execution mode, represented by an X86EMUL_MODE value. */
-       int mode;
+       enum x86emul_mode mode;
 
        /* interruptibility state, as a result of execution of STI or MOV SS */
        int interruptibility;
@@ -281,8 +305,10 @@ struct x86_emulate_ctxt {
        bool rip_relative;
        unsigned long _eip;
        struct operand memop;
+       u32 regs_valid;  /* bitmap of registers in _regs[] that can be read */
+       u32 regs_dirty;  /* bitmap of registers in _regs[] that have been written */
        /* Fields above regs are cleared together. */
-       unsigned long regs[NR_VCPU_REGS];
+       unsigned long _regs[NR_VCPU_REGS];
        struct operand *memopp;
        struct fetch_cache fetch;
        struct read_cache io_read;
@@ -293,17 +319,6 @@ struct x86_emulate_ctxt {
 #define REPE_PREFIX    0xf3
 #define REPNE_PREFIX   0xf2
 
-/* Execution mode, passed to the emulator. */
-#define X86EMUL_MODE_REAL     0        /* Real mode.             */
-#define X86EMUL_MODE_VM86     1        /* Virtual 8086 mode.     */
-#define X86EMUL_MODE_PROT16   2        /* 16-bit protected mode. */
-#define X86EMUL_MODE_PROT32   4        /* 32-bit protected mode. */
-#define X86EMUL_MODE_PROT64   8        /* 64-bit (long) mode.    */
-
-/* any protected mode   */
-#define X86EMUL_MODE_PROT     (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
-                              X86EMUL_MODE_PROT64)
-
 /* CPUID vendors */
 #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
 #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
@@ -394,4 +409,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
                         u16 tss_selector, int idt_index, int reason,
                         bool has_error_code, u32 error_code);
 int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
+
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1eaa6b056670d19d965bbf46050e0a9266ddef7f..b2e11f4524354db111134fb28ef9dfd99f3edaee 100644 (file)
@@ -271,10 +271,24 @@ struct kvm_mmu {
        union kvm_mmu_page_role base_role;
        bool direct_map;
 
+       /*
+        * Bitmap; bit set = permission fault
+        * Byte index: page fault error code [4:1]
+        * Bit index: pte permissions in ACC_* format
+        */
+       u8 permissions[16];
+
        u64 *pae_root;
        u64 *lm_root;
        u64 rsvd_bits_mask[2][4];
 
+       /*
+        * Bitmap: bit set = last pte in walk
+        * index[0:1]: level (zero-based)
+        * index[2]: pte.ps
+        */
+       u8 last_pte_bitmap;
+
        bool nx;
 
        u64 pdptrs[4]; /* pae */
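
The permissions[] table replaces a chain of per-fault permission tests with one lookup: bits [4:1] of the page-fault error code select a byte, and the pte's ACC_* access value selects a bit in it. A sketch of the consuming check, modelled on the permission_fault() helper this series adds in mmu.h (the table contents here are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Bit set in the precomputed table means "permission fault". */
    static int permission_fault(const uint8_t *permissions,
                                unsigned int pfec, unsigned int pte_access)
    {
            return (permissions[pfec >> 1] >> pte_access) & 1;
    }

    int main(void)
    {
            uint8_t permissions[16] = { 0 };
            unsigned int pfec = 0x2;        /* write fault */
            unsigned int pte_access = 0x3;  /* illustrative ACC_* value */

            /* Pretend the setup pass decided this combination faults. */
            permissions[pfec >> 1] |= 1u << pte_access;

            printf("fault=%d\n",
                   permission_fault(permissions, pfec, pte_access));
            return 0;
    }

The same idea applies to last_pte_bitmap: the walker precomputes once per mmu mode, and the hot path becomes a single shift-and-mask.
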
@@ -398,12 +412,15 @@ struct kvm_vcpu_arch {
        struct x86_emulate_ctxt emulate_ctxt;
        bool emulate_regs_need_sync_to_vcpu;
        bool emulate_regs_need_sync_from_vcpu;
+       int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
 
        gpa_t time;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned int hw_tsc_khz;
        unsigned int time_offset;
        struct page *time_page;
+       /* set guest stopped flag in pvclock flags field */
+       bool pvclock_set_guest_stopped_request;
 
        struct {
                u64 msr_val;
@@ -438,6 +455,7 @@ struct kvm_vcpu_arch {
        unsigned long dr6;
        unsigned long dr7;
        unsigned long eff_db[KVM_NR_DB_REGS];
+       unsigned long guest_debug_dr7;
 
        u64 mcg_cap;
        u64 mcg_status;
@@ -484,14 +502,24 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_lpage_info {
-       unsigned long rmap_pde;
        int write_count;
 };
 
 struct kvm_arch_memory_slot {
+       unsigned long *rmap[KVM_NR_PAGE_SIZES];
        struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 };
 
+struct kvm_apic_map {
+       struct rcu_head rcu;
+       u8 ldr_bits;
+       /* fields below are used to decode ldr values in different modes */
+       u32 cid_shift, cid_mask, lid_mask;
+       struct kvm_lapic *phys_map[256];
+       /* first index is cluster id, second is cpu id within the cluster */
+       struct kvm_lapic *logical_map[16][16];
+};
+
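
kvm_apic_map lets interrupt delivery resolve destinations without iterating over every vcpu: a physical APIC ID indexes phys_map[] directly, while a logical ID is split by cid_shift/cid_mask into a cluster index and masked by lid_mask to a bitmap of cpus within that cluster. A sketch of the logical split (the 4/4 bit layout is an assumption in the style of flat cluster mode, not a statement about any particular map contents):

    #include <stdint.h>
    #include <stdio.h>

    struct map_demo { uint32_t cid_shift, cid_mask, lid_mask; };

    /* Index computation behind logical_map[cluster][cpu]. */
    static void decode_ldr(const struct map_demo *m, uint32_t ldr,
                           uint32_t *cluster, uint32_t *cpu_bits)
    {
            *cluster  = (ldr >> m->cid_shift) & m->cid_mask;
            *cpu_bits = ldr & m->lid_mask;
    }

    int main(void)
    {
            struct map_demo m = { .cid_shift = 4, .cid_mask = 0xf,
                                  .lid_mask = 0xf };
            uint32_t cluster, cpu_bits;

            decode_ldr(&m, 0x23, &cluster, &cpu_bits);
            printf("cluster %u, cpu bits %#x\n",
                   (unsigned int)cluster, (unsigned int)cpu_bits); /* 2, 0x3 */
            return 0;
    }
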
 struct kvm_arch {
        unsigned int n_used_mmu_pages;
        unsigned int n_requested_mmu_pages;
@@ -509,6 +537,8 @@ struct kvm_arch {
        struct kvm_ioapic *vioapic;
        struct kvm_pit *vpit;
        int vapics_in_nmi_mode;
+       struct mutex apic_map_lock;
+       struct kvm_apic_map *apic_map;
 
        unsigned int tss_addr;
        struct page *apic_access_page;
@@ -602,8 +632,7 @@ struct kvm_x86_ops {
        void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
        void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
-       void (*set_guest_debug)(struct kvm_vcpu *vcpu,
-                               struct kvm_guest_debug *dbg);
+       void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
        int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
        int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
        u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -941,6 +970,7 @@ extern bool kvm_rebooting;
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 2f7712e08b1e80ceb98b889ab1b3b19fb1706a11..eb3e9d85e1f1c75fc0db709cc40dd7a226d928cd 100644 (file)
@@ -102,21 +102,21 @@ struct kvm_vcpu_pv_apf_data {
 extern void kvmclock_init(void);
 extern int kvm_register_clock(char *txt);
 
-#ifdef CONFIG_KVM_CLOCK
+#ifdef CONFIG_KVM_GUEST
 bool kvm_check_and_clear_guest_paused(void);
 #else
 static inline bool kvm_check_and_clear_guest_paused(void)
 {
        return false;
 }
-#endif /* CONFIG_KVMCLOCK */
+#endif /* CONFIG_KVM_GUEST */
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
  * trap that we will then rewrite to the appropriate instruction.
  */
 #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
 
-/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
  * instruction.  The hypervisor may replace it with something else but only the
  * instructions are guaranteed to be supported.
  *
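
The three-byte KVM_HYPERCALL sequence is meant to be wrapped in inline asm, with the hypercall number in eax/rax and arguments in ebx, ecx, and so on. A sketch in the style of the kvm_hypercalln() wrappers in this header; treat the exact constraint list as an assumption, and note that it compiles anywhere but is only meaningful when executed inside a KVM guest:

    #define KVM_HYPERCALL_DEMO ".byte 0x0f,0x01,0xc1" /* vmcall */

    static inline long kvm_hypercall1_demo(unsigned int nr, unsigned long p1)
    {
            long ret;

            /* nr selects the hypercall, p1 is its first argument,
             * and the result comes back in eax/rax. */
            asm volatile(KVM_HYPERCALL_DEMO
                         : "=a"(ret)
                         : "a"(nr), "b"(p1)
                         : "memory");
            return ret;
    }
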
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8d7a619718b5fac1b245985cbc185c108c5a4a94..a48ea05157d3bbcb8cb56c4a2f92bb45cad446e8 100644 (file)
@@ -81,8 +81,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST)       += test_rodata.o
 obj-$(CONFIG_DEBUG_NX_TEST)    += test_nx.o
 obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
 
-obj-$(CONFIG_KVM_GUEST)                += kvm.o
-obj-$(CONFIG_KVM_CLOCK)                += kvmclock.o
+obj-$(CONFIG_KVM_GUEST)                += kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)         += paravirt.o paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)   += pvclock.o
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c1d61ee4b4f12acfe6094b5c81bf3566cb6df0b3..b3e5e51bc907ef4da71549ff7e2214467721af0a 100644 (file)
@@ -354,6 +354,7 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                wrmsrl(MSR_KVM_PV_EOI_EN, 0);
        kvm_pv_disable_apf();
+       kvm_disable_steal_time();
 }
 
 static int kvm_pv_reboot_notify(struct notifier_block *nb,
@@ -396,9 +397,7 @@ void kvm_disable_steal_time(void)
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
-#ifdef CONFIG_KVM_CLOCK
        WARN_ON(kvm_register_clock("primary cpu clock"));
-#endif
        kvm_guest_cpu_init();
        native_smp_prepare_boot_cpu();
 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4f165479c4537cee4badf71a4862826936d7a228..d609be046b5749991c01919561055072d717f970 100644 (file)
@@ -957,7 +957,7 @@ void __init setup_arch(char **cmdline_p)
        initmem_init();
        memblock_find_dma_reserve();
 
-#ifdef CONFIG_KVM_CLOCK
+#ifdef CONFIG_KVM_GUEST
        kvmclock_init();
 #endif
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a28f338843eaa083a046ea69dca78f6e3ff541fe..586f0005980510e9683ea6c1dbf436cec6e4d52f 100644 (file)
@@ -20,6 +20,7 @@ if VIRTUALIZATION
 config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
        depends on HAVE_KVM
+       depends on HIGH_RES_TIMERS
        # for device assignment:
        depends on PCI
        # for TASKSTATS/TASK_DELAY_ACCT:
@@ -37,6 +38,7 @@ config KVM
        select TASK_DELAY_ACCT
        select PERF_EVENTS
        select HAVE_KVM_MSI
+       select HAVE_KVM_CPU_RELAX_INTERCEPT
        ---help---
          Support hosting fully virtualized guest machines using hardware
          virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4f579e8dcacf6747a7e3a34db765bf112233680f..04d30401c5cb26aa2b491ad3b0ceeb8d9fa7e8ff 100644 (file)
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API)       += $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)     += $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-                          i8254.o timer.o cpuid.o pmu.o
+                          i8254.o cpuid.o pmu.o
 kvm-intel-y            += vmx.o
 kvm-amd-y              += svm.o
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0595f1397b7c0013058afec906e5d329f0d7f7d0..ec79e773342ea58cc28d614b7a1ca911f3677694 100644 (file)
@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        }
        case 7: {
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-               /* Mask ebx against host capbability word 9 */
+               /* Mask ebx against host capability word 9 */
                if (index == 0) {
                        entry->ebx &= kvm_supported_word9_x86_features;
                        cpuid_mask(&entry->ebx, 9);
@@ -397,8 +397,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                break;
        }
        case KVM_CPUID_SIGNATURE: {
-               char signature[12] = "KVMKVMKVM\0\0";
-               u32 *sigptr = (u32 *)signature;
+               static const char signature[12] = "KVMKVMKVM\0\0";
+               const u32 *sigptr = (const u32 *)signature;
                entry->eax = KVM_CPUID_FEATURES;
                entry->ebx = sigptr[0];
                entry->ecx = sigptr[1];
@@ -484,10 +484,10 @@ struct kvm_cpuid_param {
        u32 func;
        u32 idx;
        bool has_leaf_count;
-       bool (*qualifier)(struct kvm_cpuid_param *param);
+       bool (*qualifier)(const struct kvm_cpuid_param *param);
 };
 
-static bool is_centaur_cpu(struct kvm_cpuid_param *param)
+static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
 {
        return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
 }
@@ -498,7 +498,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
        struct kvm_cpuid_entry2 *cpuid_entries;
        int limit, nent = 0, r = -E2BIG, i;
        u32 func;
-       static struct kvm_cpuid_param param[] = {
+       static const struct kvm_cpuid_param param[] = {
                { .func = 0, .has_leaf_count = true },
                { .func = 0x80000000, .has_leaf_count = true },
                { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
@@ -517,7 +517,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 
        r = 0;
        for (i = 0; i < ARRAY_SIZE(param); i++) {
-               struct kvm_cpuid_param *ent = &param[i];
+               const struct kvm_cpuid_param *ent = &param[i];
 
                if (ent->qualifier && !ent->qualifier(ent))
                        continue;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a3b57a27be880649ac7b0ae4144bc62faf105cbc..39171cb307ea05d6687bfc083ddd87935cf4ca4d 100644 (file)
@@ -161,9 +161,9 @@ struct opcode {
        u64 intercept : 8;
        union {
                int (*execute)(struct x86_emulate_ctxt *ctxt);
-               struct opcode *group;
-               struct group_dual *gdual;
-               struct gprefix *gprefix;
+               const struct opcode *group;
+               const struct group_dual *gdual;
+               const struct gprefix *gprefix;
        } u;
        int (*check_perm)(struct x86_emulate_ctxt *ctxt);
 };
@@ -202,6 +202,42 @@ struct gprefix {
 #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
 #define EFLG_RESERVED_ONE_MASK 2
 
+static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+       if (!(ctxt->regs_valid & (1 << nr))) {
+               ctxt->regs_valid |= 1 << nr;
+               ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+       }
+       return ctxt->_regs[nr];
+}
+
+static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+       ctxt->regs_valid |= 1 << nr;
+       ctxt->regs_dirty |= 1 << nr;
+       return &ctxt->_regs[nr];
+}
+
+static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+       reg_read(ctxt, nr);
+       return reg_write(ctxt, nr);
+}
+
+static void writeback_registers(struct x86_emulate_ctxt *ctxt)
+{
+       unsigned reg;
+
+       for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16)
+               ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
+}
+
+static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
+{
+       ctxt->regs_dirty = 0;
+       ctxt->regs_valid = 0;
+}
+
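
reg_read(), reg_write() and reg_rmw() put a small read-through, write-back cache in front of the vcpu GPRs: regs_valid gates the lazy fill through ops->read_gpr(), and regs_dirty records what writeback_registers() must flush through ops->write_gpr(). A standalone sketch of the same pattern, with a plain array standing in for the vcpu:

    #include <stdio.h>

    #define NREGS 16

    static unsigned long vcpu_regs[NREGS];  /* stand-in for the vcpu state */
    static unsigned long cache[NREGS];
    static unsigned int regs_valid, regs_dirty;

    static unsigned long reg_read(unsigned int nr)
    {
            if (!(regs_valid & (1u << nr))) {       /* lazy fill on first use */
                    regs_valid |= 1u << nr;
                    cache[nr] = vcpu_regs[nr];
            }
            return cache[nr];
    }

    static unsigned long *reg_write(unsigned int nr)
    {
            regs_valid |= 1u << nr;
            regs_dirty |= 1u << nr;                 /* flush on writeback */
            return &cache[nr];
    }

    static void writeback_registers(void)
    {
            unsigned int nr;

            for (nr = 0; nr < NREGS; nr++)
                    if (regs_dirty & (1u << nr))
                            vcpu_regs[nr] = cache[nr];
            regs_valid = regs_dirty = 0;
    }

    int main(void)
    {
            unsigned long v;

            vcpu_regs[3] = 41;
            v = reg_read(3);                        /* fills the cache */
            *reg_write(3) = v + 1;                  /* reg_rmw() equivalent */
            writeback_registers();
            printf("%lu\n", vcpu_regs[3]);          /* 42 */
            return 0;
    }

Only registers actually touched by the emulated instruction cross the emulator/vcpu boundary, which is the point of the change.
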
 /*
  * Instruction emulation:
  * Most instructions are emulated directly via a fragment of inline assembly
@@ -374,8 +410,8 @@ struct gprefix {
 #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex)                 \
        do {                                                            \
                unsigned long _tmp;                                     \
-               ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX];              \
-               ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX];              \
+               ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX);            \
+               ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX);            \
                                                                        \
                __asm__ __volatile__ (                                  \
                        _PRE_EFLAGS("0", "5", "1")                      \
@@ -494,7 +530,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in
 
 static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
 {
-       masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc);
+       masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
 }
 
 static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -632,8 +668,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 
        la = seg_base(ctxt, addr.seg) + addr.ea;
        switch (ctxt->mode) {
-       case X86EMUL_MODE_REAL:
-               break;
        case X86EMUL_MODE_PROT64:
                if (((signed long)la << 16) >> 16 != la)
                        return emulate_gp(ctxt, 0);
@@ -655,7 +689,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
                        if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
                                goto bad;
                } else {
-                       /* exapand-down segment */
+                       /* expand-down segment */
                        if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
                                goto bad;
                        lim = desc.d ? 0xffffffff : 0xffff;
@@ -663,7 +697,10 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
                                goto bad;
                }
                cpl = ctxt->ops->cpl(ctxt);
-               rpl = sel & 3;
+               if (ctxt->mode == X86EMUL_MODE_REAL)
+                       rpl = 0;
+               else
+                       rpl = sel & 3;
                cpl = max(cpl, rpl);
                if (!(desc.type & 8)) {
                        /* data segment */
@@ -688,9 +725,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
        return X86EMUL_CONTINUE;
 bad:
        if (addr.seg == VCPU_SREG_SS)
-               return emulate_ss(ctxt, addr.seg);
+               return emulate_ss(ctxt, sel);
        else
-               return emulate_gp(ctxt, addr.seg);
+               return emulate_gp(ctxt, sel);
 }
 
 static int linearize(struct x86_emulate_ctxt *ctxt,
@@ -786,14 +823,15 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
  * pointer into the block that addresses the relevant register.
  * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
  */
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
+static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
                             int highbyte_regs)
 {
        void *p;
 
-       p = &regs[modrm_reg];
        if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
-               p = (unsigned char *)&regs[modrm_reg & 3] + 1;
+               p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
+       else
+               p = reg_rmw(ctxt, modrm_reg);
        return p;
 }
 
@@ -871,23 +909,23 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
 {
        ctxt->ops->get_fpu(ctxt);
        switch (reg) {
-       case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
-       case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
-       case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
-       case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
-       case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
-       case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
-       case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
-       case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
+       case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+       case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+       case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+       case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+       case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+       case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+       case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+       case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
 #ifdef CONFIG_X86_64
-       case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
-       case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
-       case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
-       case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
-       case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
-       case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
-       case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
-       case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
+       case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+       case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+       case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+       case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+       case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+       case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+       case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+       case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
 #endif
        default: BUG();
        }
@@ -899,23 +937,23 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 {
        ctxt->ops->get_fpu(ctxt);
        switch (reg) {
-       case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
-       case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
-       case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
-       case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
-       case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
-       case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
-       case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
-       case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
+       case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+       case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+       case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+       case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+       case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+       case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+       case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+       case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
 #ifdef CONFIG_X86_64
-       case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
-       case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
-       case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
-       case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
-       case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
-       case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
-       case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
-       case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
+       case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+       case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+       case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+       case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+       case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+       case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+       case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+       case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
 #endif
        default: BUG();
        }
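
movdqu is the unaligned-tolerant form; switching to movdqa is safe here because sse128_t is declared with vector_size(16), which GCC aligns to 16 bytes, and the aligned move avoids the unaligned-access penalty on older hardware. A quick check of that alignment assumption:

    #include <stdio.h>

    typedef unsigned int sse128_demo_t __attribute__((vector_size(16)));

    int main(void)
    {
            sse128_demo_t data = { 0 };

            /* 16-byte alignment means movdqa on &data can never take
             * an alignment fault. */
            printf("align=%zu addr=%p\n",
                   (size_t)__alignof__(sse128_demo_t), (void *)&data);
            return 0;
    }
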
@@ -982,10 +1020,10 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 
        op->type = OP_REG;
        if (ctxt->d & ByteOp) {
-               op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
+               op->addr.reg = decode_register(ctxt, reg, highbyte_regs);
                op->bytes = 1;
        } else {
-               op->addr.reg = decode_register(reg, ctxt->regs, 0);
+               op->addr.reg = decode_register(ctxt, reg, 0);
                op->bytes = ctxt->op_bytes;
        }
        fetch_register_operand(op);
@@ -1020,8 +1058,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
        if (ctxt->modrm_mod == 3) {
                op->type = OP_REG;
                op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
-               op->addr.reg = decode_register(ctxt->modrm_rm,
-                                              ctxt->regs, ctxt->d & ByteOp);
+               op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ctxt->d & ByteOp);
                if (ctxt->d & Sse) {
                        op->type = OP_XMM;
                        op->bytes = 16;
@@ -1042,10 +1079,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
        op->type = OP_MEM;
 
        if (ctxt->ad_bytes == 2) {
-               unsigned bx = ctxt->regs[VCPU_REGS_RBX];
-               unsigned bp = ctxt->regs[VCPU_REGS_RBP];
-               unsigned si = ctxt->regs[VCPU_REGS_RSI];
-               unsigned di = ctxt->regs[VCPU_REGS_RDI];
+               unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
+               unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
+               unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
+               unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
 
                /* 16-bit ModR/M decode. */
                switch (ctxt->modrm_mod) {
@@ -1102,17 +1139,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
                        if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
                                modrm_ea += insn_fetch(s32, ctxt);
                        else {
-                               modrm_ea += ctxt->regs[base_reg];
+                               modrm_ea += reg_read(ctxt, base_reg);
                                adjust_modrm_seg(ctxt, base_reg);
                        }
                        if (index_reg != 4)
-                               modrm_ea += ctxt->regs[index_reg] << scale;
+                               modrm_ea += reg_read(ctxt, index_reg) << scale;
                } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
                        if (ctxt->mode == X86EMUL_MODE_PROT64)
                                ctxt->rip_relative = 1;
                } else {
                        base_reg = ctxt->modrm_rm;
-                       modrm_ea += ctxt->regs[base_reg];
+                       modrm_ea += reg_read(ctxt, base_reg);
                        adjust_modrm_seg(ctxt, base_reg);
                }
                switch (ctxt->modrm_mod) {
@@ -1179,24 +1216,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
        int rc;
        struct read_cache *mc = &ctxt->mem_read;
 
-       while (size) {
-               int n = min(size, 8u);
-               size -= n;
-               if (mc->pos < mc->end)
-                       goto read_cached;
+       if (mc->pos < mc->end)
+               goto read_cached;
 
-               rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
-                                             &ctxt->exception);
-               if (rc != X86EMUL_CONTINUE)
-                       return rc;
-               mc->end += n;
+       WARN_ON((mc->end + size) >= sizeof(mc->data));
 
-       read_cached:
-               memcpy(dest, mc->data + mc->pos, n);
-               mc->pos += n;
-               dest += n;
-               addr += n;
-       }
+       rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+                                     &ctxt->exception);
+       if (rc != X86EMUL_CONTINUE)
+               return rc;
+
+       mc->end += size;
+
+read_cached:
+       memcpy(dest, mc->data + mc->pos, size);
+       mc->pos += size;
        return X86EMUL_CONTINUE;
 }
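
read_emulated() caches what it reads so that an instruction restarted after an exit to userspace replays the earlier value instead of re-reading (MMIO reads can have side effects); the rewrite performs one full-size read in place of the old 8-byte loop, with the WARN_ON guarding the fixed-size cache. A sketch of the replay behaviour, under the assumption that a restart simply rewinds the cache position:

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    struct read_cache_demo {
            unsigned char data[32];
            unsigned long pos, end;
    };

    /* First call fills the cache from "memory"; a replayed call
     * hits the cached path and returns the saved bytes. */
    static void read_emulated_demo(struct read_cache_demo *mc,
                                   const unsigned char *mem,
                                   unsigned long addr, void *dest,
                                   unsigned int size)
    {
            if (mc->pos >= mc->end) {               /* miss: do the real read */
                    assert(mc->end + size < sizeof(mc->data));
                    memcpy(mc->data + mc->end, mem + addr, size);
                    mc->end += size;
            }
            memcpy(dest, mc->data + mc->pos, size); /* read_cached */
            mc->pos += size;
    }

    int main(void)
    {
            const unsigned char mem[8] = { 1, 2, 3, 4 };
            struct read_cache_demo mc = { .pos = 0 };
            unsigned int v = 0;

            read_emulated_demo(&mc, mem, 0, &v, 4); /* fill */
            mc.pos = 0;                             /* restart the insn */
            read_emulated_demo(&mc, mem, 0, &v, 4); /* replayed from cache */
            printf("%u\n", v);
            return 0;
    }
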
 
@@ -1253,10 +1287,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
        if (rc->pos == rc->end) { /* refill pio read ahead */
                unsigned int in_page, n;
                unsigned int count = ctxt->rep_prefix ?
-                       address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
+                       address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
                in_page = (ctxt->eflags & EFLG_DF) ?
-                       offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
-                       PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
+                       offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
+                       PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
                n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
                        count);
                if (n == 0)
@@ -1267,8 +1301,15 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
                rc->end = n * size;
        }
 
-       memcpy(dest, rc->data + rc->pos, size);
-       rc->pos += size;
+       if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
+               ctxt->dst.data = rc->data + rc->pos;
+               ctxt->dst.type = OP_MEM_STR;
+               ctxt->dst.count = (rc->end - rc->pos) / size;
+               rc->pos = rc->end;
+       } else {
+               memcpy(dest, rc->data + rc->pos, size);
+               rc->pos += size;
+       }
        return 1;
 }
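
With a rep prefix and the direction flag clear, the read-ahead buffer is now handed to writeback as a single OP_MEM_STR operand of count elements, so one segmented_write() replaces per-element copies; string_addr_inc(), changed later in this patch, advances RDI by count * bytes in one step to match. The arithmetic, sketched:

    #include <stdio.h>

    int main(void)
    {
            unsigned int pos = 0, end = 24;  /* filled read-ahead buffer */
            unsigned int size = 4;           /* bytes per "ins" element */
            unsigned int count = (end - pos) / size;
            int df = 0;                      /* EFLG_DF clear */

            /* One writeback of size * count bytes, one RDI adjustment. */
            printf("flush %u bytes, rdi += %d\n", size * count,
                   (df ? -(int)count : (int)count) * (int)size);
            return 0;
    }
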
 
@@ -1291,7 +1332,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
                                     u16 selector, struct desc_ptr *dt)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
 
        if (selector & 1 << 2) {
                struct desc_struct desc;
@@ -1355,19 +1396,15 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
        bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
        ulong desc_addr;
        int ret;
+       u16 dummy;
 
        memset(&seg_desc, 0, sizeof seg_desc);
 
        if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
            || ctxt->mode == X86EMUL_MODE_REAL) {
                /* set real mode segment descriptor */
+               ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
                set_desc_base(&seg_desc, selector << 4);
-               set_desc_limit(&seg_desc, 0xffff);
-               seg_desc.type = 3;
-               seg_desc.p = 1;
-               seg_desc.s = 1;
-               if (ctxt->mode == X86EMUL_MODE_VM86)
-                       seg_desc.dpl = 3;
                goto load;
        }
 
@@ -1396,7 +1433,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
        err_code = selector & 0xfffc;
        err_vec = GP_VECTOR;
 
-       /* can't load system descriptor into segment selecor */
+       /* can't load system descriptor into segment selector */
        if (seg <= VCPU_SREG_GS && !seg_desc.s)
                goto exception;
 
@@ -1516,6 +1553,14 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                break;
+       case OP_MEM_STR:
+               rc = segmented_write(ctxt,
+                               ctxt->dst.addr.mem,
+                               ctxt->dst.data,
+                               ctxt->dst.bytes * ctxt->dst.count);
+               if (rc != X86EMUL_CONTINUE)
+                       return rc;
+               break;
        case OP_XMM:
                write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
                break;
@@ -1536,7 +1581,7 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
        struct segmented_address addr;
 
        rsp_increment(ctxt, -bytes);
-       addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
+       addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
        addr.seg = VCPU_SREG_SS;
 
        return segmented_write(ctxt, addr, data, bytes);
@@ -1555,7 +1600,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
        int rc;
        struct segmented_address addr;
 
-       addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
+       addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
        addr.seg = VCPU_SREG_SS;
        rc = segmented_read(ctxt, addr, dest, len);
        if (rc != X86EMUL_CONTINUE)
@@ -1623,26 +1668,28 @@ static int em_enter(struct x86_emulate_ctxt *ctxt)
        int rc;
        unsigned frame_size = ctxt->src.val;
        unsigned nesting_level = ctxt->src2.val & 31;
+       ulong rbp;
 
        if (nesting_level)
                return X86EMUL_UNHANDLEABLE;
 
-       rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
+       rbp = reg_read(ctxt, VCPU_REGS_RBP);
+       rc = push(ctxt, &rbp, stack_size(ctxt));
        if (rc != X86EMUL_CONTINUE)
                return rc;
-       assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
+       assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
                      stack_mask(ctxt));
-       assign_masked(&ctxt->regs[VCPU_REGS_RSP],
-                     ctxt->regs[VCPU_REGS_RSP] - frame_size,
+       assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
+                     reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
                      stack_mask(ctxt));
        return X86EMUL_CONTINUE;
 }
 
 static int em_leave(struct x86_emulate_ctxt *ctxt)
 {
-       assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
+       assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
                      stack_mask(ctxt));
-       return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
+       return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
 }
 
 static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
@@ -1670,13 +1717,13 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
 
 static int em_pusha(struct x86_emulate_ctxt *ctxt)
 {
-       unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
+       unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
        int rc = X86EMUL_CONTINUE;
        int reg = VCPU_REGS_RAX;
 
        while (reg <= VCPU_REGS_RDI) {
                (reg == VCPU_REGS_RSP) ?
-               (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
+               (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg));
 
                rc = em_push(ctxt);
                if (rc != X86EMUL_CONTINUE)
@@ -1705,7 +1752,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
                        --reg;
                }
 
-               rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
+               rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes);
                if (rc != X86EMUL_CONTINUE)
                        break;
                --reg;
@@ -1713,9 +1760,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
        return rc;
 }
 
-int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        int rc;
        struct desc_ptr dt;
        gva_t cs_addr;
@@ -1762,11 +1809,22 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
        return rc;
 }
 
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+       int rc;
+
+       invalidate_registers(ctxt);
+       rc = __emulate_int_real(ctxt, irq);
+       if (rc == X86EMUL_CONTINUE)
+               writeback_registers(ctxt);
+       return rc;
+}
+
 static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
 {
        switch(ctxt->mode) {
        case X86EMUL_MODE_REAL:
-               return emulate_int_real(ctxt, irq);
+               return __emulate_int_real(ctxt, irq);
        case X86EMUL_MODE_VM86:
        case X86EMUL_MODE_PROT16:
        case X86EMUL_MODE_PROT32:
@@ -1973,14 +2031,14 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 {
        u64 old = ctxt->dst.orig_val64;
 
-       if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
-           ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
-               ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-               ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+       if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+           ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+               *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
+               *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
                ctxt->eflags &= ~EFLG_ZF;
        } else {
-               ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
-                       (u32) ctxt->regs[VCPU_REGS_RBX];
+               ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
+                       (u32) reg_read(ctxt, VCPU_REGS_RBX);
 
                ctxt->eflags |= EFLG_ZF;
        }
@@ -2016,7 +2074,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
 {
        /* Save real source value, then compare EAX against destination. */
        ctxt->src.orig_val = ctxt->src.val;
-       ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+       ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
        emulate_2op_SrcV(ctxt, "cmp");
 
        if (ctxt->eflags & EFLG_ZF) {
@@ -2025,7 +2083,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
        } else {
                /* Failure: write the value we saw to EAX. */
                ctxt->dst.type = OP_REG;
-               ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
+               ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
        }
        return X86EMUL_CONTINUE;
 }
@@ -2050,12 +2108,6 @@ static void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
                        struct desc_struct *cs, struct desc_struct *ss)
 {
-       u16 selector;
-
-       memset(cs, 0, sizeof(struct desc_struct));
-       ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
-       memset(ss, 0, sizeof(struct desc_struct));
-
        cs->l = 0;              /* will be adjusted later */
        set_desc_base(cs, 0);   /* flat segment */
        cs->g = 1;              /* 4kb granularity */
@@ -2065,6 +2117,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
        cs->dpl = 0;            /* will be adjusted later */
        cs->p = 1;
        cs->d = 1;
+       cs->avl = 0;
 
        set_desc_base(ss, 0);   /* flat segment */
        set_desc_limit(ss, 0xfffff);    /* 4GB limit */
@@ -2074,6 +2127,8 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
        ss->d = 1;              /* 32bit stack segment */
        ss->dpl = 0;
        ss->p = 1;
+       ss->l = 0;
+       ss->avl = 0;
 }
 
 static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
@@ -2089,7 +2144,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
 
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        u32 eax, ebx, ecx, edx;
 
        /*
@@ -2133,7 +2188,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 
 static int em_syscall(struct x86_emulate_ctxt *ctxt)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct cs, ss;
        u64 msr_data;
        u16 cs_sel, ss_sel;
@@ -2165,10 +2220,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
        ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
        ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-       ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
+       *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
        if (efer & EFER_LMA) {
 #ifdef CONFIG_X86_64
-               ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+               *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF;
 
                ops->get_msr(ctxt,
                             ctxt->mode == X86EMUL_MODE_PROT64 ?
@@ -2191,7 +2246,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 
 static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct cs, ss;
        u64 msr_data;
        u16 cs_sel, ss_sel;
@@ -2228,6 +2283,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
                if (msr_data == 0x0)
                        return emulate_gp(ctxt, 0);
                break;
+       default:
+               break;
        }
 
        ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -2247,14 +2304,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
        ctxt->_eip = msr_data;
 
        ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
-       ctxt->regs[VCPU_REGS_RSP] = msr_data;
+       *reg_write(ctxt, VCPU_REGS_RSP) = msr_data;
 
        return X86EMUL_CONTINUE;
 }
 
 static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct cs, ss;
        u64 msr_data;
        int usermode;
@@ -2297,8 +2354,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
        ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
        ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-       ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
-       ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
+       ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX);
+       *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX);
 
        return X86EMUL_CONTINUE;
 }
@@ -2317,7 +2374,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
 static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
                                            u16 port, u16 len)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct tr_seg;
        u32 base3;
        int r;
@@ -2367,14 +2424,14 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 {
        tss->ip = ctxt->_eip;
        tss->flag = ctxt->eflags;
-       tss->ax = ctxt->regs[VCPU_REGS_RAX];
-       tss->cx = ctxt->regs[VCPU_REGS_RCX];
-       tss->dx = ctxt->regs[VCPU_REGS_RDX];
-       tss->bx = ctxt->regs[VCPU_REGS_RBX];
-       tss->sp = ctxt->regs[VCPU_REGS_RSP];
-       tss->bp = ctxt->regs[VCPU_REGS_RBP];
-       tss->si = ctxt->regs[VCPU_REGS_RSI];
-       tss->di = ctxt->regs[VCPU_REGS_RDI];
+       tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
+       tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
+       tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
+       tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
+       tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
+       tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
+       tss->si = reg_read(ctxt, VCPU_REGS_RSI);
+       tss->di = reg_read(ctxt, VCPU_REGS_RDI);
 
        tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
        tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2390,14 +2447,14 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 
        ctxt->_eip = tss->ip;
        ctxt->eflags = tss->flag | 2;
-       ctxt->regs[VCPU_REGS_RAX] = tss->ax;
-       ctxt->regs[VCPU_REGS_RCX] = tss->cx;
-       ctxt->regs[VCPU_REGS_RDX] = tss->dx;
-       ctxt->regs[VCPU_REGS_RBX] = tss->bx;
-       ctxt->regs[VCPU_REGS_RSP] = tss->sp;
-       ctxt->regs[VCPU_REGS_RBP] = tss->bp;
-       ctxt->regs[VCPU_REGS_RSI] = tss->si;
-       ctxt->regs[VCPU_REGS_RDI] = tss->di;
+       *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
+       *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
+       *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
+       *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
+       *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
+       *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
+       *reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
+       *reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
 
        /*
         * SDM says that segment selectors are loaded before segment
@@ -2410,7 +2467,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
        set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
 
        /*
-        * Now load segment descriptors. If fault happenes at this stage
+        * Now load segment descriptors. If a fault happens at this stage
         * it is handled in a context of new task
         */
        ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
@@ -2436,7 +2493,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
                          u16 tss_selector, u16 old_tss_sel,
                          ulong old_tss_base, struct desc_struct *new_desc)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        struct tss_segment_16 tss_seg;
        int ret;
        u32 new_tss_base = get_desc_base(new_desc);
@@ -2482,14 +2539,14 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
        tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
        tss->eip = ctxt->_eip;
        tss->eflags = ctxt->eflags;
-       tss->eax = ctxt->regs[VCPU_REGS_RAX];
-       tss->ecx = ctxt->regs[VCPU_REGS_RCX];
-       tss->edx = ctxt->regs[VCPU_REGS_RDX];
-       tss->ebx = ctxt->regs[VCPU_REGS_RBX];
-       tss->esp = ctxt->regs[VCPU_REGS_RSP];
-       tss->ebp = ctxt->regs[VCPU_REGS_RBP];
-       tss->esi = ctxt->regs[VCPU_REGS_RSI];
-       tss->edi = ctxt->regs[VCPU_REGS_RDI];
+       tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
+       tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
+       tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
+       tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
+       tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
+       tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
+       tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
+       tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
 
        tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
        tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2511,14 +2568,14 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
        ctxt->eflags = tss->eflags | 2;
 
        /* General purpose registers */
-       ctxt->regs[VCPU_REGS_RAX] = tss->eax;
-       ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
-       ctxt->regs[VCPU_REGS_RDX] = tss->edx;
-       ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
-       ctxt->regs[VCPU_REGS_RSP] = tss->esp;
-       ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
-       ctxt->regs[VCPU_REGS_RSI] = tss->esi;
-       ctxt->regs[VCPU_REGS_RDI] = tss->edi;
+       *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
+       *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
+       *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
+       *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
+       *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
+       *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
+       *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
+       *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
 
        /*
         * SDM says that segment selectors are loaded before segment
@@ -2583,7 +2640,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
                          u16 tss_selector, u16 old_tss_sel,
                          ulong old_tss_base, struct desc_struct *new_desc)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        struct tss_segment_32 tss_seg;
        int ret;
        u32 new_tss_base = get_desc_base(new_desc);
@@ -2627,7 +2684,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
                                   u16 tss_selector, int idt_index, int reason,
                                   bool has_error_code, u32 error_code)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct curr_tss_desc, next_tss_desc;
        int ret;
        u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2652,7 +2709,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
         *
         * 1. jmp/call/int to task gate: Check against DPL of the task gate
         * 2. Exception/IRQ/iret: No check is performed
-        * 3. jmp/call to TSS: Check agains DPL of the TSS
+        * 3. jmp/call to TSS: Check against DPL of the TSS
         */
        if (reason == TASK_SWITCH_GATE) {
                if (idt_index != -1) {
@@ -2693,7 +2750,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
                ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
 
        /* set back link to prev task only if NT bit is set in eflags
-          note that old_tss_sel is not used afetr this point */
+          note that old_tss_sel is not used after this point */
        if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
                old_tss_sel = 0xffff;
 
@@ -2733,26 +2790,28 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 {
        int rc;
 
+       invalidate_registers(ctxt);
        ctxt->_eip = ctxt->eip;
        ctxt->dst.type = OP_NONE;
 
        rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
                                     has_error_code, error_code);
 
-       if (rc == X86EMUL_CONTINUE)
+       if (rc == X86EMUL_CONTINUE) {
                ctxt->eip = ctxt->_eip;
+               writeback_registers(ctxt);
+       }
 
        return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
-                           int reg, struct operand *op)
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
+               struct operand *op)
 {
-       int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+       int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
 
-       register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
-       op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
-       op->addr.mem.seg = seg;
+       register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes);
+       op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg));
 }
 
 static int em_das(struct x86_emulate_ctxt *ctxt)
@@ -2927,7 +2986,7 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
 {
        ctxt->dst.type = OP_REG;
        ctxt->dst.bytes = ctxt->src.bytes;
-       ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+       ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
        ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
 
        return X86EMUL_CONTINUE;
@@ -2938,8 +2997,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
        u64 tsc = 0;
 
        ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
-       ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
-       ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
+       *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
+       *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
        return X86EMUL_CONTINUE;
 }
 
@@ -2947,10 +3006,10 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
 {
        u64 pmc;
 
-       if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc))
+       if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
                return emulate_gp(ctxt, 0);
-       ctxt->regs[VCPU_REGS_RAX] = (u32)pmc;
-       ctxt->regs[VCPU_REGS_RDX] = pmc >> 32;
+       *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
+       *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
        return X86EMUL_CONTINUE;
 }
 
@@ -2992,9 +3051,9 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
 {
        u64 msr_data;
 
-       msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
-               | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
-       if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data))
+       msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
+               | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
+       if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
                return emulate_gp(ctxt, 0);
 
        return X86EMUL_CONTINUE;
@@ -3004,11 +3063,11 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
 {
        u64 msr_data;
 
-       if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data))
+       if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
                return emulate_gp(ctxt, 0);
 
-       ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
-       ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
+       *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+       *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
        return X86EMUL_CONTINUE;
 }
 
@@ -3188,8 +3247,8 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt)
 
 static int em_loop(struct x86_emulate_ctxt *ctxt)
 {
-       register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
-       if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
+       register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
+       if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
            (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
                jmp_rel(ctxt, ctxt->src.val);
 
@@ -3198,7 +3257,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt)
 
 static int em_jcxz(struct x86_emulate_ctxt *ctxt)
 {
-       if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
+       if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
                jmp_rel(ctxt, ctxt->src.val);
 
        return X86EMUL_CONTINUE;
@@ -3286,20 +3345,20 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
 {
        u32 eax, ebx, ecx, edx;
 
-       eax = ctxt->regs[VCPU_REGS_RAX];
-       ecx = ctxt->regs[VCPU_REGS_RCX];
+       eax = reg_read(ctxt, VCPU_REGS_RAX);
+       ecx = reg_read(ctxt, VCPU_REGS_RCX);
        ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
-       ctxt->regs[VCPU_REGS_RAX] = eax;
-       ctxt->regs[VCPU_REGS_RBX] = ebx;
-       ctxt->regs[VCPU_REGS_RCX] = ecx;
-       ctxt->regs[VCPU_REGS_RDX] = edx;
+       *reg_write(ctxt, VCPU_REGS_RAX) = eax;
+       *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
+       *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
+       *reg_write(ctxt, VCPU_REGS_RDX) = edx;
        return X86EMUL_CONTINUE;
 }
 
 static int em_lahf(struct x86_emulate_ctxt *ctxt)
 {
-       ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
-       ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
+       *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
+       *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
        return X86EMUL_CONTINUE;
 }
 
@@ -3456,7 +3515,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
 
 static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
 {
-       u64 rax = ctxt->regs[VCPU_REGS_RAX];
+       u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
 
        /* Valid physical address? */
        if (rax & 0xffff000000000000ULL)
@@ -3478,7 +3537,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
 static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
 {
        u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
-       u64 rcx = ctxt->regs[VCPU_REGS_RCX];
+       u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
 
        if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
            (rcx > 3))
@@ -3531,13 +3590,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
                I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e),     \
                I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
 
-static struct opcode group7_rm1[] = {
+static const struct opcode group7_rm1[] = {
        DI(SrcNone | Priv, monitor),
        DI(SrcNone | Priv, mwait),
        N, N, N, N, N, N,
 };
 
-static struct opcode group7_rm3[] = {
+static const struct opcode group7_rm3[] = {
        DIP(SrcNone | Prot | Priv,              vmrun,          check_svme_pa),
        II(SrcNone  | Prot | VendorSpecific,    em_vmmcall,     vmmcall),
        DIP(SrcNone | Prot | Priv,              vmload,         check_svme_pa),
@@ -3548,13 +3607,13 @@ static struct opcode group7_rm3[] = {
        DIP(SrcNone | Prot | Priv,              invlpga,        check_svme),
 };
 
-static struct opcode group7_rm7[] = {
+static const struct opcode group7_rm7[] = {
        N,
        DIP(SrcNone, rdtscp, check_rdtsc),
        N, N, N, N, N, N,
 };
 
-static struct opcode group1[] = {
+static const struct opcode group1[] = {
        I(Lock, em_add),
        I(Lock | PageTable, em_or),
        I(Lock, em_adc),
@@ -3565,11 +3624,11 @@ static struct opcode group1[] = {
        I(0, em_cmp),
 };
 
-static struct opcode group1A[] = {
+static const struct opcode group1A[] = {
        I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
 };
 
-static struct opcode group3[] = {
+static const struct opcode group3[] = {
        I(DstMem | SrcImm, em_test),
        I(DstMem | SrcImm, em_test),
        I(DstMem | SrcNone | Lock, em_not),
@@ -3580,13 +3639,13 @@ static struct opcode group3[] = {
        I(SrcMem, em_idiv_ex),
 };
 
-static struct opcode group4[] = {
+static const struct opcode group4[] = {
        I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
        I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
        N, N, N, N, N, N,
 };
 
-static struct opcode group5[] = {
+static const struct opcode group5[] = {
        I(DstMem | SrcNone | Lock,              em_grp45),
        I(DstMem | SrcNone | Lock,              em_grp45),
        I(SrcMem | Stack,                       em_grp45),
@@ -3596,7 +3655,7 @@ static struct opcode group5[] = {
        I(SrcMem | Stack,                       em_grp45), N,
 };
 
-static struct opcode group6[] = {
+static const struct opcode group6[] = {
        DI(Prot,        sldt),
        DI(Prot,        str),
        II(Prot | Priv | SrcMem16, em_lldt, lldt),
@@ -3604,7 +3663,7 @@ static struct opcode group6[] = {
        N, N, N, N,
 };
 
-static struct group_dual group7 = { {
+static const struct group_dual group7 = { {
        II(Mov | DstMem | Priv,                 em_sgdt, sgdt),
        II(Mov | DstMem | Priv,                 em_sidt, sidt),
        II(SrcMem | Priv,                       em_lgdt, lgdt),
@@ -3621,7 +3680,7 @@ static struct group_dual group7 = { {
        EXT(0, group7_rm7),
 } };
 
-static struct opcode group8[] = {
+static const struct opcode group8[] = {
        N, N, N, N,
        I(DstMem | SrcImmByte,                          em_bt),
        I(DstMem | SrcImmByte | Lock | PageTable,       em_bts),
@@ -3629,26 +3688,26 @@ static struct opcode group8[] = {
        I(DstMem | SrcImmByte | Lock | PageTable,       em_btc),
 };
 
-static struct group_dual group9 = { {
+static const struct group_dual group9 = { {
        N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
 }, {
        N, N, N, N, N, N, N, N,
 } };
 
-static struct opcode group11[] = {
+static const struct opcode group11[] = {
        I(DstMem | SrcImm | Mov | PageTable, em_mov),
        X7(D(Undefined)),
 };
 
-static struct gprefix pfx_0f_6f_0f_7f = {
+static const struct gprefix pfx_0f_6f_0f_7f = {
        I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
 };
 
-static struct gprefix pfx_vmovntpx = {
+static const struct gprefix pfx_vmovntpx = {
        I(0, em_mov), N, N, N,
 };
 
-static struct opcode opcode_table[256] = {
+static const struct opcode opcode_table[256] = {
        /* 0x00 - 0x07 */
        I6ALU(Lock, em_add),
        I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
@@ -3689,7 +3748,7 @@ static struct opcode opcode_table[256] = {
        I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
        I(SrcImmByte | Mov | Stack, em_push),
        I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
-       I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
+       I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
        I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
        /* 0x70 - 0x7F */
        X16(D(SrcImmByte)),
@@ -3765,7 +3824,7 @@ static struct opcode opcode_table[256] = {
        D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
 };
 
-static struct opcode twobyte_table[256] = {
+static const struct opcode twobyte_table[256] = {
        /* 0x00 - 0x0F */
        G(0, group6), GD(0, &group7), N, N,
        N, I(ImplicitOps | VendorSpecific, em_syscall),
@@ -3936,7 +3995,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
        case OpAcc:
                op->type = OP_REG;
                op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
-               op->addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+               op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
                fetch_register_operand(op);
                op->orig_val = op->val;
                break;
@@ -3944,19 +4003,20 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
                op->type = OP_MEM;
                op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
                op->addr.mem.ea =
-                       register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
+                       register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI));
                op->addr.mem.seg = VCPU_SREG_ES;
                op->val = 0;
+               op->count = 1;
                break;
        case OpDX:
                op->type = OP_REG;
                op->bytes = 2;
-               op->addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+               op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
                fetch_register_operand(op);
                break;
        case OpCL:
                op->bytes = 1;
-               op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
+               op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
                break;
        case OpImmByte:
                rc = decode_imm(ctxt, op, 1, true);
@@ -3987,9 +4047,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
                op->type = OP_MEM;
                op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
                op->addr.mem.ea =
-                       register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
+                       register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
                op->addr.mem.seg = seg_override(ctxt);
                op->val = 0;
+               op->count = 1;
                break;
        case OpImmFAddr:
                op->type = OP_IMM;
@@ -4293,9 +4354,10 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
                read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
 }
 
+
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
+       const struct x86_emulate_ops *ops = ctxt->ops;
        int rc = X86EMUL_CONTINUE;
        int saved_dst_type = ctxt->dst.type;
 
@@ -4356,7 +4418,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
        }
 
        /* Instruction can only be executed in protected mode */
-       if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
+       if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
                rc = emulate_ud(ctxt);
                goto done;
        }
@@ -4377,7 +4439,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
        if (ctxt->rep_prefix && (ctxt->d & String)) {
                /* All REP prefixes have the same first termination condition */
-               if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
+               if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
                        ctxt->eip = ctxt->_eip;
                        goto done;
                }
@@ -4450,7 +4512,7 @@ special_insn:
                ctxt->dst.val = ctxt->src.addr.mem.ea;
                break;
        case 0x90 ... 0x97: /* nop / xchg reg, rax */
-               if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
+               if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
                        break;
                rc = em_xchg(ctxt);
                break;
@@ -4478,7 +4540,7 @@ special_insn:
                rc = em_grp2(ctxt);
                break;
        case 0xd2 ... 0xd3:     /* Grp2 */
-               ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
+               ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
                rc = em_grp2(ctxt);
                break;
        case 0xe9: /* jmp rel */
@@ -4524,23 +4586,27 @@ writeback:
        ctxt->dst.type = saved_dst_type;
 
        if ((ctxt->d & SrcMask) == SrcSI)
-               string_addr_inc(ctxt, seg_override(ctxt),
-                               VCPU_REGS_RSI, &ctxt->src);
+               string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
 
        if ((ctxt->d & DstMask) == DstDI)
-               string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
-                               &ctxt->dst);
+               string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
 
        if (ctxt->rep_prefix && (ctxt->d & String)) {
+               unsigned int count;
                struct read_cache *r = &ctxt->io_read;
-               register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
+               if ((ctxt->d & SrcMask) == SrcSI)
+                       count = ctxt->src.count;
+               else
+                       count = ctxt->dst.count;
+               register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX),
+                               -count);
 
                if (!string_insn_completed(ctxt)) {
                        /*
                         * Re-enter guest when pio read ahead buffer is empty
                         * or, if it is not used, after each 1024 iteration.
                         */
-                       if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
+                       if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
                            (r->end == 0 || r->end != r->pos)) {
                                /*
                                 * Reset read cache. Usually happens before
@@ -4548,6 +4614,7 @@ writeback:
                                 * we have to do it here.
                                 */
                                ctxt->mem_read.end = 0;
+                               writeback_registers(ctxt);
                                return EMULATION_RESTART;
                        }
                        goto done; /* skip rip writeback */
@@ -4562,6 +4629,9 @@ done:
        if (rc == X86EMUL_INTERCEPTED)
                return EMULATION_INTERCEPTED;
 
+       if (rc == X86EMUL_CONTINUE)
+               writeback_registers(ctxt);
+
        return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 
 twobyte_insn:
@@ -4634,3 +4704,13 @@ twobyte_insn:
 cannot_emulate:
        return EMULATION_FAILED;
 }
+
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+       invalidate_registers(ctxt);
+}
+
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+       writeback_registers(ctxt);
+}
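
Taken together, the op->count plumbing in this file lets a REP burst retire
many iterations per trip through the emulator: string_addr_inc() steps
RSI/RDI by count * op->bytes, and the writeback path decrements RCX once by
count instead of once per iteration. A hypothetical trace for "rep outsw"
with RCX = 8 and DF = 0, assuming all eight iterations complete against the
pio buffer in a single pass:

/* ctxt->src.count == 8 after the burst (SrcSI operand)
 *
 *   string_addr_inc():  df = +8, op->bytes = 2  ->  RSI += 16  (one update)
 *   writeback loop:     RCX -= 8                               (one update)
 *
 * The old code performed the same arithmetic with count fixed at 1, i.e.
 * eight separate RSI/RCX round trips for the same burst.
 */
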
index adba28f88d1a9d56c45e9716ac8dc8a69958144c..11300d2fa71445ff332cbdff30e7c640b959f02d 100644 (file)
@@ -108,7 +108,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
        ktime_t remaining;
        struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
 
-       if (!ps->pit_timer.period)
+       if (!ps->period)
                return 0;
 
        /*
@@ -120,9 +120,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
         * itself with the initial count and continues counting
         * from there.
         */
-       remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
-       elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
-       elapsed = mod_64(elapsed, ps->pit_timer.period);
+       remaining = hrtimer_get_remaining(&ps->timer);
+       elapsed = ps->period - ktime_to_ns(remaining);
+       elapsed = mod_64(elapsed, ps->period);
 
        return elapsed;
 }
@@ -238,12 +238,12 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
        int value;
 
        spin_lock(&ps->inject_lock);
-       value = atomic_dec_return(&ps->pit_timer.pending);
+       value = atomic_dec_return(&ps->pending);
        if (value < 0)
                /* spurious acks can be generated if, for example, the
                 * PIC is being reset.  Handle it gracefully here
                 */
-               atomic_inc(&ps->pit_timer.pending);
+               atomic_inc(&ps->pending);
        else if (value > 0)
                /* in this case, we had multiple outstanding pit interrupts
                 * that we needed to inject.  Reinject
@@ -261,28 +261,17 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
        if (!kvm_vcpu_is_bsp(vcpu) || !pit)
                return;
 
-       timer = &pit->pit_state.pit_timer.timer;
+       timer = &pit->pit_state.timer;
        if (hrtimer_cancel(timer))
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
 static void destroy_pit_timer(struct kvm_pit *pit)
 {
-       hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+       hrtimer_cancel(&pit->pit_state.timer);
        flush_kthread_work(&pit->expired);
 }
 
-static bool kpit_is_periodic(struct kvm_timer *ktimer)
-{
-       struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
-                                                pit_timer);
-       return ps->is_periodic;
-}
-
-static struct kvm_timer_ops kpit_ops = {
-       .is_periodic = kpit_is_periodic,
-};
-
 static void pit_do_work(struct kthread_work *work)
 {
        struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
@@ -322,16 +311,16 @@ static void pit_do_work(struct kthread_work *work)
 
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
-       struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
-       struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+       struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+       struct kvm_pit *pt = ps->kvm->arch.vpit;
 
-       if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
-               atomic_inc(&ktimer->pending);
+       if (ps->reinject || !atomic_read(&ps->pending)) {
+               atomic_inc(&ps->pending);
                queue_kthread_work(&pt->worker, &pt->expired);
        }
 
-       if (ktimer->t_ops->is_periodic(ktimer)) {
-               hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+       if (ps->is_periodic) {
+               hrtimer_add_expires_ns(&ps->timer, ps->period);
                return HRTIMER_RESTART;
        } else
                return HRTIMER_NORESTART;
@@ -340,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 {
        struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
-       struct kvm_timer *pt = &ps->pit_timer;
        s64 interval;
 
        if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
@@ -351,19 +339,18 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
        pr_debug("create pit timer, interval is %llu nsec\n", interval);
 
        /* TODO: the new value only takes effect after the counter is retriggered */
-       hrtimer_cancel(&pt->timer);
+       hrtimer_cancel(&ps->timer);
        flush_kthread_work(&ps->pit->expired);
-       pt->period = interval;
+       ps->period = interval;
        ps->is_periodic = is_period;
 
-       pt->timer.function = pit_timer_fn;
-       pt->t_ops = &kpit_ops;
-       pt->kvm = ps->pit->kvm;
+       ps->timer.function = pit_timer_fn;
+       ps->kvm = ps->pit->kvm;
 
-       atomic_set(&pt->pending, 0);
+       atomic_set(&ps->pending, 0);
        ps->irq_ack = 1;
 
-       hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+       hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
                      HRTIMER_MODE_ABS);
 }
 
@@ -639,7 +626,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
        }
        mutex_unlock(&pit->pit_state.lock);
 
-       atomic_set(&pit->pit_state.pit_timer.pending, 0);
+       atomic_set(&pit->pit_state.pending, 0);
        pit->pit_state.irq_ack = 1;
 }
 
@@ -648,7 +635,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
        struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
 
        if (!mask) {
-               atomic_set(&pit->pit_state.pit_timer.pending, 0);
+               atomic_set(&pit->pit_state.pending, 0);
                pit->pit_state.irq_ack = 1;
        }
 }
@@ -706,12 +693,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
        pit_state = &pit->pit_state;
        pit_state->pit = pit;
-       hrtimer_init(&pit_state->pit_timer.timer,
-                    CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        pit_state->irq_ack_notifier.gsi = 0;
        pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
        kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
-       pit_state->pit_timer.reinject = true;
+       pit_state->reinject = true;
        mutex_unlock(&pit->pit_state.lock);
 
        kvm_pit_reset(pit);
@@ -761,7 +747,7 @@ void kvm_free_pit(struct kvm *kvm)
                kvm_unregister_irq_ack_notifier(kvm,
                                &kvm->arch.vpit->pit_state.irq_ack_notifier);
                mutex_lock(&kvm->arch.vpit->pit_state.lock);
-               timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
+               timer = &kvm->arch.vpit->pit_state.timer;
                hrtimer_cancel(timer);
                flush_kthread_work(&kvm->arch.vpit->expired);
                kthread_stop(kvm->arch.vpit->worker_task);
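
With struct kvm_timer and its ops table gone, the PIT embeds the hrtimer
directly in kvm_kpit_state and recovers its state with container_of(),
testing a plain is_periodic flag instead of calling through a one-entry
vtable. The general idiom, as a self-contained sketch (generic names, not
kvm types):

#include <linux/hrtimer.h>
#include <linux/kernel.h>

struct periodic_dev {
        struct hrtimer timer;           /* embedded, no separate allocation */
        s64 period_ns;
        bool periodic;
};

static enum hrtimer_restart periodic_dev_fn(struct hrtimer *t)
{
        /* recover the enclosing object from the embedded member */
        struct periodic_dev *dev = container_of(t, struct periodic_dev, timer);

        if (dev->periodic) {
                hrtimer_add_expires_ns(t, dev->period_ns);
                return HRTIMER_RESTART;
        }
        return HRTIMER_NORESTART;
}

One less pointer chase per tick, and the callback can never be paired with
the wrong state.
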
index fdf40425ea1de2946bd40e31a1d1a6045ccdcb15..dd1b16b611b0ae6c9d2386a7e690e56a774f0b74 100644 (file)
@@ -24,8 +24,12 @@ struct kvm_kpit_channel_state {
 struct kvm_kpit_state {
        struct kvm_kpit_channel_state channels[3];
        u32 flags;
-       struct kvm_timer pit_timer;
        bool is_periodic;
+       s64 period;                             /* unit: ns */
+       struct hrtimer timer;
+       atomic_t pending;                       /* accumulated triggered timers */
+       bool reinject;
+       struct kvm *kvm;
        u32    speaker_data_on;
        struct mutex lock;
        struct kvm_pit *pit;
index 9fc9aa7ac7034c64dc8cdb59f27f5ac80e1d77ab..848206df0967d1d35e4fde100575147ef482a2f4 100644 (file)
@@ -190,17 +190,17 @@ void kvm_pic_update_irq(struct kvm_pic *s)
 
 int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
 {
-       int ret = -1;
+       int ret, irq_level;
+
+       BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
 
        pic_lock(s);
-       if (irq >= 0 && irq < PIC_NUM_PINS) {
-               int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
-                                                    irq_source_id, level);
-               ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
-               pic_update_irq(s);
-               trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
-                                     s->pics[irq >> 3].imr, ret == 0);
-       }
+       irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+                                        irq_source_id, level);
+       ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
+       pic_update_irq(s);
+       trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+                             s->pics[irq >> 3].imr, ret == 0);
        pic_unlock(s);
 
        return ret;
@@ -275,23 +275,20 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 {
        int irq, i;
        struct kvm_vcpu *vcpu;
-       u8 irr = s->irr, isr = s->imr;
+       u8 edge_irr = s->irr & ~s->elcr;
        bool found = false;
 
        s->last_irr = 0;
-       s->irr = 0;
+       s->irr &= s->elcr;
        s->imr = 0;
-       s->isr = 0;
        s->priority_add = 0;
-       s->irq_base = 0;
-       s->read_reg_select = 0;
-       s->poll = 0;
        s->special_mask = 0;
-       s->init_state = 0;
-       s->auto_eoi = 0;
-       s->rotate_on_auto_eoi = 0;
-       s->special_fully_nested_mode = 0;
-       s->init4 = 0;
+       s->read_reg_select = 0;
+       if (!s->init4) {
+               s->special_fully_nested_mode = 0;
+               s->auto_eoi = 0;
+       }
+       s->init_state = 1;
 
        kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
                if (kvm_apic_accept_pic_intr(vcpu)) {
@@ -304,7 +301,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
                return;
 
        for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
-               if (irr & (1 << irq) || isr & (1 << irq))
+               if (edge_irr & (1 << irq))
                        pic_clear_isr(s, irq);
 }
 
@@ -316,40 +313,13 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
        addr &= 1;
        if (addr == 0) {
                if (val & 0x10) {
-                       u8 edge_irr = s->irr & ~s->elcr;
-                       int i;
-                       bool found = false;
-                       struct kvm_vcpu *vcpu;
-
                        s->init4 = val & 1;
-                       s->last_irr = 0;
-                       s->irr &= s->elcr;
-                       s->imr = 0;
-                       s->priority_add = 0;
-                       s->special_mask = 0;
-                       s->read_reg_select = 0;
-                       if (!s->init4) {
-                               s->special_fully_nested_mode = 0;
-                               s->auto_eoi = 0;
-                       }
-                       s->init_state = 1;
                        if (val & 0x02)
                                pr_pic_unimpl("single mode not supported");
                        if (val & 0x08)
                                pr_pic_unimpl(
-                                       "level sensitive irq not supported");
-
-                       kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
-                               if (kvm_apic_accept_pic_intr(vcpu)) {
-                                       found = true;
-                                       break;
-                               }
-
-
-                       if (found)
-                               for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
-                                       if (edge_irr & (1 << irq))
-                                               pic_clear_isr(s, irq);
+                                               "level sensitive irq not supported");
+                       kvm_pic_reset(s);
                } else if (val & 0x08) {
                        if (val & 0x04)
                                s->poll = 1;
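
The ICW1 path above now delegates to kvm_pic_reset(), so the reset semantics
live in one place. The rule both versions encode: only edge-triggered pending
interrupts are dropped across a reset, since a level-triggered line (latched
in ELCR) is still asserted by its device afterwards. As a hypothetical helper
the invariant is just:

/* pending edge-triggered IRQs: these, and only these, die on reset */
static u8 pic_edge_pending(struct kvm_kpic_state *s)
{
        return s->irr & ~s->elcr;
}
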
index 2086f2bfba33db1d11a119ef57fa82aa01763804..2d03568e9498356716b7504c195c71a912819f4d 100644 (file)
@@ -70,7 +70,7 @@ struct kvm_pic {
        struct kvm_io_device dev_slave;
        struct kvm_io_device dev_eclr;
        void (*ack_notifier)(void *opaque, int irq);
-       unsigned long irq_states[16];
+       unsigned long irq_states[PIC_NUM_PINS];
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
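
Sizing irq_states[] by PIC_NUM_PINS ties the array to the same constant the
new BUG_ON() in kvm_pic_set_irq() checks against, so a misrouted GSI trips
the assertion instead of silently indexing past the array. A compile-time
guard in the same spirit (illustrative only; nothing like it is in this
diff):

static void __maybe_unused pic_sanity_check(void)
{
        BUILD_BUG_ON(PIC_NUM_PINS != 16);       /* two cascaded 8259s */
}
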
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
deleted file mode 100644 (file)
index 497dbaa..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-
-struct kvm_timer {
-       struct hrtimer timer;
-       s64 period;                             /* unit: ns */
-       u32 timer_mode_mask;
-       u64 tscdeadline;
-       atomic_t pending;                       /* accumulated triggered timers */
-       bool reinject;
-       struct kvm_timer_ops *t_ops;
-       struct kvm *kvm;
-       struct kvm_vcpu *vcpu;
-};
-
-struct kvm_timer_ops {
-       bool (*is_periodic)(struct kvm_timer *);
-};
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
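
kvm_timer_ops and the shared kvm_timer_fn() are gone: the PIT now carries its
timer fields inline (see i8254.h above) and the local APIC keeps a slimmed
struct kvm_timer, presumably defined in lapic.h from here on since
apic_timer_fn() below still uses it. A sketch of what plausibly survives
(assumed; the surviving definition is not part of this excerpt):

struct kvm_timer {
        struct hrtimer timer;
        s64 period;                             /* unit: ns */
        u32 timer_mode_mask;
        u64 tscdeadline;
        atomic_t pending;                       /* accumulated triggered timers */
};

The kvm/vcpu back-pointers and the is_periodic() callback are exactly the
parts each user now derives directly via container_of().
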
index ce878788a39fd13e486fd3557011d5bf28a47d2d..c6e6b721b6ee3a38d6267ee8867bd029fbdc6d6a 100644 (file)
@@ -34,6 +34,7 @@
 #include <asm/current.h>
 #include <asm/apicdef.h>
 #include <linux/atomic.h>
+#include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
 #include "irq.h"
 #include "trace.h"
@@ -65,6 +66,7 @@
 #define APIC_DEST_NOSHORT              0x0
 #define APIC_DEST_MASK                 0x800
 #define MAX_APIC_VECTOR                        256
+#define APIC_VECTORS_PER_REG           32
 
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
 static unsigned int min_timer_period_us = 500;
 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
-{
-       return *((u32 *) (apic->regs + reg_off));
-}
-
 static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
 {
        *((u32 *) (apic->regs + reg_off)) = val;
@@ -117,19 +114,23 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
        return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
-{
-       return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
-}
+struct static_key_deferred apic_hw_disabled __read_mostly;
+struct static_key_deferred apic_sw_disabled __read_mostly;
 
-static inline int  apic_sw_enabled(struct kvm_lapic *apic)
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
 {
-       return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+       if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) {
+               if (val & APIC_SPIV_APIC_ENABLED)
+                       static_key_slow_dec_deferred(&apic_sw_disabled);
+               else
+                       static_key_slow_inc(&apic_sw_disabled.key);
+       }
+       apic_set_reg(apic, APIC_SPIV, val);
 }
 
 static inline int apic_enabled(struct kvm_lapic *apic)
 {
-       return apic_sw_enabled(apic) && apic_hw_enabled(apic);
+       return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
 }
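
apic_hw_disabled and apic_sw_disabled are deferred static keys: while every
APIC in the host is enabled (the common case), checks such as
kvm_apic_hw_enabled() compile to a patched-out branch, and the deferred
flavour rate-limits the expensive code re-patching when a key drops back to
zero. A sketch of the fast-path inline this supports, assuming it lives in
lapic.h (it is not part of this excerpt):

static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
{
        if (static_key_false(&apic_hw_disabled.key))    /* rare, patched in */
                return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
        return MSR_IA32_APICBASE_ENABLE;                /* fast path */
}
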
 
 #define LVT_MASK       \
@@ -139,36 +140,135 @@ static inline int apic_enabled(struct kvm_lapic *apic)
        (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
         APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
 
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+       return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
 static inline int kvm_apic_id(struct kvm_lapic *apic)
 {
-       return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+       return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+}
+
+static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+{
+       u16 cid;
+       ldr >>= 32 - map->ldr_bits;
+       cid = (ldr >> map->cid_shift) & map->cid_mask;
+
+       BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+
+       return cid;
+}
+
+static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
+{
+       ldr >>= (32 - map->ldr_bits);
+       return ldr & map->lid_mask;
+}
+
+static void recalculate_apic_map(struct kvm *kvm)
+{
+       struct kvm_apic_map *new, *old = NULL;
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+
+       mutex_lock(&kvm->arch.apic_map_lock);
+
+       if (!new)
+               goto out;
+
+       new->ldr_bits = 8;
+       /* flat mode is default */
+       new->cid_shift = 8;
+       new->cid_mask = 0;
+       new->lid_mask = 0xff;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               struct kvm_lapic *apic = vcpu->arch.apic;
+               u16 cid, lid;
+               u32 ldr;
+
+               if (!kvm_apic_present(vcpu))
+                       continue;
+
+               /*
+                * All APICs have to be configured in the same mode by an OS.
+                * We take advantage of this while building the logical id
+                * lookup table. After reset APICs are in xapic/flat mode, so
+                * if we find an apic with a different setting, we assume that
+                * is the mode the OS wants all apics in; build it accordingly.
+                */
+               if (apic_x2apic_mode(apic)) {
+                       new->ldr_bits = 32;
+                       new->cid_shift = 16;
+                       new->cid_mask = new->lid_mask = 0xffff;
+               } else if (kvm_apic_sw_enabled(apic) &&
+                               !new->cid_mask /* flat mode */ &&
+                               kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) {
+                       new->cid_shift = 4;
+                       new->cid_mask = 0xf;
+                       new->lid_mask = 0xf;
+               }
+
+               new->phys_map[kvm_apic_id(apic)] = apic;
+
+               ldr = kvm_apic_get_reg(apic, APIC_LDR);
+               cid = apic_cluster_id(new, ldr);
+               lid = apic_logical_id(new, ldr);
+
+               if (lid)
+                       new->logical_map[cid][ffs(lid) - 1] = apic;
+       }
+out:
+       old = rcu_dereference_protected(kvm->arch.apic_map,
+                       lockdep_is_held(&kvm->arch.apic_map_lock));
+       rcu_assign_pointer(kvm->arch.apic_map, new);
+       mutex_unlock(&kvm->arch.apic_map_lock);
+
+       if (old)
+               kfree_rcu(old, rcu);
+}
+
+static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+{
+       apic_set_reg(apic, APIC_ID, id << 24);
+       recalculate_apic_map(apic->vcpu->kvm);
+}
+
+static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
+{
+       apic_set_reg(apic, APIC_LDR, id);
+       recalculate_apic_map(apic->vcpu->kvm);
 }
 
 static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
 {
-       return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+       return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
 }
 
 static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
 {
-       return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+       return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
 }
 
 static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
 {
-       return ((apic_get_reg(apic, APIC_LVTT) &
+       return ((kvm_apic_get_reg(apic, APIC_LVTT) &
                apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
 }
 
 static inline int apic_lvtt_period(struct kvm_lapic *apic)
 {
-       return ((apic_get_reg(apic, APIC_LVTT) &
+       return ((kvm_apic_get_reg(apic, APIC_LVTT) &
                apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
 }
 
 static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
 {
-       return ((apic_get_reg(apic, APIC_LVTT) &
+       return ((kvm_apic_get_reg(apic, APIC_LVTT) &
                apic->lapic_timer.timer_mode_mask) ==
                        APIC_LVT_TIMER_TSCDEADLINE);
 }
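
recalculate_apic_map() builds an RCU-published lookup table so interrupt
delivery can resolve a destination without scanning every vcpu. A worked
example of the id math for xAPIC cluster mode (assumed values; ldr_bits = 8,
cid_shift = 4, cid_mask = lid_mask = 0xf):

/* APIC_LDR = 0x53000000, i.e. logical id byte 0x53:
 *
 *   apic_cluster_id():  0x53000000 >> (32 - 8)  = 0x53
 *                       (0x53 >> 4) & 0xf       = 5     cluster number
 *   apic_logical_id():  0x53 & 0xf              = 0x3   member bitmask
 *
 * so this apic lands in logical_map[5][ffs(0x3) - 1] == logical_map[5][0].
 */
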
@@ -184,7 +284,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
        struct kvm_cpuid_entry2 *feat;
        u32 v = APIC_VERSION;
 
-       if (!irqchip_in_kernel(vcpu->kvm))
+       if (!kvm_vcpu_has_lapic(vcpu))
                return;
 
        feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -193,12 +293,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
        apic_set_reg(apic, APIC_LVR, v);
 }
 
-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
-{
-       return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
-}
-
-static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
+static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
        LVT_MASK ,      /* part LVTT mask, timer mode mask added at runtime */
        LVT_MASK | APIC_MODE_MASK,      /* LVTTHMR */
        LVT_MASK | APIC_MODE_MASK,      /* LVTPC */
@@ -208,25 +303,30 @@ static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
 
 static int find_highest_vector(void *bitmap)
 {
-       u32 *word = bitmap;
-       int word_offset = MAX_APIC_VECTOR >> 5;
+       int vec;
+       u32 *reg;
 
-       while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
-               continue;
+       for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
+            vec >= 0; vec -= APIC_VECTORS_PER_REG) {
+               reg = bitmap + REG_POS(vec);
+               if (*reg)
+                       return fls(*reg) - 1 + vec;
+       }
 
-       if (likely(!word_offset && !word[0]))
-               return -1;
-       else
-               return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
+       return -1;
 }
 
 static u8 count_vectors(void *bitmap)
 {
-       u32 *word = bitmap;
-       int word_offset;
+       int vec;
+       u32 *reg;
        u8 count = 0;
-       for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
-               count += hweight32(word[word_offset << 2]);
+
+       for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
+               reg = bitmap + REG_POS(vec);
+               count += hweight32(*reg);
+       }
+
        return count;
 }
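
The rewritten scans walk the 256-bit IRR/ISR bitmaps one 32-vector APIC
register at a time instead of the old open-coded word arithmetic. A worked
example for find_highest_vector(), assuming only vector 81 (0x51) is pending:

/* Scan order: vec = 224, 192, 160, 128, 96, 64, ... (steps of
 * APIC_VECTORS_PER_REG, highest register first).
 *
 * At vec = 64:  reg  = bitmap + REG_POS(64) = bitmap + 0x20
 *               *reg = 1 << VEC_POS(81) = 1 << (81 & 31) = 1 << 17
 *               result: fls(1 << 17) - 1 + 64 = 18 - 1 + 64 = 81
 *
 * An empty bitmap falls through all eight registers and returns -1.
 */
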
 
@@ -285,7 +385,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
 
        /* This may race with setting of irr in __apic_accept_irq() and
@@ -293,9 +392,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
         * will cause vmexit immediately and the value will be recalculated
         * on the next vmentry.
         */
-       if (!apic)
+       if (!kvm_vcpu_has_lapic(vcpu))
                return 0;
-       highest_irr = apic_find_highest_irr(apic);
+       highest_irr = apic_find_highest_irr(vcpu->arch.apic);
 
        return highest_irr;
 }
@@ -378,8 +477,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
        u32 tpr, isrv, ppr, old_ppr;
        int isr;
 
-       old_ppr = apic_get_reg(apic, APIC_PROCPRI);
-       tpr = apic_get_reg(apic, APIC_TASKPRI);
+       old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI);
+       tpr = kvm_apic_get_reg(apic, APIC_TASKPRI);
        isr = apic_find_highest_isr(apic);
        isrv = (isr != -1) ? isr : 0;
 
@@ -415,13 +514,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
        u32 logical_id;
 
        if (apic_x2apic_mode(apic)) {
-               logical_id = apic_get_reg(apic, APIC_LDR);
+               logical_id = kvm_apic_get_reg(apic, APIC_LDR);
                return logical_id & mda;
        }
 
-       logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+       logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
 
-       switch (apic_get_reg(apic, APIC_DFR)) {
+       switch (kvm_apic_get_reg(apic, APIC_DFR)) {
        case APIC_DFR_FLAT:
                if (logical_id & mda)
                        result = 1;
@@ -433,7 +532,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
                break;
        default:
                apic_debug("Bad DFR vcpu %d: %08x\n",
-                          apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+                          apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
                break;
        }
 
@@ -478,6 +577,72 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
        return result;
 }
 
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+               struct kvm_lapic_irq *irq, int *r)
+{
+       struct kvm_apic_map *map;
+       unsigned long bitmap = 1;
+       struct kvm_lapic **dst;
+       int i;
+       bool ret = false;
+
+       *r = -1;
+
+       if (irq->shorthand == APIC_DEST_SELF) {
+               *r = kvm_apic_set_irq(src->vcpu, irq);
+               return true;
+       }
+
+       if (irq->shorthand)
+               return false;
+
+       rcu_read_lock();
+       map = rcu_dereference(kvm->arch.apic_map);
+
+       if (!map)
+               goto out;
+
+       if (irq->dest_mode == 0) { /* physical mode */
+               if (irq->delivery_mode == APIC_DM_LOWEST ||
+                               irq->dest_id == 0xff)
+                       goto out;
+               dst = &map->phys_map[irq->dest_id & 0xff];
+       } else {
+               u32 mda = irq->dest_id << (32 - map->ldr_bits);
+
+               dst = map->logical_map[apic_cluster_id(map, mda)];
+
+               bitmap = apic_logical_id(map, mda);
+
+               if (irq->delivery_mode == APIC_DM_LOWEST) {
+                       int l = -1;
+                       for_each_set_bit(i, &bitmap, 16) {
+                               if (!dst[i])
+                                       continue;
+                               if (l < 0)
+                                       l = i;
+                               else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+                                       l = i;
+                       }
+
+                       bitmap = (l >= 0) ? 1 << l : 0;
+               }
+       }
+
+       for_each_set_bit(i, &bitmap, 16) {
+               if (!dst[i])
+                       continue;
+               if (*r < 0)
+                       *r = 0;
+               *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
+       }
+
+       ret = true;
+out:
+       rcu_read_unlock();
+       return ret;
+}
+
 /*
  * Add a pending IRQ into lapic.
  * Return 1 if successfully added and 0 if discarded.
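
kvm_irq_delivery_to_apic_fast() returns false when a shorthand other than
self or a missing map forces the caller back onto the old path; note that
lowest-priority arbitration in logical mode happens inside the fast path,
which narrows the member bitmap to the single winning vcpu before injecting.
The presumed call-site pattern (the actual callers are elsewhere in this
series; the wrapper name here is hypothetical):

static int deliver_irq(struct kvm *kvm, struct kvm_lapic *src,
                       struct kvm_lapic_irq *irq)
{
        int r;

        if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
                return r;       /* resolved via kvm->arch.apic_map */
        return kvm_irq_delivery_to_apic(kvm, src, irq); /* slow vcpu scan */
}
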
@@ -591,7 +756,7 @@ static int apic_set_eoi(struct kvm_lapic *apic)
        apic_clear_isr(vector, apic);
        apic_update_ppr(apic);
 
-       if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+       if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
            kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
                int trigger_mode;
                if (apic_test_vector(vector, apic->regs + APIC_TMR))
@@ -606,8 +771,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
 
 static void apic_send_ipi(struct kvm_lapic *apic)
 {
-       u32 icr_low = apic_get_reg(apic, APIC_ICR);
-       u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+       u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
+       u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2);
        struct kvm_lapic_irq irq;
 
        irq.vector = icr_low & APIC_VECTOR_MASK;
@@ -642,7 +807,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
        ASSERT(apic != NULL);
 
        /* if initial count is 0, current count should also be 0 */
-       if (apic_get_reg(apic, APIC_TMICT) == 0)
+       if (kvm_apic_get_reg(apic, APIC_TMICT) == 0)
                return 0;
 
        remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
@@ -696,13 +861,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 
                val = apic_get_tmcct(apic);
                break;
-
+       case APIC_PROCPRI:
+               apic_update_ppr(apic);
+               val = kvm_apic_get_reg(apic, offset);
+               break;
        case APIC_TASKPRI:
                report_tpr_access(apic, false);
                /* fall thru */
        default:
-               apic_update_ppr(apic);
-               val = apic_get_reg(apic, offset);
+               val = kvm_apic_get_reg(apic, offset);
                break;
        }
 
@@ -719,7 +886,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
 {
        unsigned char alignment = offset & 0xf;
        u32 result;
-       /* this bitmask has a bit cleared for each reserver register */
+       /* this bitmask has a bit cleared for each reserved register */
        static const u64 rmask = 0x43ff01ffffffe70cULL;
 
        if ((alignment + len) > 4) {
@@ -754,7 +921,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
 
 static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
 {
-       return apic_hw_enabled(apic) &&
+       return kvm_apic_hw_enabled(apic) &&
            addr >= apic->base_address &&
            addr < apic->base_address + LAPIC_MMIO_LENGTH;
 }
@@ -777,7 +944,7 @@ static void update_divide_count(struct kvm_lapic *apic)
 {
        u32 tmp1, tmp2, tdcr;
 
-       tdcr = apic_get_reg(apic, APIC_TDCR);
+       tdcr = kvm_apic_get_reg(apic, APIC_TDCR);
        tmp1 = tdcr & 0xf;
        tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
        apic->divide_count = 0x1 << (tmp2 & 0x7);
@@ -792,9 +959,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
        atomic_set(&apic->lapic_timer.pending, 0);
 
        if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
-               /* lapic timer in oneshot or peroidic mode */
+               /* lapic timer in oneshot or periodic mode */
                now = apic->lapic_timer.timer.base->get_time();
-               apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
+               apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT)
                            * APIC_BUS_CYCLE_NS * apic->divide_count;
 
                if (!apic->lapic_timer.period)
@@ -826,7 +993,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
                           "timer initial count 0x%x, period %lldns, "
                           "expire @ 0x%016" PRIx64 ".\n", __func__,
                           APIC_BUS_CYCLE_NS, ktime_to_ns(now),
-                          apic_get_reg(apic, APIC_TMICT),
+                          kvm_apic_get_reg(apic, APIC_TMICT),
                           apic->lapic_timer.period,
                           ktime_to_ns(ktime_add_ns(now,
                                        apic->lapic_timer.period)));
@@ -858,7 +1025,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
 {
-       int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+       int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0));
 
        if (apic_lvt_nmi_mode(lvt0_val)) {
                if (!nmi_wd_enabled) {
@@ -879,7 +1046,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
        switch (reg) {
        case APIC_ID:           /* Local APIC ID */
                if (!apic_x2apic_mode(apic))
-                       apic_set_reg(apic, APIC_ID, val);
+                       kvm_apic_set_id(apic, val >> 24);
                else
                        ret = 1;
                break;
@@ -895,29 +1062,30 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
        case APIC_LDR:
                if (!apic_x2apic_mode(apic))
-                       apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+                       kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
                else
                        ret = 1;
                break;
 
        case APIC_DFR:
-               if (!apic_x2apic_mode(apic))
+               if (!apic_x2apic_mode(apic)) {
                        apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
-               else
+                       recalculate_apic_map(apic->vcpu->kvm);
+               } else
                        ret = 1;
                break;
 
        case APIC_SPIV: {
                u32 mask = 0x3ff;
-               if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+               if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
                        mask |= APIC_SPIV_DIRECTED_EOI;
-               apic_set_reg(apic, APIC_SPIV, val & mask);
+               apic_set_spiv(apic, val & mask);
                if (!(val & APIC_SPIV_APIC_ENABLED)) {
                        int i;
                        u32 lvt_val;
 
                        for (i = 0; i < APIC_LVT_NUM; i++) {
-                               lvt_val = apic_get_reg(apic,
+                               lvt_val = kvm_apic_get_reg(apic,
                                                       APIC_LVTT + 0x10 * i);
                                apic_set_reg(apic, APIC_LVTT + 0x10 * i,
                                             lvt_val | APIC_LVT_MASKED);
@@ -946,7 +1114,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
        case APIC_LVT1:
        case APIC_LVTERR:
                /* TODO: Check vector */
-               if (!apic_sw_enabled(apic))
+               if (!kvm_apic_sw_enabled(apic))
                        val |= APIC_LVT_MASKED;
 
                val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
@@ -955,12 +1123,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
                break;
 
        case APIC_LVTT:
-               if ((apic_get_reg(apic, APIC_LVTT) &
+               if ((kvm_apic_get_reg(apic, APIC_LVTT) &
                    apic->lapic_timer.timer_mode_mask) !=
                   (val & apic->lapic_timer.timer_mode_mask))
                        hrtimer_cancel(&apic->lapic_timer.timer);
 
-               if (!apic_sw_enabled(apic))
+               if (!kvm_apic_sw_enabled(apic))
                        val |= APIC_LVT_MASKED;
                val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
                apic_set_reg(apic, APIC_LVTT, val);
@@ -1039,24 +1207,30 @@ static int apic_mmio_write(struct kvm_io_device *this,
 
 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->arch.apic;
-
-       if (apic)
+       if (kvm_vcpu_has_lapic(vcpu))
                apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
 
 void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
        if (!vcpu->arch.apic)
                return;
 
-       hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
+       hrtimer_cancel(&apic->lapic_timer.timer);
+
+       if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
+               static_key_slow_dec_deferred(&apic_hw_disabled);
+
+       if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
+               static_key_slow_dec_deferred(&apic_sw_disabled);
 
-       if (vcpu->arch.apic->regs)
-               free_page((unsigned long)vcpu->arch.apic->regs);
+       if (apic->regs)
+               free_page((unsigned long)apic->regs);
 
-       kfree(vcpu->arch.apic);
+       kfree(apic);
 }
 
 /*
@@ -1068,10 +1242,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
-       if (!apic)
-               return 0;
 
-       if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+                       apic_lvtt_period(apic))
                return 0;
 
        return apic->lapic_timer.tscdeadline;
@@ -1080,10 +1253,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
-       if (!apic)
-               return;
 
-       if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+                       apic_lvtt_period(apic))
                return;
 
        hrtimer_cancel(&apic->lapic_timer.timer);
@@ -1095,20 +1267,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!apic)
+       if (!kvm_vcpu_has_lapic(vcpu))
                return;
+
        apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
-                    | (apic_get_reg(apic, APIC_TASKPRI) & 4));
+                    | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
 
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->arch.apic;
        u64 tpr;
 
-       if (!apic)
+       if (!kvm_vcpu_has_lapic(vcpu))
                return 0;
-       tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
+
+       tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
 
        return (tpr & 0xf0) >> 4;
 }
@@ -1123,6 +1296,15 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                return;
        }
 
+       /* update jump label if enable bit changes */
+       if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
+               if (value & MSR_IA32_APICBASE_ENABLE)
+                       static_key_slow_dec_deferred(&apic_hw_disabled);
+               else
+                       static_key_slow_inc(&apic_hw_disabled.key);
+               recalculate_apic_map(vcpu->kvm);
+       }
+
        if (!kvm_vcpu_is_bsp(apic->vcpu))
                value &= ~MSR_IA32_APICBASE_BSP;
 
@@ -1130,7 +1312,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
        if (apic_x2apic_mode(apic)) {
                u32 id = kvm_apic_id(apic);
                u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
-               apic_set_reg(apic, APIC_LDR, ldr);
+               kvm_apic_set_ldr(apic, ldr);
        }
        apic->base_address = apic->vcpu->arch.apic_base &
                             MSR_IA32_APICBASE_BASE;
@@ -1155,7 +1337,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        /* Stop the timer in case it's a reset to an active apic */
        hrtimer_cancel(&apic->lapic_timer.timer);
 
-       apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
+       kvm_apic_set_id(apic, vcpu->vcpu_id);
        kvm_apic_set_version(apic->vcpu);
 
        for (i = 0; i < APIC_LVT_NUM; i++)
@@ -1164,9 +1346,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
                     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
 
        apic_set_reg(apic, APIC_DFR, 0xffffffffU);
-       apic_set_reg(apic, APIC_SPIV, 0xff);
+       apic_set_spiv(apic, 0xff);
        apic_set_reg(apic, APIC_TASKPRI, 0);
-       apic_set_reg(apic, APIC_LDR, 0);
+       kvm_apic_set_ldr(apic, 0);
        apic_set_reg(apic, APIC_ESR, 0);
        apic_set_reg(apic, APIC_ICR, 0);
        apic_set_reg(apic, APIC_ICR2, 0);
@@ -1183,7 +1365,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        update_divide_count(apic);
        atomic_set(&apic->lapic_timer.pending, 0);
        if (kvm_vcpu_is_bsp(vcpu))
-               vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+               kvm_lapic_set_base(vcpu,
+                               vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
        vcpu->arch.pv_eoi.msr_val = 0;
        apic_update_ppr(apic);
 
@@ -1196,45 +1379,34 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
                   vcpu->arch.apic_base, apic->base_address);
 }
 
-bool kvm_apic_present(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
-}
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
-       return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
-}
-
 /*
  *----------------------------------------------------------------------
  * timer interface
  *----------------------------------------------------------------------
  */
 
-static bool lapic_is_periodic(struct kvm_timer *ktimer)
+static bool lapic_is_periodic(struct kvm_lapic *apic)
 {
-       struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
-                                             lapic_timer);
        return apic_lvtt_period(apic);
 }
 
 int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *lapic = vcpu->arch.apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
-               return atomic_read(&lapic->lapic_timer.pending);
+       if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
+                       apic_lvt_enabled(apic, APIC_LVTT))
+               return atomic_read(&apic->lapic_timer.pending);
 
        return 0;
 }
 
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
 {
-       u32 reg = apic_get_reg(apic, lvt_type);
+       u32 reg = kvm_apic_get_reg(apic, lvt_type);
        int vector, mode, trig_mode;
 
-       if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+       if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
                vector = reg & APIC_VECTOR_MASK;
                mode = reg & APIC_MODE_MASK;
                trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
@@ -1251,15 +1423,40 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
                kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
-static struct kvm_timer_ops lapic_timer_ops = {
-       .is_periodic = lapic_is_periodic,
-};
-
 static const struct kvm_io_device_ops apic_mmio_ops = {
        .read     = apic_mmio_read,
        .write    = apic_mmio_write,
 };
 
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+       struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+       struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+       struct kvm_vcpu *vcpu = apic->vcpu;
+       wait_queue_head_t *q = &vcpu->wq;
+
+       /*
+        * There is a race window between reading and incrementing, but we do
+        * not care about potentially losing timer events in the !reinject
+        * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+        * in vcpu_enter_guest.
+        */
+       if (!atomic_read(&ktimer->pending)) {
+               atomic_inc(&ktimer->pending);
+               /* FIXME: this code should not know anything about vcpus */
+               kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+       }
+
+       if (waitqueue_active(q))
+               wake_up_interruptible(q);
+
+       if (lapic_is_periodic(apic)) {
+               hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+               return HRTIMER_RESTART;
+       } else
+               return HRTIMER_NORESTART;
+}
+
 int kvm_create_lapic(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic;
@@ -1283,14 +1480,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 
        hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_ABS);
-       apic->lapic_timer.timer.function = kvm_timer_fn;
-       apic->lapic_timer.t_ops = &lapic_timer_ops;
-       apic->lapic_timer.kvm = vcpu->kvm;
-       apic->lapic_timer.vcpu = vcpu;
+       apic->lapic_timer.timer.function = apic_timer_fn;
 
-       apic->base_address = APIC_DEFAULT_PHYS_BASE;
-       vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
+       /*
+        * APIC is created enabled. This will prevent kvm_lapic_set_base from
+        * thinking that APIC state has changed.
+        */
+       vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
+       kvm_lapic_set_base(vcpu,
+                       APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
 
+       static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
        kvm_lapic_reset(vcpu);
        kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
 
@@ -1306,23 +1506,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
        struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
 
-       if (!apic || !apic_enabled(apic))
+       if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
                return -1;
 
        apic_update_ppr(apic);
        highest_irr = apic_find_highest_irr(apic);
        if ((highest_irr == -1) ||
-           ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+           ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI)))
                return -1;
        return highest_irr;
 }
 
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 {
-       u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
+       u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
        int r = 0;
 
-       if (!apic_hw_enabled(vcpu->arch.apic))
+       if (!kvm_apic_hw_enabled(vcpu->arch.apic))
                r = 1;
        if ((lvt0 & APIC_LVT_MASKED) == 0 &&
            GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1334,7 +1534,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
+       if (!kvm_vcpu_has_lapic(vcpu))
+               return;
+
+       if (atomic_read(&apic->lapic_timer.pending) > 0) {
                if (kvm_apic_local_deliver(apic, APIC_LVTT))
                        atomic_dec(&apic->lapic_timer.pending);
        }
@@ -1354,12 +1557,17 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
        return vector;
 }
 
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
+               struct kvm_lapic_state *s)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       apic->base_address = vcpu->arch.apic_base &
-                            MSR_IA32_APICBASE_BASE;
+       kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
+       /* set SPIV separately to get count of SW disabled APICs right */
+       apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+       memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+       /* call kvm_apic_set_id() to put apic into apic_map */
+       kvm_apic_set_id(apic, kvm_apic_id(apic));
        kvm_apic_set_version(vcpu);
 
        apic_update_ppr(apic);
@@ -1374,13 +1582,12 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->arch.apic;
        struct hrtimer *timer;
 
-       if (!apic)
+       if (!kvm_vcpu_has_lapic(vcpu))
                return;
 
-       timer = &apic->lapic_timer.timer;
+       timer = &vcpu->arch.apic->lapic_timer.timer;
        if (hrtimer_cancel(timer))
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
@@ -1478,7 +1685,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
        if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
                return;
 
-       tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+       tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff;
        max_irr = apic_find_highest_irr(apic);
        if (max_irr < 0)
                max_irr = 0;
@@ -1537,7 +1744,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!irqchip_in_kernel(vcpu->kvm))
+       if (!kvm_vcpu_has_lapic(vcpu))
                return 1;
 
        /* if this is ICR write vector before command */
@@ -1551,7 +1758,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
        struct kvm_lapic *apic = vcpu->arch.apic;
        u32 low, high = 0;
 
-       if (!irqchip_in_kernel(vcpu->kvm))
+       if (!kvm_vcpu_has_lapic(vcpu))
                return 1;
 
        if (apic_reg_read(apic, reg, 4, &low))
@@ -1576,3 +1783,10 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
        return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
                                         addr);
 }
+
+void kvm_lapic_init(void)
+{
+       /* do not patch jump label more than once per second */
+       jump_label_rate_limit(&apic_hw_disabled, HZ);
+       jump_label_rate_limit(&apic_sw_disabled, HZ);
+}
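
A minimal standalone sketch (names illustrative, not taken from the patch) of
the deferred static-key lifecycle that kvm_lapic_init() above caps: toggles of
the APIC enable bits count key references, and jump_label_rate_limit() bounds
how often the resulting code patching may run, assuming the
<linux/jump_label.h> API of this kernel generation.

#include <linux/jump_label.h>
#include <linux/jiffies.h>

/* hypothetical key, mirroring apic_hw_disabled above */
static struct static_key_deferred example_hw_disabled;

static void example_disable(void)
{
        /* slow path becomes reachable; may patch code immediately */
        static_key_slow_inc(&example_hw_disabled.key);
}

static void example_enable(void)
{
        /*
         * The decrement is deferred, so a guest flipping the enable
         * bit in a loop cannot force a code-patching storm.
         */
        static_key_slow_dec_deferred(&example_hw_disabled);
}

static void example_init(void)
{
        /* as in kvm_lapic_init(): unpatch at most once per second */
        jump_label_rate_limit(&example_hw_disabled, HZ);
}
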
index 4af5405ae1e2f4e2822cde6bd844e73f0489e850..e5ebf9f3571ff47c9d09d48cc2e0f9285f5af92c 100644 (file)
@@ -2,10 +2,17 @@
 #define __KVM_X86_LAPIC_H
 
 #include "iodev.h"
-#include "kvm_timer.h"
 
 #include <linux/kvm_host.h>
 
+struct kvm_timer {
+       struct hrtimer timer;
+       s64 period;                             /* unit: ns */
+       u32 timer_mode_mask;
+       u64 tscdeadline;
+       atomic_t pending;                       /* accumulated triggered timers */
+};
+
 struct kvm_lapic {
        unsigned long base_address;
        struct kvm_io_device dev;
@@ -45,11 +52,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+               struct kvm_lapic_irq *irq, int *r);
+
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-bool kvm_apic_present(struct kvm_vcpu *vcpu);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
+               struct kvm_lapic_state *s);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -71,4 +80,48 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
 }
 
 int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
+void kvm_lapic_init(void);
+
+static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+       return *((u32 *) (apic->regs + reg_off));
+}
+
+extern struct static_key kvm_no_apic_vcpu;
+
+static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
+{
+       if (static_key_false(&kvm_no_apic_vcpu))
+               return vcpu->arch.apic;
+       return true;
+}
+
+extern struct static_key_deferred apic_hw_disabled;
+
+static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+       if (static_key_false(&apic_hw_disabled.key))
+               return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+       return MSR_IA32_APICBASE_ENABLE;
+}
+
+extern struct static_key_deferred apic_sw_disabled;
+
+static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic)
+{
+       if (static_key_false(&apic_sw_disabled.key))
+               return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+       return APIC_SPIV_APIC_ENABLED;
+}
+
+static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+       return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+       return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
 #endif
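
A usage-level sketch of the pattern behind kvm_apic_hw_enabled() and
kvm_apic_sw_enabled() above, with hypothetical names: while the key count is
zero, static_key_false() is patched to fall through, so the helper collapses
to a nonzero constant and the hot path never touches the device state.

#include <linux/types.h>
#include <linux/jump_label.h>

#define EXAMPLE_ENABLE_BIT (1u << 8)

struct example_dev {
        u32 ctrl;
};

static struct static_key_deferred example_disabled;

static inline u32 example_enabled(struct example_dev *d)
{
        /*
         * Patched-out branch: taken only once some instance has
         * incremented example_disabled.
         */
        if (static_key_false(&example_disabled.key))
                return d->ctrl & EXAMPLE_ENABLE_BIT;
        return EXAMPLE_ENABLE_BIT;      /* constant on the fast path */
}
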
index 7fbd0d273ea83dbec4a330fcb6d14a8ab46462b0..d289fee1ffb8631c0b93f663104965fe7def7b11 100644 (file)
@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
                return 0;
 
        pfn = spte_to_pfn(old_spte);
+
+       /*
+        * KVM does not hold the refcount of the page used by
+        * the kvm mmu; before reclaiming the page, we should
+        * unmap it from the mmu first.
+        */
+       WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+
        if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
                kvm_set_pfn_accessed(pfn);
        if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
@@ -960,13 +968,10 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
                                    struct kvm_memory_slot *slot)
 {
-       struct kvm_lpage_info *linfo;
-
-       if (likely(level == PT_PAGE_TABLE_LEVEL))
-               return &slot->rmap[gfn - slot->base_gfn];
+       unsigned long idx;
 
-       linfo = lpage_info_slot(gfn, slot, level);
-       return &linfo->rmap_pde;
+       idx = gfn_to_index(gfn, slot->base_gfn, level);
+       return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
 }
 
 /*
@@ -1173,7 +1178,8 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
        unsigned long *rmapp;
 
        while (mask) {
-               rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
+               rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+                                     PT_PAGE_TABLE_LEVEL, slot);
                __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
 
                /* clear the first set bit */
@@ -1200,7 +1206,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
-                          unsigned long data)
+                          struct kvm_memory_slot *slot, unsigned long data)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1218,7 +1224,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
-                            unsigned long data)
+                            struct kvm_memory_slot *slot, unsigned long data)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1259,43 +1265,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
        return 0;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-                         unsigned long data,
-                         int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-                                        unsigned long data))
+static int kvm_handle_hva_range(struct kvm *kvm,
+                               unsigned long start,
+                               unsigned long end,
+                               unsigned long data,
+                               int (*handler)(struct kvm *kvm,
+                                              unsigned long *rmapp,
+                                              struct kvm_memory_slot *slot,
+                                              unsigned long data))
 {
        int j;
-       int ret;
-       int retval = 0;
+       int ret = 0;
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
 
        slots = kvm_memslots(kvm);
 
        kvm_for_each_memslot(memslot, slots) {
-               unsigned long start = memslot->userspace_addr;
-               unsigned long end;
+               unsigned long hva_start, hva_end;
+               gfn_t gfn_start, gfn_end;
 
-               end = start + (memslot->npages << PAGE_SHIFT);
-               if (hva >= start && hva < end) {
-                       gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
-                       gfn_t gfn = memslot->base_gfn + gfn_offset;
+               hva_start = max(start, memslot->userspace_addr);
+               hva_end = min(end, memslot->userspace_addr +
+                                       (memslot->npages << PAGE_SHIFT));
+               if (hva_start >= hva_end)
+                       continue;
+               /*
+                * {gfn(page) | page intersects with [hva_start, hva_end)} =
+                * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+                */
+               gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-                       ret = handler(kvm, &memslot->rmap[gfn_offset], data);
+               for (j = PT_PAGE_TABLE_LEVEL;
+                    j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
+                       unsigned long idx, idx_end;
+                       unsigned long *rmapp;
 
-                       for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-                               struct kvm_lpage_info *linfo;
+                       /*
+                        * {idx(page_j) | page_j intersects with
+                        *  [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+                        */
+                       idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
+                       idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
 
-                               linfo = lpage_info_slot(gfn, memslot,
-                                                       PT_DIRECTORY_LEVEL + j);
-                               ret |= handler(kvm, &linfo->rmap_pde, data);
-                       }
-                       trace_kvm_age_page(hva, memslot, ret);
-                       retval |= ret;
+                       rmapp = __gfn_to_rmap(gfn_start, j, memslot);
+
+                       for (; idx <= idx_end; ++idx)
+                               ret |= handler(kvm, rmapp++, memslot, data);
                }
        }
 
-       return retval;
+       return ret;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+                         unsigned long data,
+                         int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+                                        struct kvm_memory_slot *slot,
+                                        unsigned long data))
+{
+       return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
 }
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
@@ -1303,13 +1333,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
        return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
 }
 
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+       return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
        kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
 }
 
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
-                        unsigned long data)
+                        struct kvm_memory_slot *slot, unsigned long data)
 {
        u64 *sptep;
        struct rmap_iterator uninitialized_var(iter);
@@ -1323,8 +1358,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
         * This has some overhead, but not as much as the cost of swapping
         * out actively used pages or breaking up actively used hugepages.
         */
-       if (!shadow_accessed_mask)
-               return kvm_unmap_rmapp(kvm, rmapp, data);
+       if (!shadow_accessed_mask) {
+               young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
+               goto out;
+       }
 
        for (sptep = rmap_get_first(*rmapp, &iter); sptep;
             sptep = rmap_get_next(&iter)) {
@@ -1336,12 +1373,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                                 (unsigned long *)sptep);
                }
        }
-
+out:
+       /* @data has hva passed to kvm_age_hva(). */
+       trace_kvm_age_page(data, slot, young);
        return young;
 }
 
 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
-                             unsigned long data)
+                             struct kvm_memory_slot *slot, unsigned long data)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1379,13 +1418,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
        rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
-       kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
+       kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
        kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
-       return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
+       return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
 }
 
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
@@ -2457,7 +2496,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                                rmap_recycle(vcpu, sptep, gfn);
                }
        }
-       kvm_release_pfn_clean(pfn);
+
+       if (!is_error_pfn(pfn))
+               kvm_release_pfn_clean(pfn);
 }
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2469,17 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
                                     bool no_dirty_log)
 {
        struct kvm_memory_slot *slot;
-       unsigned long hva;
 
        slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
-       if (!slot) {
-               get_page(fault_page);
-               return page_to_pfn(fault_page);
-       }
+       if (!slot)
+               return KVM_PFN_ERR_FAULT;
 
-       hva = gfn_to_hva_memslot(slot, gfn);
-
-       return hva_to_pfn_atomic(vcpu->kvm, hva);
+       return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
@@ -2580,11 +2616,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                        sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
                                              iterator.level - 1,
                                              1, ACC_ALL, iterator.sptep);
-                       if (!sp) {
-                               pgprintk("nonpaging_map: ENOMEM\n");
-                               kvm_release_pfn_clean(pfn);
-                               return -ENOMEM;
-                       }
 
                        mmu_spte_set(iterator.sptep,
                                     __pa(sp->spt)
@@ -2611,8 +2642,16 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
 
 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
 {
-       kvm_release_pfn_clean(pfn);
-       if (is_hwpoison_pfn(pfn)) {
+       /*
+        * Do not cache the mmio info caused by writing the readonly gfn
+        * into the spte; otherwise a read access on the readonly gfn can
+        * also cause an mmio page fault and be treated as an mmio access.
+        * Return 1 to tell kvm to emulate it.
+        */
+       if (pfn == KVM_PFN_ERR_RO_FAULT)
+               return 1;
+
+       if (pfn == KVM_PFN_ERR_HWPOISON) {
                kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
                return 0;
        }
@@ -3236,8 +3275,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
        if (!async)
                return false; /* *pfn has correct page already */
 
-       put_page(pfn_to_page(*pfn));
-
        if (!prefault && can_do_async_pf(vcpu)) {
                trace_kvm_try_async_get_page(gva, gfn);
                if (kvm_find_async_pf_gfn(vcpu, gfn)) {
@@ -3371,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
        return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
+static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
+{
+       unsigned mask;
+
+       BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+       mask = (unsigned)~ACC_WRITE_MASK;
+       /* Allow write access to dirty gptes */
+       mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
+       *access &= mask;
+}
+
 static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
                           int *nr_present)
 {
@@ -3388,6 +3437,25 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
        return false;
 }
 
+static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
+{
+       unsigned access;
+
+       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+       access &= ~(gpte >> PT64_NX_SHIFT);
+
+       return access;
+}
+
+static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+{
+       unsigned index;
+
+       index = level - 1;
+       index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
+       return mmu->last_pte_bitmap & (1 << index);
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@@ -3457,6 +3525,56 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
        }
 }
 
+static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+       unsigned bit, byte, pfec;
+       u8 map;
+       bool fault, x, w, u, wf, uf, ff, smep;
+
+       smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+       for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
+               pfec = byte << 1;
+               map = 0;
+               wf = pfec & PFERR_WRITE_MASK;
+               uf = pfec & PFERR_USER_MASK;
+               ff = pfec & PFERR_FETCH_MASK;
+               for (bit = 0; bit < 8; ++bit) {
+                       x = bit & ACC_EXEC_MASK;
+                       w = bit & ACC_WRITE_MASK;
+                       u = bit & ACC_USER_MASK;
+
+                       /* Not really needed: !nx will cause pte.nx to fault */
+                       x |= !mmu->nx;
+                       /* Allow supervisor writes if !cr0.wp */
+                       w |= !is_write_protection(vcpu) && !uf;
+                       /* Disallow supervisor fetches of user code if cr4.smep */
+                       x &= !(smep && u && !uf);
+
+                       fault = (ff && !x) || (uf && !u) || (wf && !w);
+                       map |= fault << bit;
+               }
+               mmu->permissions[byte] = map;
+       }
+}
+
+static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+       u8 map;
+       unsigned level, root_level = mmu->root_level;
+       const unsigned ps_set_index = 1 << 2;  /* bit 2 of index: ps */
+
+       if (root_level == PT32E_ROOT_LEVEL)
+               --root_level;
+       /* PT_PAGE_TABLE_LEVEL always terminates */
+       map = 1 | (1 << ps_set_index);
+       for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
+               if (level <= PT_PDPE_LEVEL
+                   && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
+                       map |= 1 << (ps_set_index | (level - 1));
+       }
+       mmu->last_pte_bitmap = map;
+}
+
 static int paging64_init_context_common(struct kvm_vcpu *vcpu,
                                        struct kvm_mmu *context,
                                        int level)
@@ -3465,6 +3583,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
        context->root_level = level;
 
        reset_rsvds_bits_mask(vcpu, context);
+       update_permission_bitmask(vcpu, context);
+       update_last_pte_bitmap(vcpu, context);
 
        ASSERT(is_pae(vcpu));
        context->new_cr3 = paging_new_cr3;
@@ -3493,6 +3613,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
        context->root_level = PT32_ROOT_LEVEL;
 
        reset_rsvds_bits_mask(vcpu, context);
+       update_permission_bitmask(vcpu, context);
+       update_last_pte_bitmap(vcpu, context);
 
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
@@ -3553,6 +3675,9 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
                context->gva_to_gpa = paging32_gva_to_gpa;
        }
 
+       update_permission_bitmask(vcpu, context);
+       update_last_pte_bitmap(vcpu, context);
+
        return 0;
 }
 
@@ -3628,6 +3753,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
                g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
        }
 
+       update_permission_bitmask(vcpu, g_context);
+       update_last_pte_bitmap(vcpu, g_context);
+
        return 0;
 }
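
A worked example, as standalone C rather than kernel code, of the
last_pte_bitmap encoding consumed by is_last_gpte() above: the 4-bit index is
(level - 1) in bits 0-1 plus the gpte's PS bit mirrored into bit 2, so a
single u8 answers "is this gpte a leaf?" for every level/PS combination.

#include <stdio.h>

#define PT_DIRECTORY_LEVEL 2
#define PT_PDPE_LEVEL      3
#define PS_SET             (1 << 2)     /* bit 2 of the index: PS */

/* 64-bit guest: root_level 4, large pages possible at levels 2 and 3 */
static unsigned char build_map(unsigned root_level)
{
        unsigned char map = 1 | (1 << PS_SET);  /* level 1 always terminates */
        unsigned level;

        for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level)
                if (level <= PT_PDPE_LEVEL)
                        map |= 1 << (PS_SET | (level - 1));
        return map;
}

static int is_last(unsigned char map, unsigned level, int ps)
{
        /* the kernel derives 'ps' by shifting gpte bit 7 down to bit 2 */
        unsigned index = (level - 1) | (ps ? PS_SET : 0);
        return (map >> index) & 1;
}

int main(void)
{
        unsigned char map = build_map(4);       /* 0x71 */

        printf("%d\n", is_last(map, 2, 1));     /* 2M mapping: 1 */
        printf("%d\n", is_last(map, 2, 0));     /* page-directory entry: 0 */
        printf("%d\n", is_last(map, 4, 1));     /* PML4E: 0, PS reserved there */
        return 0;
}
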
 
index e374db9af0218fb1af8955cabd527b9d83d868f7..69871080e8663c76fed7b94783f9b6869e552ac8 100644 (file)
 #define PT_PCD_MASK (1ULL << 4)
 #define PT_ACCESSED_SHIFT 5
 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
+#define PT_PAGE_SIZE_SHIFT 7
+#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
 #define PT_PAT_MASK (1ULL << 7)
 #define PT_GLOBAL_MASK (1ULL << 8)
 #define PT64_NX_SHIFT 63
@@ -88,17 +90,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
        return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
-static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
-                                          bool write_fault, bool user_fault,
-                                          unsigned long pte)
+/*
+ * Will a fault with a given page-fault error code (pfec) cause a permission
+ * fault with the given access (in ACC_* format)?
+ */
+static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
+                                   unsigned pfec)
 {
-       if (unlikely(write_fault && !is_writable_pte(pte)
-             && (user_fault || is_write_protection(vcpu))))
-               return false;
-
-       if (unlikely(user_fault && !(pte & PT_USER_MASK)))
-               return false;
-
-       return true;
+       return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
 }
+
 #endif
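
The same mechanism end to end, as a hedged standalone model (constants
renamed; not kernel code): update_permission_bitmask() precomputes, for each
page-fault error code, one fault bit per 3-bit pte access, and
permission_fault() above reduces the removed check_write_user_access() logic
to a table lookup.

#include <stdio.h>

#define ACC_EXEC    1           /* mirrors ACC_EXEC_MASK  */
#define ACC_WRITE   2           /* mirrors ACC_WRITE_MASK */
#define ACC_USER    4           /* mirrors ACC_USER_MASK  */
#define PFERR_WRITE 2
#define PFERR_USER  4
#define PFERR_FETCH 16

static unsigned char permissions[16];

static void build_permissions(int cr0_wp, int cr4_smep, int nx)
{
        unsigned byte, bit;

        for (byte = 0; byte < 16; ++byte) {
                unsigned pfec = byte << 1, map = 0;
                int wf = pfec & PFERR_WRITE;
                int uf = pfec & PFERR_USER;
                int ff = pfec & PFERR_FETCH;

                for (bit = 0; bit < 8; ++bit) {
                        int x = bit & ACC_EXEC;
                        int w = bit & ACC_WRITE;
                        int u = bit & ACC_USER;
                        int fault;

                        x |= !nx;                     /* no NX: fetch never faults */
                        w |= !cr0_wp && !uf;          /* !cr0.wp: kernel writes r/o */
                        x &= !(cr4_smep && u && !uf); /* smep: no kernel fetch of user page */

                        fault = (ff && !x) || (uf && !u) || (wf && !w);
                        map |= fault << bit;
                }
                permissions[byte] = map;
        }
}

static int permission_fault(unsigned pte_access, unsigned pfec)
{
        return (permissions[pfec >> 1] >> pte_access) & 1;
}

int main(void)
{
        build_permissions(1 /* cr0.wp */, 0 /* smep */, 1 /* nx */);
        /* user write to a supervisor-only gpte: faults */
        printf("%d\n", permission_fault(ACC_EXEC, PFERR_USER | PFERR_WRITE));
        /* supervisor read of a writable gpte: allowed */
        printf("%d\n", permission_fault(ACC_EXEC | ACC_WRITE, 0));
        return 0;
}
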
index 7d7d0b9e23eb2e3d7b256d74c58852c9f26e5772..daff69e21150d054a109a889630f730702088b76 100644 (file)
@@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
        gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
        pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
 
-       if (is_error_pfn(pfn)) {
-               kvm_release_pfn_clean(pfn);
+       if (is_error_pfn(pfn))
                return;
-       }
 
        hpa =  pfn << PAGE_SHIFT;
        if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
@@ -190,7 +188,6 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 
 static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-       struct kvm_memory_slot *slot;
        unsigned long *rmapp;
        u64 *sptep;
        struct rmap_iterator iter;
@@ -198,8 +195,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
        if (sp->role.direct || sp->unsync || sp->role.invalid)
                return;
 
-       slot = gfn_to_memslot(kvm, sp->gfn);
-       rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+       rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
 
        for (sptep = rmap_get_first(*rmapp, &iter); sptep;
             sptep = rmap_get_next(&iter)) {
index bb7cf01cae76ea735f759ef3c3d2428c271a7151..714e2c01a6fe0fd714f66146c8613f17f31f4a68 100644 (file)
  */
 struct guest_walker {
        int level;
+       unsigned max_level;
        gfn_t table_gfn[PT_MAX_FULL_LEVELS];
        pt_element_t ptes[PT_MAX_FULL_LEVELS];
        pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
        gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+       pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
        unsigned pt_access;
        unsigned pte_access;
        gfn_t gfn;
@@ -101,38 +103,41 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
        return (ret != orig_pte);
 }
 
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
-                                  bool last)
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+                                            struct kvm_mmu *mmu,
+                                            struct guest_walker *walker,
+                                            int write_fault)
 {
-       unsigned access;
-
-       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-       if (last && !is_dirty_gpte(gpte))
-               access &= ~ACC_WRITE_MASK;
-
-#if PTTYPE == 64
-       if (vcpu->arch.mmu.nx)
-               access &= ~(gpte >> PT64_NX_SHIFT);
-#endif
-       return access;
-}
-
-static bool FNAME(is_last_gpte)(struct guest_walker *walker,
-                               struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-                               pt_element_t gpte)
-{
-       if (walker->level == PT_PAGE_TABLE_LEVEL)
-               return true;
-
-       if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
-           (PTTYPE == 64 || is_pse(vcpu)))
-               return true;
+       unsigned level, index;
+       pt_element_t pte, orig_pte;
+       pt_element_t __user *ptep_user;
+       gfn_t table_gfn;
+       int ret;
+
+       for (level = walker->max_level; level >= walker->level; --level) {
+               pte = orig_pte = walker->ptes[level - 1];
+               table_gfn = walker->table_gfn[level - 1];
+               ptep_user = walker->ptep_user[level - 1];
+               index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+               if (!(pte & PT_ACCESSED_MASK)) {
+                       trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+                       pte |= PT_ACCESSED_MASK;
+               }
+               if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+                       trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+                       pte |= PT_DIRTY_MASK;
+               }
+               if (pte == orig_pte)
+                       continue;
 
-       if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
-           (mmu->root_level == PT64_ROOT_LEVEL))
-               return true;
+               ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+               if (ret)
+                       return ret;
 
-       return false;
+               mark_page_dirty(vcpu->kvm, table_gfn);
+               walker->ptes[level - 1] = pte;
+       }
+       return 0;
 }
 
 /*
@@ -142,21 +147,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
                                    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                                    gva_t addr, u32 access)
 {
+       int ret;
        pt_element_t pte;
        pt_element_t __user *uninitialized_var(ptep_user);
        gfn_t table_gfn;
-       unsigned index, pt_access, uninitialized_var(pte_access);
+       unsigned index, pt_access, pte_access, accessed_dirty, shift;
        gpa_t pte_gpa;
-       bool eperm, last_gpte;
        int offset;
        const int write_fault = access & PFERR_WRITE_MASK;
        const int user_fault  = access & PFERR_USER_MASK;
        const int fetch_fault = access & PFERR_FETCH_MASK;
        u16 errcode = 0;
+       gpa_t real_gpa;
+       gfn_t gfn;
 
        trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
-       eperm = false;
        walker->level = mmu->root_level;
        pte           = mmu->get_cr3(vcpu);
 
@@ -169,15 +175,21 @@ retry_walk:
                --walker->level;
        }
 #endif
+       walker->max_level = walker->level;
        ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
               (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
-       pt_access = ACC_ALL;
+       accessed_dirty = PT_ACCESSED_MASK;
+       pt_access = pte_access = ACC_ALL;
+       ++walker->level;
 
-       for (;;) {
+       do {
                gfn_t real_gfn;
                unsigned long host_addr;
 
+               pt_access &= pte_access;
+               --walker->level;
+
                index = PT_INDEX(addr, walker->level);
 
                table_gfn = gpte_to_gfn(pte);
@@ -199,6 +211,7 @@ retry_walk:
                ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
                if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
                        goto error;
+               walker->ptep_user[walker->level - 1] = ptep_user;
 
                trace_kvm_mmu_paging_element(pte, walker->level);
 
@@ -211,92 +224,48 @@ retry_walk:
                        goto error;
                }
 
-               if (!check_write_user_access(vcpu, write_fault, user_fault,
-                                         pte))
-                       eperm = true;
-
-#if PTTYPE == 64
-               if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
-                       eperm = true;
-#endif
-
-               last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
-               if (last_gpte) {
-                       pte_access = pt_access &
-                                    FNAME(gpte_access)(vcpu, pte, true);
-                       /* check if the kernel is fetching from user page */
-                       if (unlikely(pte_access & PT_USER_MASK) &&
-                           kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-                               if (fetch_fault && !user_fault)
-                                       eperm = true;
-               }
-
-               if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
-                       int ret;
-                       trace_kvm_mmu_set_accessed_bit(table_gfn, index,
-                                                      sizeof(pte));
-                       ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-                                                 pte, pte|PT_ACCESSED_MASK);
-                       if (unlikely(ret < 0))
-                               goto error;
-                       else if (ret)
-                               goto retry_walk;
-
-                       mark_page_dirty(vcpu->kvm, table_gfn);
-                       pte |= PT_ACCESSED_MASK;
-               }
+               accessed_dirty &= pte;
+               pte_access = pt_access & gpte_access(vcpu, pte);
 
                walker->ptes[walker->level - 1] = pte;
+       } while (!is_last_gpte(mmu, walker->level, pte));
 
-               if (last_gpte) {
-                       int lvl = walker->level;
-                       gpa_t real_gpa;
-                       gfn_t gfn;
-                       u32 ac;
-
-                       gfn = gpte_to_gfn_lvl(pte, lvl);
-                       gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
-
-                       if (PTTYPE == 32 &&
-                           walker->level == PT_DIRECTORY_LEVEL &&
-                           is_cpuid_PSE36())
-                               gfn += pse36_gfn_delta(pte);
-
-                       ac = write_fault | fetch_fault | user_fault;
+       if (unlikely(permission_fault(mmu, pte_access, access))) {
+               errcode |= PFERR_PRESENT_MASK;
+               goto error;
+       }
 
-                       real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
-                                                     ac);
-                       if (real_gpa == UNMAPPED_GVA)
-                               return 0;
+       gfn = gpte_to_gfn_lvl(pte, walker->level);
+       gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
 
-                       walker->gfn = real_gpa >> PAGE_SHIFT;
+       if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
+               gfn += pse36_gfn_delta(pte);
 
-                       break;
-               }
+       real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
+       if (real_gpa == UNMAPPED_GVA)
+               return 0;
 
-               pt_access &= FNAME(gpte_access)(vcpu, pte, false);
-               --walker->level;
-       }
+       walker->gfn = real_gpa >> PAGE_SHIFT;
 
-       if (unlikely(eperm)) {
-               errcode |= PFERR_PRESENT_MASK;
-               goto error;
-       }
+       if (!write_fault)
+               protect_clean_gpte(&pte_access, pte);
 
-       if (write_fault && unlikely(!is_dirty_gpte(pte))) {
-               int ret;
+       /*
+        * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
+        * place right.
+        *
+        * On a read fault, do nothing.
+        */
+       shift = write_fault >> ilog2(PFERR_WRITE_MASK);
+       shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
+       accessed_dirty &= pte >> shift;
 
-               trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-               ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-                                         pte, pte|PT_DIRTY_MASK);
+       if (unlikely(!accessed_dirty)) {
+               ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
                if (unlikely(ret < 0))
                        goto error;
                else if (ret)
                        goto retry_walk;
-
-               mark_page_dirty(vcpu->kvm, table_gfn);
-               pte |= PT_DIRTY_MASK;
-               walker->ptes[walker->level - 1] = pte;
        }
 
        walker->pt_access = pt_access;
@@ -368,12 +337,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                return;
 
        pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-       pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
+       pte_access = sp->role.access & gpte_access(vcpu, gpte);
+       protect_clean_gpte(&pte_access, gpte);
        pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
-       if (mmu_invalid_pfn(pfn)) {
-               kvm_release_pfn_clean(pfn);
+       if (mmu_invalid_pfn(pfn))
                return;
-       }
 
        /*
         * we call mmu_set_spte() with host_writable = true because that
@@ -443,15 +411,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
                if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
                        continue;
 
-               pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
-                                                                 true);
+               pte_access = sp->role.access & gpte_access(vcpu, gpte);
+               protect_clean_gpte(&pte_access, gpte);
                gfn = gpte_to_gfn(gpte);
                pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
                                      pte_access & ACC_WRITE_MASK);
-               if (mmu_invalid_pfn(pfn)) {
-                       kvm_release_pfn_clean(pfn);
+               if (mmu_invalid_pfn(pfn))
                        break;
-               }
 
                mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
                             NULL, PT_PAGE_TABLE_LEVEL, gfn,
@@ -798,7 +764,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
                gfn = gpte_to_gfn(gpte);
                pte_access = sp->role.access;
-               pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+               pte_access &= gpte_access(vcpu, gpte);
+               protect_clean_gpte(&pte_access, gpte);
 
                if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
                        continue;
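
A small arithmetic check, standalone and not kernel code, of the branchless
fold in walk_addr_generic() above: accessed_dirty has already been ANDed with
every gpte on the walk, write_fault is either 0 or PFERR_WRITE_MASK (2), so
the computed shift is 0 or 1 and a write fault slides the leaf pte's dirty
bit onto the accessed position before the final mask.

#include <stdio.h>

#define PT_ACCESSED_SHIFT 5
#define PT_ACCESSED_MASK  (1ULL << PT_ACCESSED_SHIFT)
#define PT_DIRTY_SHIFT    6
#define PFERR_WRITE_MASK  2

static unsigned long long fold(unsigned long long accessed_dirty,
                               unsigned long long pte, unsigned write_fault)
{
        /* write_fault >> ilog2(PFERR_WRITE_MASK), i.e. >> 1 */
        unsigned shift = (write_fault >> 1) *
                         (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);

        return accessed_dirty & (pte >> shift);
}

int main(void)
{
        unsigned long long a  = 1ULL << PT_ACCESSED_SHIFT;      /* A=1 D=0 */
        unsigned long long ad = a | (1ULL << PT_DIRTY_SHIFT);   /* A=1 D=1 */

        printf("%llx\n", fold(PT_ACCESSED_MASK, a, 0));                 /* 20 */
        printf("%llx\n", fold(PT_ACCESSED_MASK, ad, PFERR_WRITE_MASK)); /* 20 */
        printf("%llx\n", fold(PT_ACCESSED_MASK, a, PFERR_WRITE_MASK));  /* 0: slow path */
        return 0;
}
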
index 9b7ec1150ab01ad1390217cc04176d255c5d8382..cfc258a6bf97a1efda8b97bb0ae4fd4bd21a9f23 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Kernel-based Virtual Machine -- Performane Monitoring Unit support
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
  *
  * Copyright 2011 Red Hat, Inc. and/or its affiliates.
  *
index baead950d6c82cfb3ae530c198ab9d022857170f..d017df3899ef23a2dc339ebcf7c7d88b6eef6e8e 100644 (file)
@@ -163,7 +163,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio);
 
 #define MSR_INVALID                    0xffffffffU
 
-static struct svm_direct_access_msrs {
+static const struct svm_direct_access_msrs {
        u32 index;   /* Index of the MSR */
        bool always; /* True if intercept is always on */
 } direct_access_msrs[] = {
@@ -400,7 +400,7 @@ struct svm_init_data {
        int r;
 };
 
-static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
+static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 
 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 #define MSRS_RANGE_SIZE 2048
@@ -1146,7 +1146,6 @@ static void init_vmcb(struct vcpu_svm *svm)
 
        svm_set_efer(&svm->vcpu, 0);
        save->dr6 = 0xffff0ff0;
-       save->dr7 = 0x400;
        kvm_set_rflags(&svm->vcpu, 2);
        save->rip = 0x0000fff0;
        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1643,7 +1642,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
        mark_dirty(svm->vmcb, VMCB_SEG);
 }
 
-static void update_db_intercept(struct kvm_vcpu *vcpu)
+static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -1663,20 +1662,6 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
                vcpu->guest_debug = 0;
 }
 
-static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-               svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
-       else
-               svm->vmcb->save.dr7 = vcpu->arch.dr7;
-
-       mark_dirty(svm->vmcb, VMCB_DR);
-
-       update_db_intercept(vcpu);
-}
-
 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 {
        if (sd->next_asid > sd->max_asid) {
@@ -1748,7 +1733,7 @@ static int db_interception(struct vcpu_svm *svm)
                if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
                        svm->vmcb->save.rflags &=
                                ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-               update_db_intercept(&svm->vcpu);
+               update_db_bp_intercept(&svm->vcpu);
        }
 
        if (svm->vcpu.guest_debug &
@@ -2063,7 +2048,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
        if (svm->nested.intercept & 1ULL) {
                /*
                 * The #vmexit can't be emulated here directly because this
-                * code path runs with irqs and preemtion disabled. A
+                * code path runs with irqs and preemption disabled. A
                 * #vmexit emulation might sleep. Only signal request for
                 * the #vmexit here.
                 */
@@ -2105,7 +2090,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
        return kmap(page);
 
 error:
-       kvm_release_page_clean(page);
        kvm_inject_gp(&svm->vcpu, 0);
 
        return NULL;
@@ -2409,7 +2393,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 {
        /*
         * This function merges the msr permission bitmaps of kvm and the
-        * nested vmcb. It is omptimized in that it only merges the parts where
+        * nested vmcb. It is optimized in that it only merges the parts where
         * the kvm msr permission bitmap may contain zero bits
         */
        int i;
@@ -3268,7 +3252,7 @@ static int pause_interception(struct vcpu_svm *svm)
        return 1;
 }
 
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
+static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
        [SVM_EXIT_READ_CR4]                     = cr_interception,
@@ -3660,7 +3644,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
         */
        svm->nmi_singlestep = true;
        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
-       update_db_intercept(vcpu);
+       update_db_bp_intercept(vcpu);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -3783,12 +3767,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
        svm_complete_interrupts(svm);
 }
 
-#ifdef CONFIG_X86_64
-#define R "r"
-#else
-#define R "e"
-#endif
-
 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -3815,13 +3793,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
        local_irq_enable();
 
        asm volatile (
-               "push %%"R"bp; \n\t"
-               "mov %c[rbx](%[svm]), %%"R"bx \n\t"
-               "mov %c[rcx](%[svm]), %%"R"cx \n\t"
-               "mov %c[rdx](%[svm]), %%"R"dx \n\t"
-               "mov %c[rsi](%[svm]), %%"R"si \n\t"
-               "mov %c[rdi](%[svm]), %%"R"di \n\t"
-               "mov %c[rbp](%[svm]), %%"R"bp \n\t"
+               "push %%" _ASM_BP "; \n\t"
+               "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
+               "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
+               "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
+               "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
+               "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
+               "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
 #ifdef CONFIG_X86_64
                "mov %c[r8](%[svm]),  %%r8  \n\t"
                "mov %c[r9](%[svm]),  %%r9  \n\t"
@@ -3834,20 +3812,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 
                /* Enter guest mode */
-               "push %%"R"ax \n\t"
-               "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
+               "push %%" _ASM_AX " \n\t"
+               "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
                __ex(SVM_VMLOAD) "\n\t"
                __ex(SVM_VMRUN) "\n\t"
                __ex(SVM_VMSAVE) "\n\t"
-               "pop %%"R"ax \n\t"
+               "pop %%" _ASM_AX " \n\t"
 
                /* Save guest registers, load host registers */
-               "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
-               "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
-               "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
-               "mov %%"R"si, %c[rsi](%[svm]) \n\t"
-               "mov %%"R"di, %c[rdi](%[svm]) \n\t"
-               "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
+               "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
+               "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
+               "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
+               "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
+               "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
+               "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
 #ifdef CONFIG_X86_64
                "mov %%r8,  %c[r8](%[svm]) \n\t"
                "mov %%r9,  %c[r9](%[svm]) \n\t"
@@ -3858,7 +3836,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
                "mov %%r14, %c[r14](%[svm]) \n\t"
                "mov %%r15, %c[r15](%[svm]) \n\t"
 #endif
-               "pop %%"R"bp"
+               "pop %%" _ASM_BP
                :
                : [svm]"a"(svm),
                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -3879,9 +3857,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
                  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
 #endif
                : "cc", "memory"
-               , R"bx", R"cx", R"dx", R"si", R"di"
 #ifdef CONFIG_X86_64
+               , "rbx", "rcx", "rdx", "rsi", "rdi"
                , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#else
+               , "ebx", "ecx", "edx", "esi", "edi"
 #endif
                );
 
@@ -3941,8 +3921,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
        mark_all_clean(svm->vmcb);
 }
 
-#undef R
-
 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -4069,7 +4047,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 #define POST_MEM(exit) { .exit_code = (exit), \
                        .stage = X86_ICPT_POST_MEMACCESS, }
 
-static struct __x86_intercept {
+static const struct __x86_intercept {
        u32 exit_code;
        enum x86_intercept_stage stage;
 } x86_intercept_map[] = {
@@ -4260,7 +4238,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .vcpu_load = svm_vcpu_load,
        .vcpu_put = svm_vcpu_put,
 
-       .set_guest_debug = svm_guest_debug,
+       .update_db_bp_intercept = update_db_bp_intercept,
        .get_msr = svm_get_msr,
        .set_msr = svm_set_msr,
        .get_segment_base = svm_get_segment_base,
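
For the _ASM_* substitution in svm_vcpu_run() above, a reduced sketch of the
idea; macro names carry an EX_ prefix to mark them as illustrative (the real
selection logic lives in <asm/asm.h>): the register-name spelling is chosen
once, at preprocessing time, instead of via a per-file R macro.

/* assumes CONFIG_X86_64 matches the target word size */
#ifdef CONFIG_X86_64
# define EX_ASM_REG(reg) "r" #reg       /* rbp, rax, ... */
#else
# define EX_ASM_REG(reg) "e" #reg       /* ebp, eax, ... */
#endif
#define EX_ASM_BP EX_ASM_REG(bp)

/* one asm string now serves both word sizes */
static inline void example_save_restore(void)
{
        asm volatile("push %%" EX_ASM_BP "\n\t"
                     "pop  %%" EX_ASM_BP
                     : : : "memory");
}
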
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
deleted file mode 100644 (file)
index 6b85cc6..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * timer support
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/hrtimer.h>
-#include <linux/atomic.h>
-#include "kvm_timer.h"
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
-       struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
-       struct kvm_vcpu *vcpu = ktimer->vcpu;
-       wait_queue_head_t *q = &vcpu->wq;
-
-       /*
-        * There is a race window between reading and incrementing, but we do
-        * not care about potentially losing timer events in the !reinject
-        * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
-        * in vcpu_enter_guest.
-        */
-       if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
-               atomic_inc(&ktimer->pending);
-               /* FIXME: this code should not know anything about vcpus */
-               kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
-       }
-
-       if (waitqueue_active(q))
-               wake_up_interruptible(q);
-
-       if (ktimer->t_ops->is_periodic(ktimer)) {
-               hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
-               return HRTIMER_RESTART;
-       } else
-               return HRTIMER_NORESTART;
-}
index 851aa7c3b890f511fc350f2275a33801f58e8350..ad6b1dd06f8b967356d4f081f4cde119f2a2bfe8 100644 (file)
@@ -127,6 +127,8 @@ module_param(ple_gap, int, S_IRUGO);
 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 module_param(ple_window, int, S_IRUGO);
 
+extern const ulong vmx_return;
+
 #define NR_AUTOLOAD_MSRS 8
 #define VMCS02_POOL_SIZE 1
 
@@ -405,16 +407,16 @@ struct vcpu_vmx {
        struct {
                int vm86_active;
                ulong save_rflags;
+               struct kvm_segment segs[8];
+       } rmode;
+       struct {
+               u32 bitmask; /* 4 bits per segment (1 bit per field) */
                struct kvm_save_segment {
                        u16 selector;
                        unsigned long base;
                        u32 limit;
                        u32 ar;
-               } tr, es, ds, fs, gs;
-       } rmode;
-       struct {
-               u32 bitmask; /* 4 bits per segment (1 bit per field) */
-               struct kvm_save_segment seg[8];
+               } seg[8];
        } segment_cache;
        int vpid;
        bool emulation_required;
@@ -450,7 +452,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 #define FIELD64(number, name)  [number] = VMCS12_OFFSET(name), \
                                [number##_HIGH] = VMCS12_OFFSET(name)+4
 
-static unsigned short vmcs_field_to_offset_table[] = {
+static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
        FIELD(GUEST_ES_SELECTOR, guest_es_selector),
        FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
@@ -596,10 +598,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
 {
        struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
-       if (is_error_page(page)) {
-               kvm_release_page_clean(page);
+       if (is_error_page(page))
                return NULL;
-       }
+
        return page;
 }
 
@@ -667,7 +668,7 @@ static struct vmx_capability {
                .ar_bytes = GUEST_##seg##_AR_BYTES,             \
        }
 
-static struct kvm_vmx_segment_field {
+static const struct kvm_vmx_segment_field {
        unsigned selector;
        unsigned base;
        unsigned limit;
@@ -1343,7 +1344,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
        guest_efer = vmx->vcpu.arch.efer;
 
        /*
-        * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
         * outside long mode
         */
        ignore_bits = EFER_NX | EFER_SCE;
@@ -1995,7 +1996,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #endif
                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
-               CPU_BASED_RDPMC_EXITING |
+               CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        /*
         * We can allow some features even when not supported by the
@@ -2291,16 +2292,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
        }
 }
 
-static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
-{
-       if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-               vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
-       else
-               vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
-
-       update_exception_bitmap(vcpu);
-}
-
 static __init int cpu_has_kvm_support(void)
 {
        return cpu_has_vmx();
@@ -2698,20 +2689,17 @@ static __exit void hardware_unsetup(void)
        free_kvm_area();
 }
 
-static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
+static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
 {
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       struct kvm_segment tmp = *save;
 
-       if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
-               vmcs_write16(sf->selector, save->selector);
-               vmcs_writel(sf->base, save->base);
-               vmcs_write32(sf->limit, save->limit);
-               vmcs_write32(sf->ar_bytes, save->ar);
-       } else {
-               u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
-                       << AR_DPL_SHIFT;
-               vmcs_write32(sf->ar_bytes, 0x93 | dpl);
+       if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
+               tmp.base = vmcs_readl(sf->base);
+               tmp.selector = vmcs_read16(sf->selector);
+               tmp.s = 1;
        }
+       vmx_set_segment(vcpu, &tmp, seg);
 }
 
 static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2724,10 +2712,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 
        vmx_segment_cache_clear(vmx);
 
-       vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
-       vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
-       vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
-       vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
+       vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
 
        flags = vmcs_readl(GUEST_RFLAGS);
        flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2742,10 +2727,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
        if (emulate_invalid_guest_state)
                return;
 
-       fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
-       fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
-       fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
-       fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
+       fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+       fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+       fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+       fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
 
        vmx_segment_cache_clear(vmx);
 
@@ -2773,14 +2758,10 @@ static gva_t rmode_tss_base(struct kvm *kvm)
        return kvm->arch.tss_addr;
 }
 
-static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+static void fix_rmode_seg(int seg, struct kvm_segment *save)
 {
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
 
-       save->selector = vmcs_read16(sf->selector);
-       save->base = vmcs_readl(sf->base);
-       save->limit = vmcs_read32(sf->limit);
-       save->ar = vmcs_read32(sf->ar_bytes);
        vmcs_write16(sf->selector, save->base >> 4);
        vmcs_write32(sf->base, save->base & 0xffff0);
        vmcs_write32(sf->limit, 0xffff);
@@ -2800,9 +2781,16 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
        if (enable_unrestricted_guest)
                return;
 
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+
        vmx->emulation_required = 1;
        vmx->rmode.vm86_active = 1;
 
+
        /*
         * Very old userspace does not call KVM_SET_TSS_ADDR before entering
         * vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2817,14 +2805,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 
        vmx_segment_cache_clear(vmx);
 
-       vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
-       vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
        vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
-       vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
-       vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 
        flags = vmcs_readl(GUEST_RFLAGS);
@@ -3117,35 +3099,24 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct kvm_save_segment *save;
        u32 ar;
 
        if (vmx->rmode.vm86_active
            && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
                || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
-               || seg == VCPU_SREG_GS)
-           && !emulate_invalid_guest_state) {
-               switch (seg) {
-               case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
-               case VCPU_SREG_ES: save = &vmx->rmode.es; break;
-               case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
-               case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
-               case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
-               default: BUG();
-               }
-               var->selector = save->selector;
-               var->base = save->base;
-               var->limit = save->limit;
-               ar = save->ar;
+               || seg == VCPU_SREG_GS)) {
+               *var = vmx->rmode.segs[seg];
                if (seg == VCPU_SREG_TR
                    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
-                       goto use_saved_rmode_seg;
+                       return;
+               var->base = vmx_read_guest_seg_base(vmx, seg);
+               var->selector = vmx_read_guest_seg_selector(vmx, seg);
+               return;
        }
        var->base = vmx_read_guest_seg_base(vmx, seg);
        var->limit = vmx_read_guest_seg_limit(vmx, seg);
        var->selector = vmx_read_guest_seg_selector(vmx, seg);
        ar = vmx_read_guest_seg_ar(vmx, seg);
-use_saved_rmode_seg:
        if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
                ar = 0;
        var->type = ar & 15;
@@ -3227,23 +3198,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
        u32 ar;
 
        vmx_segment_cache_clear(vmx);
 
        if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
                vmcs_write16(sf->selector, var->selector);
-               vmx->rmode.tr.selector = var->selector;
-               vmx->rmode.tr.base = var->base;
-               vmx->rmode.tr.limit = var->limit;
-               vmx->rmode.tr.ar = vmx_segment_access_rights(var);
+               vmx->rmode.segs[VCPU_SREG_TR] = *var;
                return;
        }
        vmcs_writel(sf->base, var->base);
        vmcs_write32(sf->limit, var->limit);
        vmcs_write16(sf->selector, var->selector);
        if (vmx->rmode.vm86_active && var->s) {
+               vmx->rmode.segs[seg] = *var;
                /*
                 * Hack real-mode segments into vm86 compatibility.
                 */
@@ -3258,7 +3227,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
         * qemu binaries.
         *   IA32 arch specifies that at the time of processor reset the
         * "Accessed" bit in the AR field of segment registers is 1. And qemu
-        * is setting it to 0 in the usedland code. This causes invalid guest
+        * is setting it to 0 in the userland code. This causes invalid guest
         * state vmexit when "unrestricted guest" mode is turned on.
         *    Fix for this setup issue in cpu_reset is being pushed in the qemu
         * tree. Newer qemu binaries with that qemu fix would not need this
@@ -3288,16 +3257,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
                                     vmcs_readl(GUEST_CS_BASE) >> 4);
                        break;
                case VCPU_SREG_ES:
-                       fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
-                       break;
                case VCPU_SREG_DS:
-                       fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
-                       break;
                case VCPU_SREG_GS:
-                       fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
-                       break;
                case VCPU_SREG_FS:
-                       fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+                       fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
                        break;
                case VCPU_SREG_SS:
                        vmcs_write16(GUEST_SS_SELECTOR,
@@ -3351,9 +3314,9 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
 
        if (var.base != (var.selector << 4))
                return false;
-       if (var.limit != 0xffff)
+       if (var.limit < 0xffff)
                return false;
-       if (ar != 0xf3)
+       if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3)
                return false;
 
        return true;
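rmode_segment_valid() is relaxed here: the limit check becomes a lower bound, and the access-rights compare forces DPL to 3 while masking out the G and D/B bits before testing against 0xf3. A self-contained model of the new predicate, with the VMX bit positions written out as assumptions:

#include <stdbool.h>
#include <stdint.h>

/* VMX segment access-rights bit positions, as assumed by this sketch. */
#define AR_DPL_SHIFT 5
#define AR_DB_MASK   (1u << 14)
#define AR_G_MASK    (1u << 15)

/* A real-mode data segment is acceptable if, after forcing DPL to 3
 * and ignoring the granularity and default-size bits, the access
 * rights equal 0xf3 (present, S=1, read/write accessed data). */
static bool rmode_ar_ok(uint32_t ar)
{
        return ((ar | (3u << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) == 0xf3;
}
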
@@ -3605,7 +3568,7 @@ out:
 
 static void seg_setup(int seg)
 {
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
        unsigned int ar;
 
        vmcs_write16(sf->selector, 0);
@@ -3770,8 +3733,7 @@ static void vmx_set_constant_host_state(void)
        native_store_idt(&dt);
        vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
 
-       asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
-       vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
+       vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
 
        rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
        vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
@@ -4005,8 +3967,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                kvm_rip_write(vcpu, 0);
        kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
-       vmcs_writel(GUEST_DR7, 0x400);
-
        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
 
@@ -4456,7 +4416,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[2] = 0xc1;
 }
 
-/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 {
        if (to_vmx(vcpu)->nested.vmxon &&
@@ -5701,7 +5661,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
  * to be done to userspace and return 0.
  */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
        [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
        [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
@@ -6229,17 +6189,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                                        msrs[i].host);
 }
 
-#ifdef CONFIG_X86_64
-#define R "r"
-#define Q "q"
-#else
-#define R "e"
-#define Q "l"
-#endif
-
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long debugctlmsr;
 
        if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6279,34 +6232,35 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                vmx_set_interrupt_shadow(vcpu, 0);
 
        atomic_switch_perf_msrs(vmx);
+       debugctlmsr = get_debugctlmsr();
 
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
-               "push %%"R"dx; push %%"R"bp;"
-               "push %%"R"cx \n\t" /* placeholder for guest rcx */
-               "push %%"R"cx \n\t"
-               "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+               "push %%" _ASM_DX "; push %%" _ASM_BP ";"
+               "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
+               "push %%" _ASM_CX " \n\t"
+               "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
                "je 1f \n\t"
-               "mov %%"R"sp, %c[host_rsp](%0) \n\t"
+               "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
                __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
                "1: \n\t"
                /* Reload cr2 if changed */
-               "mov %c[cr2](%0), %%"R"ax \n\t"
-               "mov %%cr2, %%"R"dx \n\t"
-               "cmp %%"R"ax, %%"R"dx \n\t"
+               "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
+               "mov %%cr2, %%" _ASM_DX " \n\t"
+               "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
                "je 2f \n\t"
-               "mov %%"R"ax, %%cr2 \n\t"
+               "mov %%" _ASM_AX", %%cr2 \n\t"
                "2: \n\t"
                /* Check if vmlaunch of vmresume is needed */
                "cmpl $0, %c[launched](%0) \n\t"
                /* Load guest registers.  Don't clobber flags. */
-               "mov %c[rax](%0), %%"R"ax \n\t"
-               "mov %c[rbx](%0), %%"R"bx \n\t"
-               "mov %c[rdx](%0), %%"R"dx \n\t"
-               "mov %c[rsi](%0), %%"R"si \n\t"
-               "mov %c[rdi](%0), %%"R"di \n\t"
-               "mov %c[rbp](%0), %%"R"bp \n\t"
+               "mov %c[rax](%0), %%" _ASM_AX " \n\t"
+               "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
+               "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
+               "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
+               "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
+               "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
 #ifdef CONFIG_X86_64
                "mov %c[r8](%0),  %%r8  \n\t"
                "mov %c[r9](%0),  %%r9  \n\t"
@@ -6317,24 +6271,24 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                "mov %c[r14](%0), %%r14 \n\t"
                "mov %c[r15](%0), %%r15 \n\t"
 #endif
-               "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+               "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
 
                /* Enter guest mode */
-               "jne .Llaunched \n\t"
+               "jne 1f \n\t"
                __ex(ASM_VMX_VMLAUNCH) "\n\t"
-               "jmp .Lkvm_vmx_return \n\t"
-               ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
-               ".Lkvm_vmx_return: "
+               "jmp 2f \n\t"
+               "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+               "2: "
                /* Save guest registers, load host registers, keep flags */
-               "mov %0, %c[wordsize](%%"R"sp) \n\t"
+               "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
                "pop %0 \n\t"
-               "mov %%"R"ax, %c[rax](%0) \n\t"
-               "mov %%"R"bx, %c[rbx](%0) \n\t"
-               "pop"Q" %c[rcx](%0) \n\t"
-               "mov %%"R"dx, %c[rdx](%0) \n\t"
-               "mov %%"R"si, %c[rsi](%0) \n\t"
-               "mov %%"R"di, %c[rdi](%0) \n\t"
-               "mov %%"R"bp, %c[rbp](%0) \n\t"
+               "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
+               "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
+               __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
+               "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
+               "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
+               "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
+               "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
 #ifdef CONFIG_X86_64
                "mov %%r8,  %c[r8](%0) \n\t"
                "mov %%r9,  %c[r9](%0) \n\t"
@@ -6345,11 +6299,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                "mov %%r14, %c[r14](%0) \n\t"
                "mov %%r15, %c[r15](%0) \n\t"
 #endif
-               "mov %%cr2, %%"R"ax   \n\t"
-               "mov %%"R"ax, %c[cr2](%0) \n\t"
+               "mov %%cr2, %%" _ASM_AX "   \n\t"
+               "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
 
-               "pop  %%"R"bp; pop  %%"R"dx \n\t"
+               "pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
                "setbe %c[fail](%0) \n\t"
+               ".pushsection .rodata \n\t"
+               ".global vmx_return \n\t"
+               "vmx_return: " _ASM_PTR " 2b \n\t"
+               ".popsection"
              : : "c"(vmx), "d"((unsigned long)HOST_RSP),
                [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
                [fail]"i"(offsetof(struct vcpu_vmx, fail)),
@@ -6374,12 +6332,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
                [wordsize]"i"(sizeof(ulong))
              : "cc", "memory"
-               , R"ax", R"bx", R"di", R"si"
 #ifdef CONFIG_X86_64
+               , "rax", "rbx", "rdi", "rsi"
                , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#else
+               , "eax", "ebx", "edi", "esi"
 #endif
              );
 
+       /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+       if (debugctlmsr)
+               update_debugctlmsr(debugctlmsr);
+
 #ifndef CONFIG_X86_64
        /*
         * The sysexit path does not restore ds/es, so we must set them to
@@ -6424,9 +6388,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx_complete_interrupts(vmx);
 }
 
-#undef R
-#undef Q
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
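vmx_vcpu_run() now snapshots MSR_IA32_DEBUGCTLMSR before entry and restores it afterwards, since VM exit zeroes the MSR. A user-space model of the save/restore pattern, with the rdmsr/wrmsr wrappers stubbed out:

#include <stdint.h>

static uint64_t fake_debugctl;  /* stands in for the real MSR */

static uint64_t get_debugctl(void)         { return fake_debugctl; }
static void     set_debugctl(uint64_t val) { fake_debugctl = val; }

static void run_guest(void)
{
        /* VM exit clears MSR_IA32_DEBUGCTLMSR on real hardware. */
        fake_debugctl = 0;
}

static void vcpu_run_model(void)
{
        uint64_t debugctl = get_debugctl();  /* snapshot before entry */

        run_guest();

        /* Restore only if something was set, avoiding a write on the
         * common path where the MSR is already zero. */
        if (debugctl)
                set_debugctl(debugctl);
}
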
@@ -7281,7 +7242,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .vcpu_load = vmx_vcpu_load,
        .vcpu_put = vmx_vcpu_put,
 
-       .set_guest_debug = set_guest_debug,
+       .update_db_bp_intercept = update_exception_bitmap,
        .get_msr = vmx_get_msr,
        .set_msr = vmx_set_msr,
        .get_segment_base = vmx_get_segment_base,
index 1f09552572fa15709b51022fdbaa8044b3fdb73d..1eefebe5d72758873df0d13e4ae8c686a7d016ee 100644 (file)
@@ -246,20 +246,14 @@ static void drop_user_return_notifiers(void *ignore)
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 {
-       if (irqchip_in_kernel(vcpu->kvm))
-               return vcpu->arch.apic_base;
-       else
-               return vcpu->arch.apic_base;
+       return vcpu->arch.apic_base;
 }
 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 {
        /* TODO: reserve bits check */
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_lapic_set_base(vcpu, data);
-       else
-               vcpu->arch.apic_base = data;
+       kvm_lapic_set_base(vcpu, data);
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
@@ -698,6 +692,18 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+{
+       unsigned long dr7;
+
+       if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+               dr7 = vcpu->arch.guest_debug_dr7;
+       else
+               dr7 = vcpu->arch.dr7;
+       kvm_x86_ops->set_dr7(vcpu, dr7);
+       vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+}
+
 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
        switch (dr) {
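kvm_update_dr7() centralizes the choice between the host debugger's DR7 image and the guest's own, and recomputes switch_db_regs from the enable bits. A condensed model, with the flag and mask values marked as assumptions of this sketch:

#include <stdbool.h>

#define KVM_GUESTDBG_USE_HW_BP (1u << 2)  /* placeholder value */
#define DR7_BP_EN_MASK         0xffUL     /* L0..G3 enable bits */

struct vcpu_model {
        unsigned long guest_debug;     /* host-side debug flags */
        unsigned long guest_debug_dr7; /* DR7 requested by the debugger */
        unsigned long dr7;             /* DR7 written by the guest */
        unsigned long effective_dr7;
        bool switch_db_regs;
};

static void update_dr7(struct vcpu_model *v)
{
        unsigned long dr7 = (v->guest_debug & KVM_GUESTDBG_USE_HW_BP)
                            ? v->guest_debug_dr7 : v->dr7;

        v->effective_dr7 = dr7;                   /* kvm_x86_ops->set_dr7() */
        v->switch_db_regs = dr7 & DR7_BP_EN_MASK; /* any breakpoint armed? */
}
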
@@ -723,10 +729,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
                if (val & 0xffffffff00000000ULL)
                        return -1; /* #GP */
                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
-               if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-                       kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
-                       vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
-               }
+               kvm_update_dr7(vcpu);
                break;
        }
 
@@ -823,7 +826,7 @@ static u32 msrs_to_save[] = {
 
 static unsigned num_msrs_to_save;
 
-static u32 emulated_msrs[] = {
+static const u32 emulated_msrs[] = {
        MSR_IA32_TSCDEADLINE,
        MSR_IA32_MISC_ENABLE,
        MSR_IA32_MCG_STATUS,
@@ -1097,7 +1100,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
                 * For each generation, we track the original measured
                 * nanosecond time, offset, and write, so if TSCs are in
                 * sync, we can match exact offset, and if not, we can match
-                * exact software computaion in compute_guest_tsc()
+                * exact software computation in compute_guest_tsc()
                 *
                 * These values are tracked in kvm->arch.cur_xxx variables.
                 */
@@ -1140,6 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        unsigned long this_tsc_khz;
        s64 kernel_ns, max_kernel_ns;
        u64 tsc_timestamp;
+       u8 pvclock_flags;
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
@@ -1221,7 +1225,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
        vcpu->last_kernel_ns = kernel_ns;
        vcpu->last_guest_tsc = tsc_timestamp;
-       vcpu->hv_clock.flags = 0;
+
+       pvclock_flags = 0;
+       if (vcpu->pvclock_set_guest_stopped_request) {
+               pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+               vcpu->pvclock_set_guest_stopped_request = false;
+       }
+
+       vcpu->hv_clock.flags = pvclock_flags;
 
        /*
         * The interface expects us to write an even number signaling that the
@@ -1504,7 +1515,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 {
        gpa_t gpa = data & ~0x3f;
 
-       /* Bits 2:5 are resrved, Should be zero */
+       /* Bits 2:5 are reserved, should be zero */
        if (data & 0x3c)
                return 1;
 
@@ -1639,10 +1650,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                vcpu->arch.time_page =
                                gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 
-               if (is_error_page(vcpu->arch.time_page)) {
-                       kvm_release_page_clean(vcpu->arch.time_page);
+               if (is_error_page(vcpu->arch.time_page))
                        vcpu->arch.time_page = NULL;
-               }
+
                break;
        }
        case MSR_KVM_ASYNC_PF_EN:
@@ -1727,7 +1737,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                 * Ignore all writes to this no longer documented MSR.
                 * Writes are only relevant for old K7 processors,
                 * all pre-dating SVM, but a recommended workaround from
-                * AMD for these chips. It is possible to speicify the
+                * AMD for these chips. It is possible to specify the
                 * affected processor models on the command line, hence
                 * the need to ignore the workaround.
                 */
@@ -2177,6 +2187,8 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_GET_TSC_KHZ:
        case KVM_CAP_PCI_2_3:
        case KVM_CAP_KVMCLOCK_CTRL:
+       case KVM_CAP_READONLY_MEM:
+       case KVM_CAP_IRQFD_RESAMPLE:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -2358,8 +2370,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
-       memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
-       kvm_apic_post_state_restore(vcpu);
+       kvm_apic_post_state_restore(vcpu, s);
        update_cr8_intercept(vcpu);
 
        return 0;
@@ -2368,7 +2379,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
                                    struct kvm_interrupt *irq)
 {
-       if (irq->irq < 0 || irq->irq >= 256)
+       if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
                return -EINVAL;
        if (irqchip_in_kernel(vcpu->kvm))
                return -ENXIO;
@@ -2635,11 +2646,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
  */
 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 {
-       struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
        if (!vcpu->arch.time_page)
                return -EINVAL;
-       src->flags |= PVCLOCK_GUEST_STOPPED;
-       mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
+       vcpu->arch.pvclock_set_guest_stopped_request = true;
        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
        return 0;
 }
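kvm_set_guest_paused() no longer writes the shared pvclock page directly; it records a request that the next kvm_guest_time_update() consumes exactly once. A small model of the deferred-flag handshake, with the flag value marked as an assumption:

#include <stdbool.h>
#include <stdint.h>

#define PVCLOCK_GUEST_STOPPED (1u << 1)  /* assumed bit value */

struct clock_model {
        bool    stopped_request;  /* set by the KVM_KVMCLOCK_CTRL path */
        uint8_t published_flags;  /* what the guest sees in hv_clock */
};

static int set_guest_paused(struct clock_model *c, bool has_time_page)
{
        if (!has_time_page)
                return -1;             /* -EINVAL in the kernel */
        c->stopped_request = true;     /* consumed at next clock update */
        return 0;
}

static void guest_time_update(struct clock_model *c)
{
        uint8_t flags = 0;

        if (c->stopped_request) {      /* publish exactly once */
                flags |= PVCLOCK_GUEST_STOPPED;
                c->stopped_request = false;
        }
        c->published_flags = flags;
}
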
@@ -3090,7 +3099,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
        if (!kvm->arch.vpit)
                return -ENXIO;
        mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+       kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return 0;
 }
@@ -3173,6 +3182,16 @@ out:
        return r;
 }
 
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+       if (!irqchip_in_kernel(kvm))
+               return -ENXIO;
+
+       irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+                                       irq_event->irq, irq_event->level);
+       return 0;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -3279,29 +3298,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        create_pit_unlock:
                mutex_unlock(&kvm->slots_lock);
                break;
-       case KVM_IRQ_LINE_STATUS:
-       case KVM_IRQ_LINE: {
-               struct kvm_irq_level irq_event;
-
-               r = -EFAULT;
-               if (copy_from_user(&irq_event, argp, sizeof irq_event))
-                       goto out;
-               r = -ENXIO;
-               if (irqchip_in_kernel(kvm)) {
-                       __s32 status;
-                       status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-                                       irq_event.irq, irq_event.level);
-                       if (ioctl == KVM_IRQ_LINE_STATUS) {
-                               r = -EFAULT;
-                               irq_event.status = status;
-                               if (copy_to_user(argp, &irq_event,
-                                                       sizeof irq_event))
-                                       goto out;
-                       }
-                       r = 0;
-               }
-               break;
-       }
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
                struct kvm_irqchip *chip;
@@ -3689,20 +3685,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
                                gpa_t *gpa, struct x86_exception *exception,
                                bool write)
 {
-       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+               | (write ? PFERR_WRITE_MASK : 0);
 
-       if (vcpu_match_mmio_gva(vcpu, gva) &&
-                 check_write_user_access(vcpu, write, access,
-                 vcpu->arch.access)) {
+       if (vcpu_match_mmio_gva(vcpu, gva)
+           && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
                *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
                                        (gva & (PAGE_SIZE - 1));
                trace_vcpu_match_mmio(gva, *gpa, write, false);
                return 1;
        }
 
-       if (write)
-               access |= PFERR_WRITE_MASK;
-
        *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 
        if (*gpa == UNMAPPED_GVA)
@@ -3790,14 +3783,14 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
        return X86EMUL_CONTINUE;
 }
 
-static struct read_write_emulator_ops read_emultor = {
+static const struct read_write_emulator_ops read_emultor = {
        .read_write_prepare = read_prepare,
        .read_write_emulate = read_emulate,
        .read_write_mmio = vcpu_mmio_read,
        .read_write_exit_mmio = read_exit_mmio,
 };
 
-static struct read_write_emulator_ops write_emultor = {
+static const struct read_write_emulator_ops write_emultor = {
        .read_write_emulate = write_emulate,
        .read_write_mmio = write_mmio,
        .read_write_exit_mmio = write_exit_mmio,
@@ -3808,7 +3801,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
                                       unsigned int bytes,
                                       struct x86_exception *exception,
                                       struct kvm_vcpu *vcpu,
-                                      struct read_write_emulator_ops *ops)
+                                      const struct read_write_emulator_ops *ops)
 {
        gpa_t gpa;
        int handled, ret;
@@ -3857,7 +3850,7 @@ mmio:
 int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
                        void *val, unsigned int bytes,
                        struct x86_exception *exception,
-                       struct read_write_emulator_ops *ops)
+                       const struct read_write_emulator_ops *ops)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        gpa_t gpa;
@@ -3962,10 +3955,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
                goto emul_write;
 
        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-       if (is_error_page(page)) {
-               kvm_release_page_clean(page);
+       if (is_error_page(page))
                goto emul_write;
-       }
 
        kaddr = kmap_atomic(page);
        kaddr += offset_in_page(gpa);
@@ -4332,7 +4323,19 @@ static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
        kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
 }
 
-static struct x86_emulate_ops emulate_ops = {
+static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
+{
+       return kvm_register_read(emul_to_vcpu(ctxt), reg);
+}
+
+static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
+{
+       kvm_register_write(emul_to_vcpu(ctxt), reg, val);
+}
+
+static const struct x86_emulate_ops emulate_ops = {
+       .read_gpr            = emulator_read_gpr,
+       .write_gpr           = emulator_write_gpr,
        .read_std            = kvm_read_guest_virt_system,
        .write_std           = kvm_write_guest_virt_system,
        .fetch               = kvm_fetch_guest_virt,
@@ -4367,14 +4370,6 @@ static struct x86_emulate_ops emulate_ops = {
        .get_cpuid           = emulator_get_cpuid,
 };
 
-static void cache_all_regs(struct kvm_vcpu *vcpu)
-{
-       kvm_register_read(vcpu, VCPU_REGS_RAX);
-       kvm_register_read(vcpu, VCPU_REGS_RSP);
-       kvm_register_read(vcpu, VCPU_REGS_RIP);
-       vcpu->arch.regs_dirty = ~0;
-}
-
 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 {
        u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
@@ -4401,12 +4396,10 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
                kvm_queue_exception(vcpu, ctxt->exception.vector);
 }
 
-static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
-                             const unsigned long *regs)
+static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
 {
        memset(&ctxt->twobyte, 0,
-              (void *)&ctxt->regs - (void *)&ctxt->twobyte);
-       memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
+              (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
 
        ctxt->fetch.start = 0;
        ctxt->fetch.end = 0;
@@ -4421,14 +4414,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        int cs_db, cs_l;
 
-       /*
-        * TODO: fix emulate.c to use guest_read/write_register
-        * instead of direct ->regs accesses, can save hundred cycles
-        * on Intel for instructions that don't read/change RSP, for
-        * for example.
-        */
-       cache_all_regs(vcpu);
-
        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
        ctxt->eflags = kvm_get_rflags(vcpu);
@@ -4440,7 +4425,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
                                                          X86EMUL_MODE_PROT16;
        ctxt->guest_mode = is_guest_mode(vcpu);
 
-       init_decode_cache(ctxt, vcpu->arch.regs);
+       init_decode_cache(ctxt);
        vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 }
 
@@ -4460,7 +4445,6 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
                return EMULATE_FAIL;
 
        ctxt->eip = ctxt->_eip;
-       memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
        kvm_rip_write(vcpu, ctxt->eip);
        kvm_set_rflags(vcpu, ctxt->eflags);
 
@@ -4493,13 +4477,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 {
        gpa_t gpa;
+       pfn_t pfn;
 
        if (tdp_enabled)
                return false;
 
        /*
         * if emulation was due to access to shadowed page table
-        * and it failed try to unshadow page and re-entetr the
+        * and it failed try to unshadow page and re-enter the
         * guest to let CPU execute the instruction.
         */
        if (kvm_mmu_unprotect_page_virt(vcpu, gva))
@@ -4510,8 +4495,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
        if (gpa == UNMAPPED_GVA)
                return true; /* let cpu generate fault */
 
-       if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+       /*
+        * Do not retry the unhandleable instruction if it faults on the
+        * readonly host memory; otherwise it will go into an infinite loop:
+        * retry instruction -> write #PF -> emulation fail -> retry
+        * instruction -> ...
+        */
+       pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+       if (!is_error_pfn(pfn)) {
+               kvm_release_pfn_clean(pfn);
                return true;
+       }
 
        return false;
 }
@@ -4560,6 +4554,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
        return true;
 }
 
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
+static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+
 int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                            unsigned long cr2,
                            int emulation_type,
@@ -4608,7 +4605,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
           changes registers values  during IO operation */
        if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
                vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
-               memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
+               emulator_invalidate_register_cache(ctxt);
        }
 
 restart:
@@ -4630,13 +4627,16 @@ restart:
        } else if (vcpu->arch.pio.count) {
                if (!vcpu->arch.pio.in)
                        vcpu->arch.pio.count = 0;
-               else
+               else {
                        writeback = false;
+                       vcpu->arch.complete_userspace_io = complete_emulated_pio;
+               }
                r = EMULATE_DO_MMIO;
        } else if (vcpu->mmio_needed) {
                if (!vcpu->mmio_is_write)
                        writeback = false;
                r = EMULATE_DO_MMIO;
+               vcpu->arch.complete_userspace_io = complete_emulated_mmio;
        } else if (r == EMULATION_RESTART)
                goto restart;
        else
@@ -4646,7 +4646,6 @@ restart:
                toggle_interruptibility(vcpu, ctxt->interruptibility);
                kvm_set_rflags(vcpu, ctxt->eflags);
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-               memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
                kvm_rip_write(vcpu, ctxt->eip);
        } else
@@ -4929,6 +4928,7 @@ int kvm_arch_init(void *opaque)
        if (cpu_has_xsave)
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
+       kvm_lapic_init();
        return 0;
 
 out:
@@ -5499,6 +5499,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
        return r;
 }
 
+static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
+{
+       int r;
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+       r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+       if (r != EMULATE_DONE)
+               return 0;
+       return 1;
+}
+
+static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+{
+       BUG_ON(!vcpu->arch.pio.count);
+
+       return complete_emulated_io(vcpu);
+}
+
 /*
  * Implements the following, as a state machine:
  *
@@ -5515,47 +5533,37 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
  *      copy data
  *      exit
  */
-static int complete_mmio(struct kvm_vcpu *vcpu)
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
        struct kvm_mmio_fragment *frag;
-       int r;
 
-       if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
-               return 1;
+       BUG_ON(!vcpu->mmio_needed);
 
-       if (vcpu->mmio_needed) {
-               /* Complete previous fragment */
-               frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
-               if (!vcpu->mmio_is_write)
-                       memcpy(frag->data, run->mmio.data, frag->len);
-               if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
-                       vcpu->mmio_needed = 0;
-                       if (vcpu->mmio_is_write)
-                               return 1;
-                       vcpu->mmio_read_completed = 1;
-                       goto done;
-               }
-               /* Initiate next fragment */
-               ++frag;
-               run->exit_reason = KVM_EXIT_MMIO;
-               run->mmio.phys_addr = frag->gpa;
+       /* Complete previous fragment */
+       frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
+       if (!vcpu->mmio_is_write)
+               memcpy(frag->data, run->mmio.data, frag->len);
+       if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+               vcpu->mmio_needed = 0;
                if (vcpu->mmio_is_write)
-                       memcpy(run->mmio.data, frag->data, frag->len);
-               run->mmio.len = frag->len;
-               run->mmio.is_write = vcpu->mmio_is_write;
-               return 0;
-
-       }
-done:
-       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-       r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
-       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-       if (r != EMULATE_DONE)
-               return 0;
-       return 1;
+                       return 1;
+               vcpu->mmio_read_completed = 1;
+               return complete_emulated_io(vcpu);
+       }
+       /* Initiate next fragment */
+       ++frag;
+       run->exit_reason = KVM_EXIT_MMIO;
+       run->mmio.phys_addr = frag->gpa;
+       if (vcpu->mmio_is_write)
+               memcpy(run->mmio.data, frag->data, frag->len);
+       run->mmio.len = frag->len;
+       run->mmio.is_write = vcpu->mmio_is_write;
+       vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+       return 0;
 }
 
+
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        int r;
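The one-shot complete_userspace_io callback replaces complete_mmio(): each serviced MMIO fragment either finishes the access or re-arms the callback for the next exit. A compact model of the state machine:

#include <stddef.h>

struct vcpu_model {
        int mmio_cur_fragment;
        int mmio_nr_fragments;
        int (*complete_userspace_io)(struct vcpu_model *);
};

/* Returns 1 when the emulated access is finished, 0 when another
 * round trip to userspace is needed. */
static int complete_emulated_mmio_model(struct vcpu_model *v)
{
        /* the fragment userspace just serviced is done */
        if (++v->mmio_cur_fragment == v->mmio_nr_fragments)
                return 1;

        /* hand out the next fragment and re-arm the one-shot callback */
        v->complete_userspace_io = complete_emulated_mmio_model;
        return 0;
}

static int vcpu_run_model(struct vcpu_model *v)
{
        if (v->complete_userspace_io) {
                int (*cui)(struct vcpu_model *) = v->complete_userspace_io;

                v->complete_userspace_io = NULL;  /* consume before calling */
                if (cui(v) <= 0)
                        return 0;                 /* back out to userspace */
        }
        return 1;                                 /* would enter the guest */
}
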
@@ -5582,9 +5590,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                }
        }
 
-       r = complete_mmio(vcpu);
-       if (r <= 0)
-               goto out;
+       if (unlikely(vcpu->arch.complete_userspace_io)) {
+               int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+               vcpu->arch.complete_userspace_io = NULL;
+               r = cui(vcpu);
+               if (r <= 0)
+                       goto out;
+       } else
+               WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
        r = __vcpu_run(vcpu);
 
@@ -5602,12 +5615,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
                /*
                 * We are here if userspace calls get_regs() in the middle of
                 * instruction emulation. Registers state needs to be copied
-                * back from emulation context to vcpu. Usrapace shouldn't do
+                * back from emulation context to vcpu. Userspace shouldn't do
                 * that usually, but some bad designed PV devices (vmware
                 * backdoor interface) need this to work
                 */
-               struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-               memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+               emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
        }
        regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5747,7 +5759,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
        if (ret)
                return EMULATE_FAIL;
 
-       memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
        kvm_rip_write(vcpu, ctxt->eip);
        kvm_set_rflags(vcpu, ctxt->eflags);
        kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5799,7 +5810,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        if (mmu_reset_needed)
                kvm_mmu_reset_context(vcpu);
 
-       max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+       max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
        if (pending_vec < max_bits) {
@@ -5859,13 +5870,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
                for (i = 0; i < KVM_NR_DB_REGS; ++i)
                        vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
-               vcpu->arch.switch_db_regs =
-                       (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+               vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
        } else {
                for (i = 0; i < KVM_NR_DB_REGS; i++)
                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
-               vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
        }
+       kvm_update_dr7(vcpu);
 
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
@@ -5877,7 +5887,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
         */
        kvm_set_rflags(vcpu, rflags);
 
-       kvm_x86_ops->set_guest_debug(vcpu, dbg);
+       kvm_x86_ops->update_db_bp_intercept(vcpu);
 
        r = 0;
 
@@ -6023,7 +6033,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        int r;
 
        vcpu->arch.mtrr_state.have_fixed = 1;
-       vcpu_load(vcpu);
+       r = vcpu_load(vcpu);
+       if (r)
+               return r;
        r = kvm_arch_vcpu_reset(vcpu);
        if (r == 0)
                r = kvm_mmu_setup(vcpu);
@@ -6034,9 +6046,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+       int r;
        vcpu->arch.apf.msr_val = 0;
 
-       vcpu_load(vcpu);
+       r = vcpu_load(vcpu);
+       BUG_ON(r);
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
 
@@ -6050,10 +6064,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
        vcpu->arch.nmi_pending = 0;
        vcpu->arch.nmi_injected = false;
 
-       vcpu->arch.switch_db_regs = 0;
        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
        vcpu->arch.dr6 = DR6_FIXED_1;
        vcpu->arch.dr7 = DR7_FIXED_1;
+       kvm_update_dr7(vcpu);
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        vcpu->arch.apf.msr_val = 0;
@@ -6132,7 +6146,7 @@ int kvm_arch_hardware_enable(void *garbage)
         * as we reset last_host_tsc on all VCPUs to stop this from being
         * called multiple times (one for each physical CPU bringup).
         *
-        * Platforms with unnreliable TSCs don't have to deal with this, they
+        * Platforms with unreliable TSCs don't have to deal with this, they
         * will be compensated by the logic in vcpu_load, which sets the TSC to
         * catchup mode.  This will catchup all VCPUs to real time, but cannot
         * guarantee that they stay in perfect synchronization.
@@ -6185,6 +6199,8 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
        return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
 }
 
+struct static_key kvm_no_apic_vcpu __read_mostly;
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
        struct page *page;
@@ -6217,7 +6233,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                r = kvm_create_lapic(vcpu);
                if (r < 0)
                        goto fail_mmu_destroy;
-       }
+       } else
+               static_key_slow_inc(&kvm_no_apic_vcpu);
 
        vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
                                       GFP_KERNEL);
@@ -6257,6 +6274,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
        kvm_mmu_destroy(vcpu);
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        free_page((unsigned long)vcpu->arch.pio_data);
+       if (!irqchip_in_kernel(vcpu->kvm))
+               static_key_slow_dec(&kvm_no_apic_vcpu);
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@ -6269,15 +6288,21 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
        set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+       /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+       set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+               &kvm->arch.irq_sources_bitmap);
 
        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+       mutex_init(&kvm->arch.apic_map_lock);
 
        return 0;
 }
 
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 {
-       vcpu_load(vcpu);
+       int r;
+       r = vcpu_load(vcpu);
+       BUG_ON(r);
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
 }
@@ -6321,6 +6346,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
                put_page(kvm->arch.apic_access_page);
        if (kvm->arch.ept_identity_pagetable)
                put_page(kvm->arch.ept_identity_pagetable);
+       kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 }
 
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
@@ -6328,10 +6354,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 {
        int i;
 
-       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-               if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
-                       kvm_kvfree(free->arch.lpage_info[i]);
-                       free->arch.lpage_info[i] = NULL;
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+                       kvm_kvfree(free->arch.rmap[i]);
+                       free->arch.rmap[i] = NULL;
+               }
+               if (i == 0)
+                       continue;
+
+               if (!dont || free->arch.lpage_info[i - 1] !=
+                            dont->arch.lpage_info[i - 1]) {
+                       kvm_kvfree(free->arch.lpage_info[i - 1]);
+                       free->arch.lpage_info[i - 1] = NULL;
                }
        }
 }
@@ -6340,23 +6374,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 {
        int i;
 
-       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                unsigned long ugfn;
                int lpages;
-               int level = i + 2;
+               int level = i + 1;
 
                lpages = gfn_to_index(slot->base_gfn + npages - 1,
                                      slot->base_gfn, level) + 1;
 
-               slot->arch.lpage_info[i] =
-                       kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
-               if (!slot->arch.lpage_info[i])
+               slot->arch.rmap[i] =
+                       kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
+               if (!slot->arch.rmap[i])
+                       goto out_free;
+               if (i == 0)
+                       continue;
+
+               slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
+                                       sizeof(*slot->arch.lpage_info[i - 1]));
+               if (!slot->arch.lpage_info[i - 1])
                        goto out_free;
 
                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i][0].write_count = 1;
+                       slot->arch.lpage_info[i - 1][0].write_count = 1;
                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+                       slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
@@ -6368,16 +6409,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
                        unsigned long j;
 
                        for (j = 0; j < lpages; ++j)
-                               slot->arch.lpage_info[i][j].write_count = 1;
+                               slot->arch.lpage_info[i - 1][j].write_count = 1;
                }
        }
 
        return 0;
 
 out_free:
-       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-               kvm_kvfree(slot->arch.lpage_info[i]);
-               slot->arch.lpage_info[i] = NULL;
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               kvm_kvfree(slot->arch.rmap[i]);
+               slot->arch.rmap[i] = NULL;
+               if (i == 0)
+                       continue;
+
+               kvm_kvfree(slot->arch.lpage_info[i - 1]);
+               slot->arch.lpage_info[i - 1] = NULL;
        }
        return -ENOMEM;
 }
@@ -6396,10 +6442,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
        /*To keep backward compatibility with older userspace,
-        *x86 needs to hanlde !user_alloc case.
+        *x86 needs to handle !user_alloc case.
         */
        if (!user_alloc) {
-               if (npages && !old.rmap) {
+               if (npages && !old.npages) {
                        unsigned long userspace_addr;
 
                        userspace_addr = vm_mmap(NULL, 0,
@@ -6427,7 +6473,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
        int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
 
-       if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+       if (!user_alloc && !old.user_alloc && old.npages && !npages) {
                int ret;
 
                ret = vm_munmap(old.userspace_addr,
@@ -6446,14 +6492,28 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
        kvm_mmu_slot_remove_write_access(kvm, mem->slot);
        spin_unlock(&kvm->mmu_lock);
+       /*
+        * If memory slot is created, or moved, we need to clear all
+        * mmio sptes.
+        */
+       if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
+               kvm_mmu_zap_all(kvm);
+               kvm_reload_remote_mmus(kvm);
+       }
 }
 
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
        kvm_mmu_zap_all(kvm);
        kvm_reload_remote_mmus(kvm);
 }
 
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot)
+{
+       kvm_arch_flush_shadow_all(kvm);
+}
+
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
index 3d1134ddb885622af79bdb0fefb589c8f7b4d5b6..2b5219c12ac8566c24a837ccff3f2fb14db1c7f8 100644 (file)
@@ -124,4 +124,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 
 extern u64 host_xcr0;
 
+extern struct static_key kvm_no_apic_vcpu;
 #endif
index 2ce09aa7d3b3d12b3cb27fbfb2d4896390b3045f..0a6d6ba44c858959cd2ed6b912751c5f36810989 100644 (file)
@@ -101,9 +101,13 @@ struct kvm_userspace_memory_region {
        __u64 userspace_addr; /* start of the userspace allocated memory */
 };
 
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES  1UL
-#define KVM_MEMSLOT_INVALID      (1UL << 1)
+/*
+ * Bits 0 ~ 15 of kvm_memory_region::flags are visible to userspace;
+ * the other bits are reserved for kvm internal use and are defined in
+ * include/linux/kvm_host.h.
+ */
+#define KVM_MEM_LOG_DIRTY_PAGES        (1UL << 0)
+#define KVM_MEM_READONLY       (1UL << 1)
 
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
@@ -618,6 +622,10 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#ifdef __KVM_HAVE_READONLY_MEM
+#define KVM_CAP_READONLY_MEM 81
+#endif
+#define KVM_CAP_IRQFD_RESAMPLE 82
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -683,12 +691,21 @@ struct kvm_xen_hvm_config {
 #endif
 
 #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+/*
+ * Available with KVM_CAP_IRQFD_RESAMPLE
+ *
+ * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies
+ * the irqfd to operate in resampling mode for level triggered interrupt
+ * emulation.  See Documentation/virtual/kvm/api.txt.
+ */
+#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1)
 
 struct kvm_irqfd {
        __u32 fd;
        __u32 gsi;
        __u32 flags;
-       __u8  pad[20];
+       __u32 resamplefd;
+       __u8  pad[16];
 };
 
 struct kvm_clock_data {
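From userspace, a resampling irqfd is armed by setting KVM_IRQFD_FLAG_RESAMPLE and supplying the second eventfd in the struct above. A usage sketch against this uapi; the VM fd, the GSI number and both eventfds are assumed to exist already, and error handling is trimmed:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Attach a resampling irqfd to GSI 5 of an existing VM fd: 'irqfd'
 * injects the interrupt; 'resamplefd' is notified on EOI so the
 * device model can recheck its level and re-trigger if still high. */
static int attach_resample_irqfd(int vm_fd, int irqfd, int resamplefd)
{
        struct kvm_irqfd req = {
                .fd         = irqfd,
                .gsi        = 5,
                .flags      = KVM_IRQFD_FLAG_RESAMPLE,
                .resamplefd = resamplefd,
        };

        return ioctl(vm_fd, KVM_IRQFD, &req);
}
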
index 8a59e0abe5faf6c6c3ba2069b9c1b32b65017cf1..93bfc9f9815c7fa178ad70b555e7a2afb86b3f02 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/rcupdate.h>
 #include <linux/ratelimit.h>
+#include <linux/err.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
 #define KVM_MMIO_SIZE 8
 #endif
 
+/*
+ * Bits 16 ~ 31 of kvm_memory_region::flags are used internally by
+ * kvm; the other bits are visible to userspace and are defined in
+ * include/linux/kvm.h.
+ */
+#define KVM_MEMSLOT_INVALID    (1UL << 16)
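With user-visible flags confined to bits 0 ~ 15 and internal ones to bits 16 ~ 31, a memslot ioctl can reject any flag outside the public range with a single mask. A sketch of such a check; the helper name and the -EINVAL mapping are illustrative:

#include <stdint.h>

#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
#define KVM_MEM_READONLY        (1UL << 1)

/* Bits 0..15 may come from userspace; everything else is refused.
 * (-22 stands in for -EINVAL.) */
static int check_memory_region_flags(uint32_t user_flags)
{
        uint32_t valid_flags = KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY;

        if (user_flags & ~valid_flags)
                return -22;
        return 0;
}
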
+
 /*
  * If we support unaligned MMIO, at most one fragment will be split into two:
  */
 #define KVM_MAX_MMIO_FRAGMENTS \
        (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
 
+/*
+ * For a normal pfn, the highest 12 bits should be zero,
+ * so we can mask those bits to indicate an error.
+ */
+#define KVM_PFN_ERR_MASK       (0xfffULL << 52)
+
+#define KVM_PFN_ERR_FAULT      (KVM_PFN_ERR_MASK)
+#define KVM_PFN_ERR_HWPOISON   (KVM_PFN_ERR_MASK + 1)
+#define KVM_PFN_ERR_BAD                (KVM_PFN_ERR_MASK + 2)
+#define KVM_PFN_ERR_RO_FAULT   (KVM_PFN_ERR_MASK + 3)
+
+static inline bool is_error_pfn(pfn_t pfn)
+{
+       return !!(pfn & KVM_PFN_ERR_MASK);
+}
+
+static inline bool is_noslot_pfn(pfn_t pfn)
+{
+       return pfn == KVM_PFN_ERR_BAD;
+}
+
+static inline bool is_invalid_pfn(pfn_t pfn)
+{
+       return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
+}
+
+#define KVM_HVA_ERR_BAD                (PAGE_OFFSET)
+#define KVM_HVA_ERR_RO_BAD     (PAGE_OFFSET + PAGE_SIZE)
+
+static inline bool kvm_is_error_hva(unsigned long addr)
+{
+       return addr >= PAGE_OFFSET;
+}
+
+#define KVM_ERR_PTR_BAD_PAGE   (ERR_PTR(-ENOENT))
+
+static inline bool is_error_page(struct page *page)
+{
+       return IS_ERR(page);
+}
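The relationship between the three pfn predicates is worth spelling out: every special encoding is an "error" pfn, KVM_PFN_ERR_BAD alone means "no slot", and "invalid" covers the error encodings that are not the no-slot case. A self-contained demonstration:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t pfn_t;

#define KVM_PFN_ERR_MASK  (0xfffULL << 52)
#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK)
#define KVM_PFN_ERR_BAD   (KVM_PFN_ERR_MASK + 2)

static bool is_error_pfn(pfn_t pfn)   { return !!(pfn & KVM_PFN_ERR_MASK); }
static bool is_noslot_pfn(pfn_t pfn)  { return pfn == KVM_PFN_ERR_BAD; }
static bool is_invalid_pfn(pfn_t pfn) { return !is_noslot_pfn(pfn) && is_error_pfn(pfn); }

int main(void)
{
        assert(!is_error_pfn(0x1234));            /* normal pfn */
        assert(is_error_pfn(KVM_PFN_ERR_FAULT));  /* fault: error and invalid */
        assert(is_invalid_pfn(KVM_PFN_ERR_FAULT));
        assert(is_noslot_pfn(KVM_PFN_ERR_BAD));   /* no slot: error, not invalid */
        assert(!is_invalid_pfn(KVM_PFN_ERR_BAD));
        return 0;
}
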
+
 /*
  * vcpu->requests bit members
  */
 #define KVM_REQ_PMU               16
 #define KVM_REQ_PMI               17
 
-#define KVM_USERSPACE_IRQ_SOURCE_ID    0
+#define KVM_USERSPACE_IRQ_SOURCE_ID            0
+#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID       1
 
 struct kvm;
 struct kvm_vcpu;
@@ -183,6 +233,18 @@ struct kvm_vcpu {
        } async_pf;
 #endif
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+       /*
+        * CPU relax intercept or pause loop exit optimization.
+        * in_spin_loop: set when a vcpu does a pause loop exit
+        *  or a cpu relax intercept.
+        * dy_eligible: indicates whether vcpu is eligible for directed yield.
+        */
+       struct {
+               bool in_spin_loop;
+               bool dy_eligible;
+       } spin_loop;
+#endif
        struct kvm_vcpu_arch arch;
 };
 
@@ -201,7 +263,6 @@ struct kvm_memory_slot {
        gfn_t base_gfn;
        unsigned long npages;
        unsigned long flags;
-       unsigned long *rmap;
        unsigned long *dirty_bitmap;
        struct kvm_arch_memory_slot arch;
        unsigned long userspace_addr;
@@ -283,6 +344,8 @@ struct kvm {
        struct {
                spinlock_t        lock;
                struct list_head  items;
+               struct list_head  resampler_list;
+               struct mutex      resampler_lock;
        } irqfds;
        struct list_head ioeventfds;
 #endif
@@ -348,7 +411,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
 
-void vcpu_load(struct kvm_vcpu *vcpu);
+int __must_check vcpu_load(struct kvm_vcpu *vcpu);
 void vcpu_put(struct kvm_vcpu *vcpu);
 
 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
@@ -378,23 +441,6 @@ id_to_memslot(struct kvm_memslots *slots, int id)
        return slot;
 }
 
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-
-extern struct page *bad_page;
-extern struct page *fault_page;
-
-extern pfn_t bad_pfn;
-extern pfn_t fault_pfn;
-
-int is_error_page(struct page *page);
-int is_error_pfn(pfn_t pfn);
-int is_hwpoison_pfn(pfn_t pfn);
-int is_fault_pfn(pfn_t pfn);
-int is_noslot_pfn(pfn_t pfn);
-int is_invalid_pfn(pfn_t pfn);
-int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
                          struct kvm_userspace_memory_region *mem,
                          int user_alloc);
@@ -415,28 +461,33 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                                int user_alloc);
 bool kvm_largepages_enabled(void);
 void kvm_disable_largepages(void);
-void kvm_arch_flush_shadow(struct kvm *kvm);
+/* flush all memory translations */
+void kvm_arch_flush_shadow_all(struct kvm *kvm);
+/* flush memory translations pointing to 'slot' */
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot);
 
 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
                            int nr_pages);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
                       bool write_fault, bool *writable);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
-                        struct kvm_memory_slot *slot, gfn_t gfn);
-void kvm_release_pfn_dirty(pfn_t);
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+
+void kvm_release_pfn_dirty(pfn_t pfn);
 void kvm_release_pfn_clean(pfn_t pfn);
 void kvm_set_pfn_dirty(pfn_t pfn);
 void kvm_set_pfn_accessed(pfn_t pfn);
@@ -494,6 +545,7 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
                                   struct
                                   kvm_userspace_memory_region *mem,
                                   int user_alloc);
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg);
 
@@ -573,7 +625,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
-int kvm_is_mmio_pfn(pfn_t pfn);
+bool kvm_is_mmio_pfn(pfn_t pfn);
 
 struct kvm_irq_ack_notifier {
        struct hlist_node link;
@@ -728,6 +780,12 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
        return search_memslots(slots, gfn);
 }
 
+static inline unsigned long
+__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+       return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
 static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
 {
        return gfn_to_memslot(kvm, gfn)->id;
@@ -740,10 +798,12 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
                (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
 
-static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
-                                              gfn_t gfn)
+static inline gfn_t
+hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
 {
-       return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+       gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
+
+       return slot->base_gfn + gfn_offset;
 }
 
 static inline gpa_t gfn_to_gpa(gfn_t gfn)
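The pair of helpers in the hunks above is pure arithmetic: an hva is the slot's userspace_addr plus the byte offset of the gfn within the slot, and hva_to_gfn_memslot inverts that. A standalone sketch of the same math (user-space C; the slot values are made up for illustration):

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct memslot {
        uint64_t base_gfn;              /* first guest frame in the slot */
        unsigned long userspace_addr;   /* hva backing that frame */
};

static unsigned long slot_gfn_to_hva(struct memslot *s, uint64_t gfn)
{
        return s->userspace_addr + (gfn - s->base_gfn) * PAGE_SIZE;
}

static uint64_t slot_hva_to_gfn(struct memslot *s, unsigned long hva)
{
        return s->base_gfn + ((hva - s->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
        struct memslot s = { .base_gfn = 0x100, .userspace_addr = 0x7f0000000000UL };

        /* gfn -> hva -> gfn must round-trip */
        assert(slot_hva_to_gfn(&s, slot_gfn_to_hva(&s, 0x123)) == 0x123);
        return 0;
}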
@@ -899,5 +959,32 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
        }
 }
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+       vcpu->spin_loop.in_spin_loop = val;
+}
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+       vcpu->spin_loop.dy_eligible = val;
+}
+
+#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+       return true;
+}
+
+#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 #endif
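The new error values sit in the top bits of a pfn (bit 52 and above), far beyond any real page frame, so one mask test separates every error pfn from a valid one while the individual values stay distinguishable. A user-space sketch mirroring the encoding introduced above:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t pfn_t;

#define PFN_ERR_MASK      (0xfffULL << 52)
#define PFN_ERR_FAULT     (PFN_ERR_MASK)
#define PFN_ERR_HWPOISON  (PFN_ERR_MASK + 1)
#define PFN_ERR_BAD       (PFN_ERR_MASK + 2)  /* no memslot */
#define PFN_ERR_RO_FAULT  (PFN_ERR_MASK + 3)  /* write to a read-only slot */

static bool is_error_pfn(pfn_t pfn)   { return !!(pfn & PFN_ERR_MASK); }
static bool is_noslot_pfn(pfn_t pfn)  { return pfn == PFN_ERR_BAD; }
static bool is_invalid_pfn(pfn_t pfn) { return !is_noslot_pfn(pfn) && is_error_pfn(pfn); }

int main(void)
{
        assert(!is_error_pfn(0x1234));          /* ordinary page frame */
        assert(is_error_pfn(PFN_ERR_HWPOISON));
        assert(is_noslot_pfn(PFN_ERR_BAD) && !is_invalid_pfn(PFN_ERR_BAD));
        assert(is_invalid_pfn(PFN_ERR_RO_FAULT));
        return 0;
}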
 
index 43049192b5ec9bcb57863b3729b3cb6ac18b8d0c..60f48fa0fd0dddf57b7e4ae80a2d45816923b6ce 100644 (file)
@@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key,
        key->timeout = rl;
        INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
 }
+EXPORT_SYMBOL_GPL(jump_label_rate_limit);
 
 static int addr_conflict(struct jump_entry *entry, void *start, void *end)
 {
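The new export lets modules (kvm.ko being the consumer in this series) rate-limit deferred static keys. A hedged sketch of the call pattern, assuming the static_key_deferred API of this kernel generation; the key name is hypothetical:

/* Sketch only: a deferred static key whose disables are batched. */
static struct static_key_deferred example_key;   /* hypothetical key */

static void example_setup(void)
{
        /* coalesce slow-path disables to at most one update per second */
        jump_label_rate_limit(&example_key, HZ);
        static_key_slow_inc(&example_key.key);       /* enable right away */
}

static void example_teardown(void)
{
        static_key_slow_dec_deferred(&example_key);  /* drop after timeout */
}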
index 28694f4a91398998f569c234c7faee104a945259..d01b24b72c61e75f3225776541c2f99824df443e 100644 (file)
@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
 
 config HAVE_KVM_MSI
        bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+       bool
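The symbol is selected rather than user-visible: an architecture that implements pause-loop-exit or cpu-relax interception opts in from its own KVM Kconfig (x86 and s390 do so in this series). An illustrative excerpt, not the full x86 entry:

# arch/x86/kvm/Kconfig (illustrative excerpt)
config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
        select HAVE_KVM_CPU_RELAX_INTERCEPT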
index 74268b4c2ee167932d514281bf6ecc3187462a3e..ea475cd035112a9db93ffa028a552df9be0724af 100644 (file)
@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
                        list_entry(vcpu->async_pf.done.next,
                                   typeof(*work), link);
                list_del(&work->link);
-               if (work->page)
-                       put_page(work->page);
+               if (!is_error_page(work->page))
+                       kvm_release_page_clean(work->page);
                kmem_cache_free(async_pf_cache, work);
        }
        spin_unlock(&vcpu->async_pf.lock);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 
                list_del(&work->queue);
                vcpu->async_pf.queued--;
-               if (work->page)
-                       put_page(work->page);
+               if (!is_error_page(work->page))
+                       kvm_release_page_clean(work->page);
                kmem_cache_free(async_pf_cache, work);
        }
 }
@@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
        if (!work)
                return -ENOMEM;
 
-       work->page = bad_page;
-       get_page(bad_page);
+       work->page = KVM_ERR_PTR_BAD_PAGE;
        INIT_LIST_HEAD(&work->queue); /* for list_del to work */
 
        spin_lock(&vcpu->async_pf.lock);
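With bad_page gone, work->page now carries KVM_ERR_PTR_BAD_PAGE, an ERR_PTR-encoded value, so the free paths above test is_error_page() instead of NULL and never take a page reference for the error case. The ERR_PTR trick in isolation, as a user-space sketch of the kernel macros:

#include <assert.h>

/* ERR_PTR encodes a small negative errno at the very top of the
 * address space, where no real kernel object can live. */
#define MAX_ERRNO 4095

static void *ERR_PTR(long err)     { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
        void *page = ERR_PTR(-2);   /* like KVM_ERR_PTR_BAD_PAGE = ERR_PTR(-ENOENT) */

        assert(IS_ERR(page) && PTR_ERR(page) == -2);
        assert(!IS_ERR(&page));     /* a real pointer is not an error */
        return 0;
}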
index 67a35e90384c0ae665923bc707c4ebd5fe3c8c22..9718e98d6d2a84956d113bd43f55d804bd30d049 100644 (file)
  * --------------------------------------------------------------------
  */
 
+/*
+ * Resampling irqfds are a special variety of irqfds used to emulate
+ * level triggered interrupts.  The interrupt is asserted on eventfd
+ * trigger.  On acknowledgement through the irq ack notifier, the
+ * interrupt is de-asserted and userspace is notified through the
+ * resamplefd.  All resamplers on the same gsi are de-asserted
+ * together, so we don't need to track the state of each individual
+ * user.  We can also therefore share the same irq source ID.
+ */
+struct _irqfd_resampler {
+       struct kvm *kvm;
+       /*
+        * List of resampling struct _irqfd objects sharing this gsi.
+        * RCU list modified under kvm->irqfds.resampler_lock
+        */
+       struct list_head list;
+       struct kvm_irq_ack_notifier notifier;
+       /*
+        * Entry in the list kvm->irqfds.resampler_list.  Used for sharing
+        * resamplers among irqfds on the same gsi.
+        * Accessed and modified under kvm->irqfds.resampler_lock
+        */
+       struct list_head link;
+};
+
 struct _irqfd {
        /* Used for MSI fast-path */
        struct kvm *kvm;
@@ -52,6 +77,12 @@ struct _irqfd {
        /* Used for level IRQ fast-path */
        int gsi;
        struct work_struct inject;
+       /* The resampler used by this irqfd (resampler-only) */
+       struct _irqfd_resampler *resampler;
+       /* Eventfd notified on resample (resampler-only) */
+       struct eventfd_ctx *resamplefd;
+       /* Entry in list of irqfds for a resampler (resampler-only) */
+       struct list_head resampler_link;
        /* Used for setup/shutdown */
        struct eventfd_ctx *eventfd;
        struct list_head list;
@@ -67,8 +98,58 @@ irqfd_inject(struct work_struct *work)
        struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
        struct kvm *kvm = irqfd->kvm;
 
-       kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
-       kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+       if (!irqfd->resampler) {
+               kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+               kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+       } else
+               kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+                           irqfd->gsi, 1);
+}
+
+/*
+ * Since resampler irqfds share an IRQ source ID, we de-assert once
+ * then notify all of the resampler irqfds using this GSI.  We can't
+ * do multiple de-asserts or we risk racing with incoming re-asserts.
+ */
+static void
+irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
+{
+       struct _irqfd_resampler *resampler;
+       struct _irqfd *irqfd;
+
+       resampler = container_of(kian, struct _irqfd_resampler, notifier);
+
+       kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+                   resampler->notifier.gsi, 0);
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
+               eventfd_signal(irqfd->resamplefd, 1);
+
+       rcu_read_unlock();
+}
+
+static void
+irqfd_resampler_shutdown(struct _irqfd *irqfd)
+{
+       struct _irqfd_resampler *resampler = irqfd->resampler;
+       struct kvm *kvm = resampler->kvm;
+
+       mutex_lock(&kvm->irqfds.resampler_lock);
+
+       list_del_rcu(&irqfd->resampler_link);
+       synchronize_rcu();
+
+       if (list_empty(&resampler->list)) {
+               list_del(&resampler->link);
+               kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+               kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+                           resampler->notifier.gsi, 0);
+               kfree(resampler);
+       }
+
+       mutex_unlock(&kvm->irqfds.resampler_lock);
 }
 
 /*
@@ -92,6 +173,11 @@ irqfd_shutdown(struct work_struct *work)
         */
        flush_work(&irqfd->inject);
 
+       if (irqfd->resampler) {
+               irqfd_resampler_shutdown(irqfd);
+               eventfd_ctx_put(irqfd->resamplefd);
+       }
+
        /*
         * It is now safe to release the object's resources
         */
@@ -203,7 +289,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
        struct kvm_irq_routing_table *irq_rt;
        struct _irqfd *irqfd, *tmp;
        struct file *file = NULL;
-       struct eventfd_ctx *eventfd = NULL;
+       struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
        int ret;
        unsigned int events;
 
@@ -231,6 +317,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 
        irqfd->eventfd = eventfd;
 
+       if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
+               struct _irqfd_resampler *resampler;
+
+               resamplefd = eventfd_ctx_fdget(args->resamplefd);
+               if (IS_ERR(resamplefd)) {
+                       ret = PTR_ERR(resamplefd);
+                       goto fail;
+               }
+
+               irqfd->resamplefd = resamplefd;
+               INIT_LIST_HEAD(&irqfd->resampler_link);
+
+               mutex_lock(&kvm->irqfds.resampler_lock);
+
+               list_for_each_entry(resampler,
+                                   &kvm->irqfds.resampler_list, list) {
+                       if (resampler->notifier.gsi == irqfd->gsi) {
+                               irqfd->resampler = resampler;
+                               break;
+                       }
+               }
+
+               if (!irqfd->resampler) {
+                       resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+                       if (!resampler) {
+                               ret = -ENOMEM;
+                               mutex_unlock(&kvm->irqfds.resampler_lock);
+                               goto fail;
+                       }
+
+                       resampler->kvm = kvm;
+                       INIT_LIST_HEAD(&resampler->list);
+                       resampler->notifier.gsi = irqfd->gsi;
+                       resampler->notifier.irq_acked = irqfd_resampler_ack;
+                       INIT_LIST_HEAD(&resampler->link);
+
+                       list_add(&resampler->link, &kvm->irqfds.resampler_list);
+                       kvm_register_irq_ack_notifier(kvm,
+                                                     &resampler->notifier);
+                       irqfd->resampler = resampler;
+               }
+
+               list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
+               synchronize_rcu();
+
+               mutex_unlock(&kvm->irqfds.resampler_lock);
+       }
+
        /*
         * Install our own custom wake-up handling so we are notified via
         * a callback whenever someone signals the underlying eventfd
@@ -276,6 +410,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
        return 0;
 
 fail:
+       if (irqfd->resampler)
+               irqfd_resampler_shutdown(irqfd);
+
+       if (resamplefd && !IS_ERR(resamplefd))
+               eventfd_ctx_put(resamplefd);
+
        if (eventfd && !IS_ERR(eventfd))
                eventfd_ctx_put(eventfd);
 
@@ -291,6 +431,8 @@ kvm_eventfd_init(struct kvm *kvm)
 {
        spin_lock_init(&kvm->irqfds.lock);
        INIT_LIST_HEAD(&kvm->irqfds.items);
+       INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+       mutex_init(&kvm->irqfds.resampler_lock);
        INIT_LIST_HEAD(&kvm->ioeventfds);
 }
 
@@ -340,7 +482,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
 int
 kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 {
-       if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
+       if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
                return -EINVAL;
 
        if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
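Seen from userspace, the resampling mode is one extra flag and one extra eventfd on the existing KVM_IRQFD ioctl: the trigger fd asserts the level, and KVM signals the resample fd on guest acknowledgement so the device model can re-evaluate whether to re-assert. A minimal sketch (error handling elided; vm_fd is assumed to be an open KVM VM descriptor):

#include <linux/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>

/* Register a level-triggered, resampling irqfd on a GSI. */
static int setup_resample_irqfd(int vm_fd, unsigned int gsi,
                                int *trigger_fd, int *resample_fd)
{
        struct kvm_irqfd irqfd = { 0 };

        *trigger_fd  = eventfd(0, 0);   /* signalled by the device model */
        *resample_fd = eventfd(0, 0);   /* signalled by KVM on guest ACK */

        irqfd.fd         = *trigger_fd;
        irqfd.gsi        = gsi;
        irqfd.flags      = KVM_IRQFD_FLAG_RESAMPLE;
        irqfd.resamplefd = *resample_fd;

        return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}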
index ef61d529a6c48b033cdca2dd249a3dbe8c6356ec..cfb7e4d52dc26d1c832eb2a554d8a8ed1d9d23c3 100644 (file)
@@ -197,28 +197,29 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
        u32 old_irr;
        u32 mask = 1 << irq;
        union kvm_ioapic_redirect_entry entry;
-       int ret = 1;
+       int ret, irq_level;
+
+       BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
 
        spin_lock(&ioapic->lock);
        old_irr = ioapic->irr;
-       if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
-               int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
-                                                    irq_source_id, level);
-               entry = ioapic->redirtbl[irq];
-               irq_level ^= entry.fields.polarity;
-               if (!irq_level)
-                       ioapic->irr &= ~mask;
-               else {
-                       int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
-                       ioapic->irr |= mask;
-                       if ((edge && old_irr != ioapic->irr) ||
-                           (!edge && !entry.fields.remote_irr))
-                               ret = ioapic_service(ioapic, irq);
-                       else
-                               ret = 0; /* report coalesced interrupt */
-               }
-               trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+       irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+                                        irq_source_id, level);
+       entry = ioapic->redirtbl[irq];
+       irq_level ^= entry.fields.polarity;
+       if (!irq_level) {
+               ioapic->irr &= ~mask;
+               ret = 1;
+       } else {
+               int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+               ioapic->irr |= mask;
+               if ((edge && old_irr != ioapic->irr) ||
+                   (!edge && !entry.fields.remote_irr))
+                       ret = ioapic_service(ioapic, irq);
+               else
+                       ret = 0; /* report coalesced interrupt */
        }
+       trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
        spin_unlock(&ioapic->lock);
 
        return ret;
index e9fff9830bf0bf6f2229603516ebdd633996fd20..037cb6730e68eef3171b0e9660d1139b423d96de 100644 (file)
@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
 static void kvm_iommu_put_pages(struct kvm *kvm,
                                gfn_t base_gfn, unsigned long npages);
 
-static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
-                          gfn_t gfn, unsigned long size)
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+                          unsigned long size)
 {
        gfn_t end_gfn;
        pfn_t pfn;
 
-       pfn     = gfn_to_pfn_memslot(kvm, slot, gfn);
+       pfn     = gfn_to_pfn_memslot(slot, gfn);
        end_gfn = gfn + (size >> PAGE_SHIFT);
        gfn    += 1;
 
@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
                return pfn;
 
        while (gfn < end_gfn)
-               gfn_to_pfn_memslot(kvm, slot, gfn++);
+               gfn_to_pfn_memslot(slot, gfn++);
 
        return pfn;
 }
@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
                 * Pin all pages we are about to map in memory. This is
                 * important because we unmap and unpin in 4kb steps later.
                 */
-               pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+               pfn = kvm_pin_pages(slot, gfn, page_size);
                if (is_error_pfn(pfn)) {
                        gfn += 1;
                        continue;
@@ -300,6 +300,12 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 
                /* Get physical address */
                phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+
+               if (!phys) {
+                       gfn++;
+                       continue;
+               }
+
                pfn  = phys >> PAGE_SHIFT;
 
                /* Unmap address from IO address space */
index 83402d74a767bec214d2c2467cceb3a271a6be44..2eb58af7ee99268b81a01680db591ea4991c47eb 100644 (file)
@@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
        struct kvm_vcpu *vcpu, *lowest = NULL;
 
        if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
-                       kvm_is_dm_lowest_prio(irq))
+                       kvm_is_dm_lowest_prio(irq)) {
                printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+               irq->delivery_mode = APIC_DM_FIXED;
+       }
+
+       if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
+               return r;
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
                if (!kvm_apic_present(vcpu))
@@ -223,6 +228,9 @@ int kvm_request_irq_source_id(struct kvm *kvm)
        }
 
        ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+       ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
        set_bit(irq_source_id, bitmap);
 unlock:
        mutex_unlock(&kvm->irq_lock);
@@ -233,6 +241,9 @@ unlock:
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 {
        ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+       ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
 
        mutex_lock(&kvm->irq_lock);
        if (irq_source_id < 0 ||
@@ -321,11 +332,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
                switch (ue->u.irqchip.irqchip) {
                case KVM_IRQCHIP_PIC_MASTER:
                        e->set = kvm_set_pic_irq;
-                       max_pin = 16;
+                       max_pin = PIC_NUM_PINS;
                        break;
                case KVM_IRQCHIP_PIC_SLAVE:
                        e->set = kvm_set_pic_irq;
-                       max_pin = 16;
+                       max_pin = PIC_NUM_PINS;
                        delta = 8;
                        break;
                case KVM_IRQCHIP_IOAPIC:
index d617f69131d7667d3847c7e0fc040a717625bf38..c353b4599cecdb4db0d3b276e1c12b81974cd2e7 100644 (file)
@@ -100,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
 
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-struct page *fault_page;
-pfn_t fault_pfn;
-
-inline int kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_mmio_pfn(pfn_t pfn)
 {
        if (pfn_valid(pfn)) {
                int reserved;
@@ -137,11 +131,12 @@ inline int kvm_is_mmio_pfn(pfn_t pfn)
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
-void vcpu_load(struct kvm_vcpu *vcpu)
+int vcpu_load(struct kvm_vcpu *vcpu)
 {
        int cpu;
 
-       mutex_lock(&vcpu->mutex);
+       if (mutex_lock_killable(&vcpu->mutex))
+               return -EINTR;
        if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
                /* The thread running this VCPU changed. */
                struct pid *oldpid = vcpu->pid;
@@ -154,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
        preempt_notifier_register(&vcpu->preempt_notifier);
        kvm_arch_vcpu_load(vcpu, cpu);
        put_cpu();
+       return 0;
 }
 
 void vcpu_put(struct kvm_vcpu *vcpu)
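Since the vcpu mutex is now taken with mutex_lock_killable(), vcpu_load() fails with -EINTR when a fatal signal is pending instead of blocking the task unkillably, and every caller has to propagate the error (the vcpu ioctl path later in this patch does exactly that). The required caller pattern, as a sketch:

/* Sketch: callers of the new vcpu_load() must check the result. */
static long example_vcpu_op(struct kvm_vcpu *vcpu)
{
        int r = vcpu_load(vcpu);        /* -EINTR on a fatal signal */

        if (r)
                return r;
        /* ... work on the loaded vcpu ... */
        vcpu_put(vcpu);
        return 0;
}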
@@ -236,6 +232,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        }
        vcpu->run = page_address(page);
 
+       kvm_vcpu_set_in_spin_loop(vcpu, false);
+       kvm_vcpu_set_dy_eligible(vcpu, false);
+
        r = kvm_arch_vcpu_init(vcpu);
        if (r < 0)
                goto fail_free_run;
@@ -332,8 +331,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
         * count is also read inside the mmu_lock critical section.
         */
        kvm->mmu_notifier_count++;
-       for (; start < end; start += PAGE_SIZE)
-               need_tlb_flush |= kvm_unmap_hva(kvm, start);
+       need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
        need_tlb_flush |= kvm->tlbs_dirty;
        /* we've to flush the tlb before the pages can be freed */
        if (need_tlb_flush)
@@ -412,7 +410,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
        int idx;
 
        idx = srcu_read_lock(&kvm->srcu);
-       kvm_arch_flush_shadow(kvm);
+       kvm_arch_flush_shadow_all(kvm);
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -551,16 +549,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
                                  struct kvm_memory_slot *dont)
 {
-       if (!dont || free->rmap != dont->rmap)
-               vfree(free->rmap);
-
        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
                kvm_destroy_dirty_bitmap(free);
 
        kvm_arch_free_memslot(free, dont);
 
        free->npages = 0;
-       free->rmap = NULL;
 }
 
 void kvm_free_physmem(struct kvm *kvm)
@@ -590,7 +584,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #else
-       kvm_arch_flush_shadow(kvm);
+       kvm_arch_flush_shadow_all(kvm);
 #endif
        kvm_arch_destroy_vm(kvm);
        kvm_free_physmem(kvm);
@@ -686,6 +680,20 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
        slots->generation++;
 }
 
+static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+{
+       u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+
+#ifdef KVM_CAP_READONLY_MEM
+       valid_flags |= KVM_MEM_READONLY;
+#endif
+
+       if (mem->flags & ~valid_flags)
+               return -EINVAL;
+
+       return 0;
+}
+
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
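check_memory_region_flags() is the gate for the new read-only memory support: KVM_MEM_READONLY is accepted only where the arch defines KVM_CAP_READONLY_MEM. From userspace a read-only slot is registered like any other, just with the extra flag; a sketch, assuming vm_fd and a page-aligned backing buffer:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int map_readonly_slot(int vm_fd, __u32 slot, __u64 gpa,
                             void *host_mem, __u64 size)
{
        struct kvm_userspace_memory_region region = {
                .slot            = slot,
                .flags           = KVM_MEM_READONLY,  /* guest writes fault */
                .guest_phys_addr = gpa,
                .memory_size     = size,
                .userspace_addr  = (__u64)(unsigned long)host_mem,
        };

        /* Only valid if KVM_CHECK_EXTENSION(KVM_CAP_READONLY_MEM) > 0. */
        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}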
@@ -706,6 +714,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
        struct kvm_memory_slot old, new;
        struct kvm_memslots *slots, *old_memslots;
 
+       r = check_memory_region_flags(mem);
+       if (r)
+               goto out;
+
        r = -EINVAL;
        /* General sanity checks */
        if (mem->memory_size & (PAGE_SIZE - 1))
@@ -769,11 +781,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
        if (npages && !old.npages) {
                new.user_alloc = user_alloc;
                new.userspace_addr = mem->userspace_addr;
-#ifndef CONFIG_S390
-               new.rmap = vzalloc(npages * sizeof(*new.rmap));
-               if (!new.rmap)
-                       goto out_free;
-#endif /* not defined CONFIG_S390 */
+
                if (kvm_arch_create_memslot(&new, npages))
                        goto out_free;
        }
@@ -785,7 +793,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
                /* destroy any largepage mappings for dirty tracking */
        }
 
-       if (!npages) {
+       if (!npages || base_gfn != old.base_gfn) {
                struct kvm_memory_slot *slot;
 
                r = -ENOMEM;
@@ -801,14 +809,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
                old_memslots = kvm->memslots;
                rcu_assign_pointer(kvm->memslots, slots);
                synchronize_srcu_expedited(&kvm->srcu);
-               /* From this point no new shadow pages pointing to a deleted
-                * memslot will be created.
+               /* From this point no new shadow pages pointing to a deleted,
+                * or moved, memslot will be created.
                 *
                 * validation of sp->gfn happens in:
                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
                 *      - kvm_is_visible_gfn (mmu_check_roots)
                 */
-               kvm_arch_flush_shadow(kvm);
+               kvm_arch_flush_shadow_memslot(kvm, slot);
                kfree(old_memslots);
        }
 
@@ -832,7 +840,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
        /* actual memory is freed via old in kvm_free_physmem_slot below */
        if (!npages) {
-               new.rmap = NULL;
                new.dirty_bitmap = NULL;
                memset(&new.arch, 0, sizeof(new.arch));
        }
@@ -844,13 +851,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
        kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
 
-       /*
-        * If the new memory slot is created, we need to clear all
-        * mmio sptes.
-        */
-       if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
-               kvm_arch_flush_shadow(kvm);
-
        kvm_free_physmem_slot(&old, &new);
        kfree(old_memslots);
 
@@ -932,53 +932,6 @@ void kvm_disable_largepages(void)
 }
 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
-int is_error_page(struct page *page)
-{
-       return page == bad_page || page == hwpoison_page || page == fault_page;
-}
-EXPORT_SYMBOL_GPL(is_error_page);
-
-int is_error_pfn(pfn_t pfn)
-{
-       return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_error_pfn);
-
-int is_hwpoison_pfn(pfn_t pfn)
-{
-       return pfn == hwpoison_pfn;
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
-int is_fault_pfn(pfn_t pfn)
-{
-       return pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
-
-int is_noslot_pfn(pfn_t pfn)
-{
-       return pfn == bad_pfn;
-}
-EXPORT_SYMBOL_GPL(is_noslot_pfn);
-
-int is_invalid_pfn(pfn_t pfn)
-{
-       return pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_invalid_pfn);
-
-static inline unsigned long bad_hva(void)
-{
-       return PAGE_OFFSET;
-}
-
-int kvm_is_error_hva(unsigned long addr)
-{
-       return addr == bad_hva();
-}
-EXPORT_SYMBOL_GPL(kvm_is_error_hva);
-
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
        return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -1021,28 +974,62 @@ out:
        return size;
 }
 
-static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
-                                    gfn_t *nr_pages)
+static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+{
+       return slot->flags & KVM_MEM_READONLY;
+}
+
+static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+                                      gfn_t *nr_pages, bool write)
 {
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
-               return bad_hva();
+               return KVM_HVA_ERR_BAD;
+
+       if (memslot_is_readonly(slot) && write)
+               return KVM_HVA_ERR_RO_BAD;
 
        if (nr_pages)
                *nr_pages = slot->npages - (gfn - slot->base_gfn);
 
-       return gfn_to_hva_memslot(slot, gfn);
+       return __gfn_to_hva_memslot(slot, gfn);
 }
 
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+                                    gfn_t *nr_pages)
+{
+       return __gfn_to_hva_many(slot, gfn, nr_pages, true);
+}
+
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+                                gfn_t gfn)
+{
+       return gfn_to_hva_many(slot, gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
+
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
        return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-static pfn_t get_fault_pfn(void)
+/*
+ * The hva returned by this function may only be read from.
+ * It should be paired with kvm_read_hva() or kvm_read_hva_atomic().
+ */
+static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
+{
+       return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
+}
+
+static int kvm_read_hva(void *data, void __user *hva, int len)
 {
-       get_page(fault_page);
-       return fault_pfn;
+       return __copy_from_user(data, hva, len);
+}
+
+static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
+{
+       return __copy_from_user_inatomic(data, hva, len);
 }
 
 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1065,108 +1052,186 @@ static inline int check_user_page_hwpoison(unsigned long addr)
        return rc == -EHWPOISON;
 }
 
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
-                       bool *async, bool write_fault, bool *writable)
+/*
+ * The atomic path to get the writable pfn, which will be stored in @pfn;
+ * true indicates success, otherwise false is returned.
+ */
+static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
+                           bool write_fault, bool *writable, pfn_t *pfn)
 {
        struct page *page[1];
-       int npages = 0;
-       pfn_t pfn;
+       int npages;
 
-       /* we can do it either atomically or asynchronously, not both */
-       BUG_ON(atomic && async);
+       if (!(async || atomic))
+               return false;
 
-       BUG_ON(!write_fault && !writable);
+       /*
+        * Fast pin a writable pfn only if it is a write fault request
+        * or the caller allows a writable pfn to be mapped for a read
+        * fault request.
+        */
+       if (!(write_fault || writable))
+               return false;
 
-       if (writable)
-               *writable = true;
+       npages = __get_user_pages_fast(addr, 1, 1, page);
+       if (npages == 1) {
+               *pfn = page_to_pfn(page[0]);
 
-       if (atomic || async)
-               npages = __get_user_pages_fast(addr, 1, 1, page);
+               if (writable)
+                       *writable = true;
+               return true;
+       }
 
-       if (unlikely(npages != 1) && !atomic) {
-               might_sleep();
+       return false;
+}
 
-               if (writable)
-                       *writable = write_fault;
+/*
+ * The slow path to get the pfn of the specified host virtual address;
+ * 1 indicates success, -errno is returned if an error is detected.
+ */
+static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+                          bool *writable, pfn_t *pfn)
+{
+       struct page *page[1];
+       int npages = 0;
 
-               if (async) {
-                       down_read(&current->mm->mmap_sem);
-                       npages = get_user_page_nowait(current, current->mm,
-                                                    addr, write_fault, page);
-                       up_read(&current->mm->mmap_sem);
-               } else
-                       npages = get_user_pages_fast(addr, 1, write_fault,
-                                                    page);
-
-               /* map read fault as writable if possible */
-               if (unlikely(!write_fault) && npages == 1) {
-                       struct page *wpage[1];
-
-                       npages = __get_user_pages_fast(addr, 1, 1, wpage);
-                       if (npages == 1) {
-                               *writable = true;
-                               put_page(page[0]);
-                               page[0] = wpage[0];
-                       }
-                       npages = 1;
+       might_sleep();
+
+       if (writable)
+               *writable = write_fault;
+
+       if (async) {
+               down_read(&current->mm->mmap_sem);
+               npages = get_user_page_nowait(current, current->mm,
+                                             addr, write_fault, page);
+               up_read(&current->mm->mmap_sem);
+       } else
+               npages = get_user_pages_fast(addr, 1, write_fault,
+                                            page);
+       if (npages != 1)
+               return npages;
+
+       /* map read fault as writable if possible */
+       if (unlikely(!write_fault) && writable) {
+               struct page *wpage[1];
+
+               npages = __get_user_pages_fast(addr, 1, 1, wpage);
+               if (npages == 1) {
+                       *writable = true;
+                       put_page(page[0]);
+                       page[0] = wpage[0];
                }
+
+               npages = 1;
        }
+       *pfn = page_to_pfn(page[0]);
+       return npages;
+}
 
-       if (unlikely(npages != 1)) {
-               struct vm_area_struct *vma;
+static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+{
+       if (unlikely(!(vma->vm_flags & VM_READ)))
+               return false;
 
-               if (atomic)
-                       return get_fault_pfn();
+       if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+               return false;
 
-               down_read(&current->mm->mmap_sem);
-               if (npages == -EHWPOISON ||
-                       (!async && check_user_page_hwpoison(addr))) {
-                       up_read(&current->mm->mmap_sem);
-                       get_page(hwpoison_page);
-                       return page_to_pfn(hwpoison_page);
-               }
+       return true;
+}
 
-               vma = find_vma_intersection(current->mm, addr, addr+1);
-
-               if (vma == NULL)
-                       pfn = get_fault_pfn();
-               else if ((vma->vm_flags & VM_PFNMAP)) {
-                       pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-                               vma->vm_pgoff;
-                       BUG_ON(!kvm_is_mmio_pfn(pfn));
-               } else {
-                       if (async && (vma->vm_flags & VM_WRITE))
-                               *async = true;
-                       pfn = get_fault_pfn();
-               }
-               up_read(&current->mm->mmap_sem);
-       } else
-               pfn = page_to_pfn(page[0]);
+/*
+ * Pin guest page in memory and return its pfn.
+ * @addr: host virtual address which maps memory to the guest
+ * @atomic: whether the call must not sleep (only the fast path is tried)
+ * @async: whether this function needs to wait for IO to complete if the
+ *         host page is not in memory
+ * @write_fault: whether we should get a writable host page
+ * @writable: whether it is allowed to map a writable host page for !@write_fault
+ *
+ * The function will map a writable host page for these two cases:
+ * 1): @write_fault = true
+ * 2): @write_fault = false && @writable; @writable will then tell the caller
+ *     whether the mapping is writable.
+ */
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+                       bool write_fault, bool *writable)
+{
+       struct vm_area_struct *vma;
+       pfn_t pfn = 0;
+       int npages;
+
+       /* we can do it either atomically or asynchronously, not both */
+       BUG_ON(atomic && async);
 
+       if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+               return pfn;
+
+       if (atomic)
+               return KVM_PFN_ERR_FAULT;
+
+       npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+       if (npages == 1)
+               return pfn;
+
+       down_read(&current->mm->mmap_sem);
+       if (npages == -EHWPOISON ||
+             (!async && check_user_page_hwpoison(addr))) {
+               pfn = KVM_PFN_ERR_HWPOISON;
+               goto exit;
+       }
+
+       vma = find_vma_intersection(current->mm, addr, addr + 1);
+
+       if (vma == NULL)
+               pfn = KVM_PFN_ERR_FAULT;
+       else if ((vma->vm_flags & VM_PFNMAP)) {
+               pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+                       vma->vm_pgoff;
+               BUG_ON(!kvm_is_mmio_pfn(pfn));
+       } else {
+               if (async && vma_is_valid(vma, write_fault))
+                       *async = true;
+               pfn = KVM_PFN_ERR_FAULT;
+       }
+exit:
+       up_read(&current->mm->mmap_sem);
        return pfn;
 }
 
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+static pfn_t
+__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+                    bool *async, bool write_fault, bool *writable)
 {
-       return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+       unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+
+       if (addr == KVM_HVA_ERR_RO_BAD)
+               return KVM_PFN_ERR_RO_FAULT;
+
+       if (kvm_is_error_hva(addr))
+               return KVM_PFN_ERR_BAD;
+
+       /* Do not map writable pfn in the readonly memslot. */
+       if (writable && memslot_is_readonly(slot)) {
+               *writable = false;
+               writable = NULL;
+       }
+
+       return hva_to_pfn(addr, atomic, async, write_fault,
+                         writable);
 }
-EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
                          bool write_fault, bool *writable)
 {
-       unsigned long addr;
+       struct kvm_memory_slot *slot;
 
        if (async)
                *async = false;
 
-       addr = gfn_to_hva(kvm, gfn);
-       if (kvm_is_error_hva(addr)) {
-               get_page(bad_page);
-               return page_to_pfn(bad_page);
-       }
+       slot = gfn_to_memslot(kvm, gfn);
 
-       return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+       return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+                                   writable);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
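After the refactoring, hva_to_pfn() is a thin driver over the two helpers above: try the lockless fast path first (the only option in atomic context), then the sleeping slow path, and only then fall back to the VMA walk. The control-flow skeleton, reduced to a runnable toy with stubbed-out lookups:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long pfn_t;
#define PFN_ERR_FAULT (~0ULL)

/* Stand-ins for __get_user_pages_fast() / get_user_pages_fast(). */
static bool lookup_fast(unsigned long addr, pfn_t *pfn)
{
        (void)addr; (void)pfn;
        return false;                    /* pretend the page is not resident */
}

static bool lookup_slow(unsigned long addr, pfn_t *pfn)
{
        *pfn = addr >> 12;               /* pretend we faulted the page in */
        return true;
}

static pfn_t resolve(unsigned long addr, bool atomic)
{
        pfn_t pfn;

        if (lookup_fast(addr, &pfn))
                return pfn;              /* lockless fast path won */
        if (atomic)
                return PFN_ERR_FAULT;    /* must not sleep: give up here */
        if (lookup_slow(addr, &pfn))
                return pfn;              /* sleeping path may wait for I/O */
        return PFN_ERR_FAULT;            /* caller falls back to a VMA walk */
}

int main(void)
{
        printf("%llx\n", resolve(0x7f0000001000UL, false));
        return 0;
}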
@@ -1195,12 +1260,16 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
-                        struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+       return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+}
+
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-       unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-       return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+       return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
 }
+EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
                                                                  int nr_pages)
@@ -1219,30 +1288,42 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
 }
 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
+static struct page *kvm_pfn_to_page(pfn_t pfn)
+{
+       if (is_error_pfn(pfn))
+               return KVM_ERR_PTR_BAD_PAGE;
+
+       if (kvm_is_mmio_pfn(pfn)) {
+               WARN_ON(1);
+               return KVM_ERR_PTR_BAD_PAGE;
+       }
+
+       return pfn_to_page(pfn);
+}
+
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
        pfn_t pfn;
 
        pfn = gfn_to_pfn(kvm, gfn);
-       if (!kvm_is_mmio_pfn(pfn))
-               return pfn_to_page(pfn);
-
-       WARN_ON(kvm_is_mmio_pfn(pfn));
 
-       get_page(bad_page);
-       return bad_page;
+       return kvm_pfn_to_page(pfn);
 }
 
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_page_clean(struct page *page)
 {
+       WARN_ON(is_error_page(page));
+
        kvm_release_pfn_clean(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
+       WARN_ON(is_error_pfn(pfn));
+
        if (!kvm_is_mmio_pfn(pfn))
                put_page(pfn_to_page(pfn));
 }
@@ -1250,6 +1331,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 
 void kvm_release_page_dirty(struct page *page)
 {
+       WARN_ON(is_error_page(page));
+
        kvm_release_pfn_dirty(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1305,10 +1388,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
        int r;
        unsigned long addr;
 
-       addr = gfn_to_hva(kvm, gfn);
+       addr = gfn_to_hva_read(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
-       r = __copy_from_user(data, (void __user *)addr + offset, len);
+       r = kvm_read_hva(data, (void __user *)addr + offset, len);
        if (r)
                return -EFAULT;
        return 0;
@@ -1343,11 +1426,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int offset = offset_in_page(gpa);
 
-       addr = gfn_to_hva(kvm, gfn);
+       addr = gfn_to_hva_read(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        pagefault_disable();
-       r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+       r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
        pagefault_enable();
        if (r)
                return -EFAULT;
@@ -1580,6 +1663,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * The most eligible candidate to yield to is decided by the following
+ * heuristics:
+ *
+ *  (a) A VCPU which has not done a PLE exit or had its cpu relax
+ *  intercepted recently (a preempted lock holder), indicated by
+ *  @in_spin_loop.  Set at the beginning and cleared at the end of the
+ *  interception/PLE handler.
+ *
+ *  (b) A VCPU which has done a PLE exit/cpu relax intercept but did not
+ *  get a chance last time (it has mostly become eligible now, since we
+ *  probably yielded to the lock holder in the last iteration).  This is
+ *  done by toggling @dy_eligible each time a VCPU is checked for
+ *  eligibility.
+ *
+ *  Yielding to a recently PLE-exited/cpu relax intercepted VCPU before
+ *  yielding to a preempted lock holder could result in wrong VCPU
+ *  selection and CPU burning.  Giving priority to a potential lock
+ *  holder increases lock progress.
+ *
+ *  Since the algorithm is based on heuristics, accessing another VCPU's
+ *  data without locking does no harm.  It may result in trying to yield
+ *  to the same VCPU, failing, and continuing with the next VCPU, and so
+ *  on.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+       bool eligible;
+
+       eligible = !vcpu->spin_loop.in_spin_loop ||
+                       (vcpu->spin_loop.in_spin_loop &&
+                        vcpu->spin_loop.dy_eligible);
+
+       if (vcpu->spin_loop.in_spin_loop)
+               kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+       return eligible;
+}
+#endif
 void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 {
        struct kvm *kvm = me->kvm;
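The heuristic reduces to: a vcpu that was not spinning is always a fair yield target, while a spinning vcpu is considered only every other time, because dy_eligible flips on each check. A toy simulation of the toggle:

#include <stdbool.h>
#include <stdio.h>

struct vcpu_sim { bool in_spin_loop; bool dy_eligible; };

static bool eligible_for_yield(struct vcpu_sim *v)
{
        bool eligible = !v->in_spin_loop || v->dy_eligible;

        if (v->in_spin_loop)
                v->dy_eligible = !v->dy_eligible;   /* alternate on each check */
        return eligible;
}

int main(void)
{
        struct vcpu_sim v = { .in_spin_loop = true, .dy_eligible = false };

        /* A spinning vcpu is skipped, then picked, then skipped, ... */
        for (int i = 0; i < 4; i++)
                printf("check %d: %s\n", i, eligible_for_yield(&v) ? "yield" : "skip");
        return 0;
}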
@@ -1589,6 +1709,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
        int pass;
        int i;
 
+       kvm_vcpu_set_in_spin_loop(me, true);
        /*
         * We boost the priority of a VCPU that is runnable but not
         * currently running, because it got preempted by something
@@ -1607,6 +1728,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
                                continue;
                        if (waitqueue_active(&vcpu->wq))
                                continue;
+                       if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+                               continue;
                        if (kvm_vcpu_yield_to(vcpu)) {
                                kvm->last_boosted_vcpu = i;
                                yielded = 1;
@@ -1614,6 +1737,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
                        }
                }
        }
+       kvm_vcpu_set_in_spin_loop(me, false);
+
+       /* Ensure the vcpu is not eligible during the next spin loop */
+       kvm_vcpu_set_dy_eligible(me, false);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
@@ -1766,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 #endif
 
 
-       vcpu_load(vcpu);
+       r = vcpu_load(vcpu);
+       if (r)
+               return r;
        switch (ioctl) {
        case KVM_RUN:
                r = -EINVAL;
@@ -2093,6 +2222,29 @@ static long kvm_vm_ioctl(struct file *filp,
                r = kvm_send_userspace_msi(kvm, &msi);
                break;
        }
+#endif
+#ifdef __KVM_HAVE_IRQ_LINE
+       case KVM_IRQ_LINE_STATUS:
+       case KVM_IRQ_LINE: {
+               struct kvm_irq_level irq_event;
+
+               r = -EFAULT;
+               if (copy_from_user(&irq_event, argp, sizeof irq_event))
+                       goto out;
+
+               r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+               if (r)
+                       goto out;
+
+               r = -EFAULT;
+               if (ioctl == KVM_IRQ_LINE_STATUS) {
+                       if (copy_to_user(argp, &irq_event, sizeof irq_event))
+                               goto out;
+               }
+
+               r = 0;
+               break;
+       }
 #endif
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
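With kvm_vm_ioctl_irq_line() hoisted into common code, any arch defining __KVM_HAVE_IRQ_LINE gets both ioctls; KVM_IRQ_LINE_STATUS additionally reports back, through the status field that shares storage with irq, whether the interrupt was delivered or coalesced. A userspace sketch:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Pulse a GSI and report whether the assertion was coalesced. */
static int pulse_irq(int vm_fd, __u32 gsi)
{
        struct kvm_irq_level event = { .irq = gsi, .level = 1 };
        int delivered;

        if (ioctl(vm_fd, KVM_IRQ_LINE_STATUS, &event) < 0)
                return -1;
        delivered = event.status;       /* nonzero delivered, 0 coalesced */

        event.irq   = gsi;              /* status shares the irq slot */
        event.level = 0;                /* drop the line again */
        if (ioctl(vm_fd, KVM_IRQ_LINE, &event) < 0)
                return -1;
        return delivered;
}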
@@ -2698,9 +2850,6 @@ static struct syscore_ops kvm_syscore_ops = {
        .resume = kvm_resume,
 };
 
-struct page *bad_page;
-pfn_t bad_pfn;
-
 static inline
 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
 {
@@ -2732,33 +2881,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
        if (r)
                goto out_fail;
 
-       bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-       if (bad_page == NULL) {
-               r = -ENOMEM;
-               goto out;
-       }
-
-       bad_pfn = page_to_pfn(bad_page);
-
-       hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-       if (hwpoison_page == NULL) {
-               r = -ENOMEM;
-               goto out_free_0;
-       }
-
-       hwpoison_pfn = page_to_pfn(hwpoison_page);
-
-       fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-       if (fault_page == NULL) {
-               r = -ENOMEM;
-               goto out_free_0;
-       }
-
-       fault_pfn = page_to_pfn(fault_page);
-
        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                r = -ENOMEM;
                goto out_free_0;
@@ -2833,12 +2955,6 @@ out_free_1:
 out_free_0a:
        free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
-       if (fault_page)
-               __free_page(fault_page);
-       if (hwpoison_page)
-               __free_page(hwpoison_page);
-       __free_page(bad_page);
-out:
        kvm_arch_exit();
 out_fail:
        return r;
@@ -2858,8 +2974,5 @@ void kvm_exit(void)
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        free_cpumask_var(cpus_hardware_enabled);
-       __free_page(fault_page);
-       __free_page(hwpoison_page);
-       __free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);