Merge remote-tracking branch 'kvm/linux-next'
author    Thierry Reding <treding@nvidia.com>
          Thu, 24 Oct 2013 12:58:35 +0000 (14:58 +0200)
committer Thierry Reding <treding@nvidia.com>
          Thu, 24 Oct 2013 12:58:35 +0000 (14:58 +0200)
Conflicts:
arch/arm/kvm/reset.c

arch/arm/kvm/reset.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/x86/include/uapi/asm/msr-index.h
arch/x86/kvm/vmx.c
virt/kvm/kvm_main.c

diff --combined arch/arm/kvm/reset.c
index c02ba4af599f417113fdb2c260270ae7162575e6,d9bbd834f188031ecab5fff7af21a65fd2b9e8a6..d153e64d125505c9a8623521053fbe507db83a2f
  #include <kvm/arm_arch_timer.h>
  
  /******************************************************************************
-  * Cortex-A15 Reset Values
+  * Cortex-A15 and Cortex-A7 Reset Values
   */
  
- static const int a15_max_cpu_idx = 3;
+ static const int cortexa_max_cpu_idx = 3;
  
- static struct kvm_regs a15_regs_reset = {
+ static struct kvm_regs cortexa_regs_reset = {
        .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
  };
  
- static const struct kvm_irq_level a15_vtimer_irq = {
+ static const struct kvm_irq_level cortexa_vtimer_irq = {
        { .irq = 27 },
        .level = 1,
  };
   */
  int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
  {
 -      struct kvm_regs *cpu_reset;
 +      struct kvm_regs *reset_regs;
        const struct kvm_irq_level *cpu_vtimer_irq;
  
        switch (vcpu->arch.target) {
+       case KVM_ARM_TARGET_CORTEX_A7:
        case KVM_ARM_TARGET_CORTEX_A15:
-               if (vcpu->vcpu_id > a15_max_cpu_idx)
+               if (vcpu->vcpu_id > cortexa_max_cpu_idx)
                        return -EINVAL;
-               reset_regs = &a15_regs_reset;
 -              cpu_reset = &cortexa_regs_reset;
++              reset_regs = &cortexa_regs_reset;
                vcpu->arch.midr = read_cpuid_id();
-               cpu_vtimer_irq = &a15_vtimer_irq;
+               cpu_vtimer_irq = &cortexa_vtimer_irq;
                break;
        default:
                return -ENODEV;
        }
  
        /* Reset core registers */
 -      memcpy(&vcpu->arch.regs, cpu_reset, sizeof(vcpu->arch.regs));
 +      memcpy(&vcpu->arch.regs, reset_regs, sizeof(vcpu->arch.regs));
  
        /* Reset CP15 registers */
        kvm_reset_coprocs(vcpu);
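Note on the resolution above: the conflict came from renaming the Cortex-A15-only reset data (a15_*) to cortexa_* so the same reset values and vtimer IRQ can serve Cortex-A7 guests as well. Reconstructed from the combined diff, the resolved switch in kvm_reset_vcpu() reads roughly as follows (a sketch of the merged result, not the verbatim file):

       switch (vcpu->arch.target) {
       case KVM_ARM_TARGET_CORTEX_A7:
       case KVM_ARM_TARGET_CORTEX_A15:
               if (vcpu->vcpu_id > cortexa_max_cpu_idx)
                       return -EINVAL;
               reset_regs = &cortexa_regs_reset;       /* SVC mode, A/I/F masked */
               vcpu->arch.midr = read_cpuid_id();
               cpu_vtimer_irq = &cortexa_vtimer_irq;   /* virtual timer PPI 27 */
               break;
       default:
               return -ENODEV;
       }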
diff --combined arch/s390/kvm/interrupt.c
index 7f1f7ac5cf7f8a2c3f3966d4fe96fa23af90ea04,e7323cd9f1098dc57cec3f21382330cdbf6a84f8..5f79d2d79ca76f34648677bb3514802458882b81
@@@ -385,7 -385,7 +385,7 @@@ static int kvm_cpu_has_interrupt(struc
        }
  
        if ((!rc) && (vcpu->arch.sie_block->ckc <
 -              get_tod_clock() + vcpu->arch.sie_block->epoch)) {
 +              get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) {
                if ((!psw_extint_disabled(vcpu)) &&
                        (vcpu->arch.sie_block->gcr[0] & 0x800ul))
                        rc = 1;
@@@ -425,7 -425,7 +425,7 @@@ int kvm_s390_handle_wait(struct kvm_vcp
                goto no_timer;
        }
  
 -      now = get_tod_clock() + vcpu->arch.sie_block->epoch;
 +      now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
        if (vcpu->arch.sie_block->ckc < now) {
                __unset_cpu_idle(vcpu);
                return 0;
        hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
        VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
  no_timer:
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        spin_lock(&vcpu->arch.local_int.float_int->lock);
        spin_lock_bh(&vcpu->arch.local_int.lock);
        add_wait_queue(&vcpu->wq, &wait);
        remove_wait_queue(&vcpu->wq, &wait);
        spin_unlock_bh(&vcpu->arch.local_int.lock);
        spin_unlock(&vcpu->arch.local_int.float_int->lock);
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
        return 0;
  }
@@@ -515,7 -518,7 +518,7 @@@ void kvm_s390_deliver_pending_interrupt
        }
  
        if ((vcpu->arch.sie_block->ckc <
 -              get_tod_clock() + vcpu->arch.sie_block->epoch))
 +              get_tod_clock_fast() + vcpu->arch.sie_block->epoch))
                __try_deliver_ckc_interrupt(vcpu);
  
        if (atomic_read(&fi->active)) {
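Two independent changes meet in this file: the clock-comparator checks switch from get_tod_clock() to the cheaper get_tod_clock_fast(), and kvm_s390_handle_wait() now drops the kvm->srcu read lock around its blocking wait so memslot updates are not held up while a vcpu sleeps. A minimal sketch of that second pattern, using only the calls visible in the hunk above:

       /* release the memslot SRCU read side before blocking ... */
       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
       /* ... sleep until an interrupt or the clock comparator fires ... */
       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
       /* ... then continue with a valid memslot view again */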
diff --combined arch/s390/kvm/kvm-s390.c
index ed8064cb5c4921424d5981b890e6fd9b07f9ed02,1e4e7b97337a8a72b8820a77a3f99e6232618bcc..2d67b3bbf1906d4a0f33d72c5e31344c94f2feab
@@@ -343,11 -343,10 +343,11 @@@ void kvm_arch_vcpu_uninit(struct kvm_vc
  
  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  {
 -      save_fp_regs(&vcpu->arch.host_fpregs);
 +      save_fp_ctl(&vcpu->arch.host_fpregs.fpc);
 +      save_fp_regs(vcpu->arch.host_fpregs.fprs);
        save_access_regs(vcpu->arch.host_acrs);
 -      vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK;
 -      restore_fp_regs(&vcpu->arch.guest_fpregs);
 +      restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
 +      restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
        restore_access_regs(vcpu->run->s.regs.acrs);
        gmap_enable(vcpu->arch.gmap);
        atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
@@@ -357,11 -356,9 +357,11 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
  {
        atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        gmap_disable(vcpu->arch.gmap);
 -      save_fp_regs(&vcpu->arch.guest_fpregs);
 +      save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
 +      save_fp_regs(vcpu->arch.guest_fpregs.fprs);
        save_access_regs(vcpu->run->s.regs.acrs);
 -      restore_fp_regs(&vcpu->arch.host_fpregs);
 +      restore_fp_ctl(&vcpu->arch.host_fpregs.fpc);
 +      restore_fp_regs(vcpu->arch.host_fpregs.fprs);
        restore_access_regs(vcpu->arch.host_acrs);
  }
  
@@@ -621,12 -618,9 +621,12 @@@ int kvm_arch_vcpu_ioctl_get_sregs(struc
  
  int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
  {
 +      if (test_fp_ctl(fpu->fpc))
 +              return -EINVAL;
        memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
 -      vcpu->arch.guest_fpregs.fpc = fpu->fpc & FPC_VALID_MASK;
 -      restore_fp_regs(&vcpu->arch.guest_fpregs);
 +      vcpu->arch.guest_fpregs.fpc = fpu->fpc;
 +      restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
 +      restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
        return 0;
  }
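The s390 FPU handling in this file splits the old save_fp_regs()/restore_fp_regs(&fpregs) calls into separate control-register and data-register helpers, and kvm_arch_vcpu_ioctl_set_fpu() now rejects an invalid FPC up front instead of silently masking it with FPC_VALID_MASK. The resulting set-fpu path, condensed from the hunk above:

       if (test_fp_ctl(fpu->fpc))                      /* validate the FP control word */
               return -EINVAL;
       memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
       vcpu->arch.guest_fpregs.fpc = fpu->fpc;
       restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc);   /* load the FPC */
       restore_fp_regs(vcpu->arch.guest_fpregs.fprs);  /* load the FP data registers */
       return 0;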
  
@@@ -695,9 -689,9 +695,9 @@@ static int kvm_s390_handle_requests(str
        return 0;
  }
  
- static int __vcpu_run(struct kvm_vcpu *vcpu)
+ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
  {
-       int rc;
+       int rc, cpuflags;
  
        memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16);
  
                return rc;
  
        vcpu->arch.sie_block->icptcode = 0;
-       VCPU_EVENT(vcpu, 6, "entering sie flags %x",
-                  atomic_read(&vcpu->arch.sie_block->cpuflags));
-       trace_kvm_s390_sie_enter(vcpu,
-                                atomic_read(&vcpu->arch.sie_block->cpuflags));
+       cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
+       VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags);
+       trace_kvm_s390_sie_enter(vcpu, cpuflags);
  
-       /*
-        * As PF_VCPU will be used in fault handler, between guest_enter
-        * and guest_exit should be no uaccess.
-        */
-       preempt_disable();
-       kvm_guest_enter();
-       preempt_enable();
-       rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
-       kvm_guest_exit();
+       return 0;
+ }
+
+ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
+ {
+       int rc;
  
        VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
                   vcpu->arch.sie_block->icptcode);
        trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
  
-       if (rc > 0)
+       if (exit_reason >= 0) {
                rc = 0;
-       if (rc < 0) {
+       } else {
                if (kvm_is_ucontrol(vcpu->kvm)) {
                        rc = SIE_INTERCEPT_UCONTROL;
                } else {
        }
  
        memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
+       if (rc == 0) {
+               if (kvm_is_ucontrol(vcpu->kvm))
+                       rc = -EOPNOTSUPP;
+               else
+                       rc = kvm_handle_sie_intercept(vcpu);
+       }
+       return rc;
+ }
+
+ static int __vcpu_run(struct kvm_vcpu *vcpu)
+ {
+       int rc, exit_reason;
+       /*
+        * We try to hold kvm->srcu during most of vcpu_run (except when run-
+        * ning the guest), so that memslots (and other stuff) are protected
+        */
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+       do {
+               rc = vcpu_pre_run(vcpu);
+               if (rc)
+                       break;
+               srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+               /*
+                * As PF_VCPU will be used in fault handler, between
+                * guest_enter and guest_exit should be no uaccess.
+                */
+               preempt_disable();
+               kvm_guest_enter();
+               preempt_enable();
+               exit_reason = sie64a(vcpu->arch.sie_block,
+                                    vcpu->run->s.regs.gprs);
+               kvm_guest_exit();
+               vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+               rc = vcpu_post_run(vcpu, exit_reason);
+       } while (!signal_pending(current) && !rc);
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        return rc;
  }
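The old monolithic __vcpu_run() is split into vcpu_pre_run() and vcpu_post_run(), and the retry loop that used to live in kvm_arch_vcpu_ioctl_run() moves in here, with kvm->srcu held everywhere except across the actual SIE entry. Condensed from the hunk above (the guest_enter/guest_exit and preemption handling are omitted for brevity):

       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
       do {
               rc = vcpu_pre_run(vcpu);
               if (rc)
                       break;
               srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
               exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
               vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
               rc = vcpu_post_run(vcpu, exit_reason);
       } while (!signal_pending(current) && !rc);
       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);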
  
@@@ -755,7 -788,6 +794,6 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
        int rc;
        sigset_t sigsaved;
  
- rerun_vcpu:
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
  
        }
  
        might_fault();
-       do {
-               rc = __vcpu_run(vcpu);
-               if (rc)
-                       break;
-               if (kvm_is_ucontrol(vcpu->kvm))
-                       rc = -EOPNOTSUPP;
-               else
-                       rc = kvm_handle_sie_intercept(vcpu);
-       } while (!signal_pending(current) && !rc);
-       if (rc == SIE_INTERCEPT_RERUNVCPU)
-               goto rerun_vcpu;
+       rc = __vcpu_run(vcpu);
  
        if (signal_pending(current) && !rc) {
                kvm_run->exit_reason = KVM_EXIT_INTR;
@@@ -882,8 -902,7 +908,8 @@@ int kvm_s390_vcpu_store_status(struct k
         * copying in vcpu load/put. Lets update our copies before we save
         * it into the save area
         */
 -      save_fp_regs(&vcpu->arch.guest_fpregs);
 +      save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
 +      save_fp_regs(vcpu->arch.guest_fpregs.fprs);
        save_access_regs(vcpu->run->s.regs.acrs);
  
        if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs),
@@@ -958,6 -977,7 +984,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
  {
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
+       int idx;
        long r;
  
        switch (ioctl) {
                break;
        }
        case KVM_S390_STORE_STATUS:
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
                r = kvm_s390_vcpu_store_status(vcpu, arg);
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
                break;
        case KVM_S390_SET_INITIAL_PSW: {
                psw_t psw;
diff --combined arch/x86/include/uapi/asm/msr-index.h
index 940ed3fd889a743732891945b191f34cddc8e217,b93e09a0fa21c34ee20a5cf3ddd8319efa07f40c..37813b5ddc37472dba6c64b8ff3f2508dc085de0
  #define MSR_PP1_ENERGY_STATUS         0x00000641
  #define MSR_PP1_POLICY                        0x00000642
  
 +#define MSR_CORE_C1_RES                       0x00000660
 +
  #define MSR_AMD64_MC0_MASK            0xc0010044
  
  #define MSR_IA32_MCx_CTL(x)           (MSR_IA32_MC0_CTL + 4*(x))
  
  /* MSR_IA32_VMX_MISC bits */
  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
+ #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
  /* AMD-V MSRs */
  
  #define MSR_VM_CR                       0xc0010114
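MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE masks bits 4:0 of IA32_VMX_MISC, which report how the VMX preemption timer relates to the TSC: the timer counts down once every 2^scale TSC cycles. A minimal usage sketch, matching how the vmx.c hunks below consume it (tsc_delta is a stand-in for an elapsed TSC value, not a name from the patch):

       u64 misc  = native_read_msr(MSR_IA32_VMX_MISC);
       u32 scale = misc & MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
       u32 timer_ticks = tsc_delta >> scale;   /* TSC cycles -> preemption-timer ticks */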
diff --combined arch/x86/kvm/vmx.c
index 2b2fce1b200900b1af42865f946d5faa25fdc56a,0156560c68a83c758a19e076e4ac730ccf1de8c4..06fd7629068ac6ddd3adde76dbec5aff7d39bf29
@@@ -1898,16 -1898,12 +1898,12 @@@ static void skip_emulated_instruction(s
  /*
   * KVM wants to inject page-faults which it got to the guest. This function
   * checks whether in a nested guest, we need to inject them to L1 or L2.
-  * This function assumes it is called with the exit reason in vmcs02 being
-  * a #PF exception (this is the only case in which KVM injects a #PF when L2
-  * is running).
   */
- static int nested_pf_handled(struct kvm_vcpu *vcpu)
+ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
  {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
-       /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-       if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
+       if (!(vmcs12->exception_bitmap & (1u << nr)))
                return 0;
  
        nested_vmx_vmexit(vcpu);
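The rename from nested_pf_handled() to nested_vmx_check_exception() generalizes the check from page faults only to any exception vector: if L1's exception bitmap has the bit for vector nr set, the exception is reflected to L1 instead of being injected into L2. The tail of the function is not shown above, but in sketch form it behaves as:

       if (!(get_vmcs12(vcpu)->exception_bitmap & (1u << nr)))
               return 0;               /* L1 does not intercept it: inject into L2 */
       nested_vmx_vmexit(vcpu);        /* reflect the exception to L1 */
       return 1;                       /* caller skips the L2 injection */

Note also that vmx_queue_exception() only consults this for newly raised exceptions (!reinject); events being re-injected after an interrupted delivery are not bounced back to L1.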
@@@ -1921,8 -1917,8 +1917,8 @@@ static void vmx_queue_exception(struct 
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info = nr | INTR_INFO_VALID_MASK;
  
-       if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
-           !vmx->nested.nested_run_pending && nested_pf_handled(vcpu))
+       if (!reinject && is_guest_mode(vcpu) &&
+           nested_vmx_check_exception(vcpu, nr))
                return;
  
        if (has_error_code) {
@@@ -2204,9 -2200,15 +2200,15 @@@ static __init void nested_vmx_setup_ctl
  #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
  #endif
-               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+               VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+       if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
+           !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
+               nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+               nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       }
        nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
-                                     VM_EXIT_LOAD_IA32_EFER);
+               VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
  
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
        nested_vmx_secondary_ctls_low = 0;
        nested_vmx_secondary_ctls_high &=
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+               SECONDARY_EXEC_UNRESTRICTED_GUEST |
                SECONDARY_EXEC_WBINVD_EXITING;
  
        if (enable_ept) {
@@@ -3255,29 -3258,25 +3258,29 @@@ static void vmx_decache_cr4_guest_bits(
  
  static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
  {
 +      struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 +
        if (!test_bit(VCPU_EXREG_PDPTR,
                      (unsigned long *)&vcpu->arch.regs_dirty))
                return;
  
        if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
 -              vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
 -              vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
 -              vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
 -              vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
 +              vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
 +              vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
 +              vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
 +              vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
        }
  }
  
  static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
  {
 +      struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 +
        if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
 -              vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
 -              vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
 -              vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
 -              vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
 +              mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
 +              mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
 +              mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
 +              mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
        }
  
        __set_bit(VCPU_EXREG_PDPTR,
@@@ -3380,8 -3379,10 +3383,10 @@@ static void vmx_set_cr3(struct kvm_vcp
        if (enable_ept) {
                eptp = construct_eptp(cr3);
                vmcs_write64(EPT_POINTER, eptp);
-               guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
-                       vcpu->kvm->arch.ept_identity_map_addr;
+               if (is_paging(vcpu) || is_guest_mode(vcpu))
+                       guest_cr3 = kvm_read_cr3(vcpu);
+               else
+                       guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
                ept_load_pdptrs(vcpu);
        }
  
@@@ -4879,6 -4880,17 +4884,17 @@@ vmx_patch_hypercall(struct kvm_vcpu *vc
        hypercall[2] = 0xc1;
  }
  
+ static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
+ {
+       unsigned long always_on = VMXON_CR0_ALWAYSON;
+       if (nested_vmx_secondary_ctls_high &
+               SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+           nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+               always_on &= ~(X86_CR0_PE | X86_CR0_PG);
+       return (val & always_on) == always_on;
+ }
+
  /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
  static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
  {
                val = (val & ~vmcs12->cr0_guest_host_mask) |
                        (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
  
-               /* TODO: will have to take unrestricted guest mode into
-                * account */
-               if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)
+               if (!nested_cr0_valid(vmcs12, val))
                        return 1;
  
                if (kvm_set_cr0(vcpu, val))
@@@ -5349,9 -5359,7 +5363,9 @@@ static int handle_ept_violation(struct 
         * There are errata that may cause this bit to not be set:
         * AAK134, BY25.
         */
 -      if (exit_qualification & INTR_INFO_UNBLOCK_NMI)
 +      if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
 +                      cpu_has_virtual_nmis() &&
 +                      (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
  
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
@@@ -6722,6 -6730,27 +6736,27 @@@ static void vmx_get_exit_info(struct kv
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
  }
  
+ static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+       u64 delta_tsc_l1;
+       u32 preempt_val_l1, preempt_val_l2, preempt_scale;
+       if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
+                       PIN_BASED_VMX_PREEMPTION_TIMER))
+               return;
+       preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
+                       MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
+       preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+       delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
+               - vcpu->arch.last_guest_tsc;
+       preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
+       if (preempt_val_l2 <= preempt_val_l1)
+               preempt_val_l2 = 0;
+       else
+               preempt_val_l2 -= preempt_val_l1;
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
+ }
+
  /*
   * The guest has exited.  See if we can fix it or if we need userspace
   * assistance.
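nested_adjust_preemption_timer() compensates L1's preemption timer for the time L0 (KVM) spent handling an exit: the TSC delta since the last guest entry is converted to timer ticks with the scale from IA32_VMX_MISC and subtracted from the value saved at the previous exit, clamping at zero. A self-contained restatement of that arithmetic (hypothetical helper using kernel-style types, not part of the patch):

       static u32 remaining_preemption_timer(u32 saved_l2_val, u64 tsc_in_l0, u32 scale)
       {
               u64 elapsed_ticks = tsc_in_l0 >> scale; /* timer runs at TSC rate / 2^scale */

               if (elapsed_ticks >= saved_l2_val)
                       return 0;                       /* timer would already have fired */
               return saved_l2_val - (u32)elapsed_ticks;
       }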
@@@ -6736,20 -6765,6 +6771,6 @@@ static int vmx_handle_exit(struct kvm_v
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
  
-       /*
-        * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
-        * we did not inject a still-pending event to L1 now because of
-        * nested_run_pending, we need to re-enable this bit.
-        */
-       if (vmx->nested.nested_run_pending)
-               kvm_make_request(KVM_REQ_EVENT, vcpu);
-       if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
-           exit_reason == EXIT_REASON_VMRESUME))
-               vmx->nested.nested_run_pending = 1;
-       else
-               vmx->nested.nested_run_pending = 0;
        if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
                nested_vmx_vmexit(vcpu);
                return 1;
@@@ -7061,9 -7076,9 +7082,9 @@@ static void __vmx_complete_interrupts(s
        case INTR_TYPE_HARD_EXCEPTION:
                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
                        u32 err = vmcs_read32(error_code_field);
-                       kvm_queue_exception_e(vcpu, vector, err);
+                       kvm_requeue_exception_e(vcpu, vector, err);
                } else
-                       kvm_queue_exception(vcpu, vector);
+                       kvm_requeue_exception(vcpu, vector);
                break;
        case INTR_TYPE_SOFT_INTR:
                vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
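Switching __vmx_complete_interrupts() to kvm_requeue_exception()/kvm_requeue_exception_e() marks events recovered from the IDT-vectoring field as re-injections, assuming the x86 helpers set vcpu->arch.exception.reinject as their names suggest. That pairs with the vmcs12_save_pending_event() hunk further down, which now records only such interrupted deliveries for L1, so freshly raised exceptions are no longer mistaken for them. Roughly (a simplified sketch, not the full bookkeeping):

       /* kvm_queue_exception():   newly raised, may be reflected to L1, .reinject = false
        * kvm_requeue_exception(): delivery was interrupted by the exit,  .reinject = true */
       if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject)
               idt_vectoring = nr | VECTORING_INFO_VALID_MASK;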
@@@ -7146,6 -7161,8 +7167,8 @@@ static void __noclone vmx_vcpu_run(stru
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
  
+       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
+               nested_adjust_preemption_timer(vcpu);
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
        trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
  
+       /*
+        * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
+        * we did not inject a still-pending event to L1 now because of
+        * nested_run_pending, we need to re-enable this bit.
+        */
+       if (vmx->nested.nested_run_pending)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+       vmx->nested.nested_run_pending = 0;
        vmx_complete_atomic_exit(vmx);
        vmx_recover_nmi_blocking(vmx);
        vmx_complete_interrupts(vmx);
@@@ -7501,9 -7528,9 +7534,9 @@@ static unsigned long nested_ept_get_cr3
        return get_vmcs12(vcpu)->ept_pointer;
  }
  
- static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
  {
-       int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
+       kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
                        nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
  
        vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
  
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
-       return r;
  }
  
  static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
  }
  
+ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
+               struct x86_exception *fault)
+ {
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       WARN_ON(!is_guest_mode(vcpu));
+       /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
+       if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
+               nested_vmx_vmexit(vcpu);
+       else
+               kvm_inject_page_fault(vcpu, fault);
+ }
+
  /*
   * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
   * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@@ -7533,6 -7572,7 +7578,7 @@@ static void prepare_vmcs02(struct kvm_v
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control;
+       u32 exit_control;
  
        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
        vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
         * bits are further modified by vmx_set_efer() below.
         */
-       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+       exit_control = vmcs_config.vmexit_ctrl;
+       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+               exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+       vmcs_write32(VM_EXIT_CONTROLS, exit_control);
  
        /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
         * emulated by vmx_set_efer(), below.
        kvm_set_cr3(vcpu, vmcs12->guest_cr3);
        kvm_mmu_reset_context(vcpu);
  
+       if (!enable_ept)
+               vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
        /*
         * L1 may access the L2's PDPTR, so save them to construct vmcs12
         */
                vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
                vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
 -              __clear_bit(VCPU_EXREG_PDPTR,
 -                              (unsigned long *)&vcpu->arch.regs_avail);
 -              __clear_bit(VCPU_EXREG_PDPTR,
 -                              (unsigned long *)&vcpu->arch.regs_dirty);
        }
  
        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
@@@ -7876,7 -7926,7 +7928,7 @@@ static int nested_vmx_run(struct kvm_vc
                return 1;
        }
  
-       if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+       if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
            ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
                nested_vmx_entry_failure(vcpu, vmcs12,
                        EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
  
        enter_guest_mode(vcpu);
  
+       vmx->nested.nested_run_pending = 1;
        vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
  
        cpu = get_cpu();
@@@ -8005,7 -8057,7 +8059,7 @@@ static void vmcs12_save_pending_event(s
        u32 idt_vectoring;
        unsigned int nr;
  
-       if (vcpu->arch.exception.pending) {
+       if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) {
                nr = vcpu->arch.exception.nr;
                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
  
@@@ -8105,6 -8157,11 +8159,11 @@@ static void prepare_vmcs12(struct kvm_v
        vmcs12->guest_pending_dbg_exceptions =
                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
  
+       if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
+           (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+               vmcs12->vmx_preemption_timer_value =
+                       vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
        /*
         * In some cases (usually, nested EPT), L2 is allowed to change its
         * own CR3 without exiting. If it has changed it, we must keep it.
        vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
                vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
+               vmcs12->guest_ia32_efer = vcpu->arch.efer;
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
@@@ -8201,7 -8260,7 +8262,7 @@@ static void load_vmcs12_host_state(stru
         * fpu_active (which may have changed).
         * Note that vmx_set_cr0 refers to efer set above.
         */
-       kvm_set_cr0(vcpu, vmcs12->host_cr0);
+       vmx_set_cr0(vcpu, vmcs12->host_cr0);
        /*
         * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
         * to apply the same changes to L1's vmcs. We just set cr0 correctly,
        kvm_set_cr3(vcpu, vmcs12->host_cr3);
        kvm_mmu_reset_context(vcpu);
  
+       if (!enable_ept)
+               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
        if (enable_vpid) {
                /*
                 * Trivially support vpid by letting L2s share their parent
diff --combined virt/kvm/kvm_main.c
index a9dd682cf5e3f5117de017156396337a8352914f,d469114aff097dbdc791173c6ec997a48970249c..0d20c320a33daa5b20710e2fa0f7c3ab18390d27
@@@ -70,7 -70,8 +70,8 @@@ MODULE_LICENSE("GPL")
   *            kvm->lock --> kvm->slots_lock --> kvm->irq_lock
   */
  
- DEFINE_RAW_SPINLOCK(kvm_lock);
+ DEFINE_SPINLOCK(kvm_lock);
+ static DEFINE_RAW_SPINLOCK(kvm_count_lock);
  LIST_HEAD(vm_list);
  
  static cpumask_var_t cpus_hardware_enabled;
@@@ -490,9 -491,9 +491,9 @@@ static struct kvm *kvm_create_vm(unsign
        if (r)
                goto out_err;
  
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
  
        return kvm;
  
@@@ -581,9 -582,9 +582,9 @@@ static void kvm_destroy_vm(struct kvm *
        struct mm_struct *mm = kvm->mm;
  
        kvm_arch_sync_events(kvm);
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_del(&kvm->vm_list);
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++)
                kvm_io_bus_destroy(kvm->buses[i]);
@@@ -1064,12 -1065,10 +1065,12 @@@ EXPORT_SYMBOL_GPL(gfn_to_hva)
  unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
  {
        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 -      if (writable)
 +      unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
 +
 +      if (!kvm_is_error_hva(hva) && writable)
                *writable = !memslot_is_readonly(slot);
  
 -      return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
 +      return hva;
  }
  
  static int kvm_read_hva(void *data, void __user *hva, int len)
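gfn_to_hva_prot() used to look up the memslot twice and wrote *writable even when the gfn had no slot at all; the hunk above computes the hva once and fills *writable only for a valid mapping. Callers are expected to keep checking the result as usual, e.g. (a sketch, not code from this patch):

       bool writable;
       unsigned long hva = gfn_to_hva_prot(kvm, gfn, &writable);

       if (kvm_is_error_hva(hva))
               return -EFAULT;         /* no memslot backs this gfn */
       /* hva is valid here and 'writable' reflects the slot's read-only flag */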
@@@ -2683,11 -2682,12 +2684,12 @@@ static void hardware_enable_nolock(voi
        }
  }
  
- static void hardware_enable(void *junk)
+ static void hardware_enable(void)
  {
-       raw_spin_lock(&kvm_lock);
-       hardware_enable_nolock(junk);
-       raw_spin_unlock(&kvm_lock);
+       raw_spin_lock(&kvm_count_lock);
+       if (kvm_usage_count)
+               hardware_enable_nolock(NULL);
+       raw_spin_unlock(&kvm_count_lock);
  }
  
  static void hardware_disable_nolock(void *junk)
        kvm_arch_hardware_disable(NULL);
  }
  
- static void hardware_disable(void *junk)
+ static void hardware_disable(void)
  {
-       raw_spin_lock(&kvm_lock);
-       hardware_disable_nolock(junk);
-       raw_spin_unlock(&kvm_lock);
+       raw_spin_lock(&kvm_count_lock);
+       if (kvm_usage_count)
+               hardware_disable_nolock(NULL);
+       raw_spin_unlock(&kvm_count_lock);
  }
  
  static void hardware_disable_all_nolock(void)
  
  static void hardware_disable_all(void)
  {
-       raw_spin_lock(&kvm_lock);
+       raw_spin_lock(&kvm_count_lock);
        hardware_disable_all_nolock();
-       raw_spin_unlock(&kvm_lock);
+       raw_spin_unlock(&kvm_count_lock);
  }
  
  static int hardware_enable_all(void)
  {
        int r = 0;
  
-       raw_spin_lock(&kvm_lock);
+       raw_spin_lock(&kvm_count_lock);
  
        kvm_usage_count++;
        if (kvm_usage_count == 1) {
                }
        }
  
-       raw_spin_unlock(&kvm_lock);
+       raw_spin_unlock(&kvm_count_lock);
  
        return r;
  }
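Throughout this file the old raw kvm_lock is split in two: kvm_lock becomes an ordinary spinlock guarding vm_list and the debugfs stat walkers, while the new raw kvm_count_lock protects kvm_usage_count and the hardware enable/disable paths that can run from CPU-hotplug context. That is also why the early '!kvm_usage_count' bail-out disappears from kvm_cpu_hotplug() below: the check now sits under the lock, as in the enable path taken from the diff above:

       static void hardware_enable(void)
       {
               raw_spin_lock(&kvm_count_lock);
               if (kvm_usage_count)                    /* only if at least one VM exists */
                       hardware_enable_nolock(NULL);
               raw_spin_unlock(&kvm_count_lock);
       }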
@@@ -2750,20 -2751,17 +2753,17 @@@ static int kvm_cpu_hotplug(struct notif
  {
        int cpu = (long)v;
  
-       if (!kvm_usage_count)
-               return NOTIFY_OK;
        val &= ~CPU_TASKS_FROZEN;
        switch (val) {
        case CPU_DYING:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
-               hardware_disable(NULL);
+               hardware_disable();
                break;
        case CPU_STARTING:
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                       cpu);
-               hardware_enable(NULL);
+               hardware_enable();
                break;
        }
        return NOTIFY_OK;
@@@ -3056,10 -3054,10 +3056,10 @@@ static int vm_stat_get(void *_offset, u
        struct kvm *kvm;
  
        *val = 0;
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                *val += *(u32 *)((void *)kvm + offset);
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
        return 0;
  }
  
@@@ -3073,12 -3071,12 +3073,12 @@@ static int vcpu_stat_get(void *_offset
        int i;
  
        *val = 0;
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                kvm_for_each_vcpu(i, vcpu, kvm)
                        *val += *(u32 *)((void *)vcpu + offset);
  
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
        return 0;
  }
  
@@@ -3133,7 -3131,7 +3133,7 @@@ static int kvm_suspend(void
  static void kvm_resume(void)
  {
        if (kvm_usage_count) {
-               WARN_ON(raw_spin_is_locked(&kvm_lock));
+               WARN_ON(raw_spin_is_locked(&kvm_count_lock));
                hardware_enable_nolock(NULL);
        }
  }