]> git.karo-electronics.de Git - mv-sheeva.git/blobdiff - arch/x86/kvm/vmx.c
Merge tag 'v2.6.38' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
[mv-sheeva.git] / arch / x86 / kvm / vmx.c
index 81fcbe9515c59d7e68734d2d4c53fa23bb72624a..bf89ec2cfb823ceff3bcae33f00bf1cecfab10c5 100644 (file)
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static int __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
+static int __read_mostly yield_on_hlt = 1;
+module_param(yield_on_hlt, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                          \
        (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK                                             \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 
+static bool cpu_has_load_ia32_efer;
+
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
 
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
        u8 error;
 
        asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
-                     : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+                     : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
                      : "cc", "memory");
        if (error)
                printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
        u8 error;
 
        asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-                       : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
                        : "cc", "memory");
        if (error)
                printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 
 static unsigned long vmcs_readl(unsigned long field)
 {
-       unsigned long value;
+       unsigned long value = 0;
 
        asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
-                     : "=a"(value) : "d"(field) : "cc");
+                     : "+a"(value) : "d"(field) : "cc");
        return value;
 }
 
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
        unsigned i;
        struct msr_autoload *m = &vmx->msr_autoload;
 
+       if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+               vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+               vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+               return;
+       }
+
        for (i = 0; i < m->nr; ++i)
                if (m->guest[i].index == msr)
                        break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
        unsigned i;
        struct msr_autoload *m = &vmx->msr_autoload;
 
+       if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+               vmcs_write64(GUEST_IA32_EFER, guest_val);
+               vmcs_write64(HOST_IA32_EFER, host_val);
+               vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+               vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+               return;
+       }
+
        for (i = 0; i < m->nr; ++i)
                if (m->guest[i].index == msr)
                        break;
@@ -1009,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+       /* Ensure that we clear the HLT state in the VMCS.  We don't need to
+        * explicitly skip the instruction because if the HLT state is set, then
+        * the instruction is already executing and RIP has already been
+        * advanced. */
+       if (!yield_on_hlt &&
+           vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+               vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
                                bool has_error_code, u32 error_code,
                                bool reinject)
@@ -1035,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
                intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+       vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -1305,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
                        && tboot_enabled())
                        return 1;
                if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
-                       && !tboot_enabled())
+                       && !tboot_enabled()) {
+                       printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+                               " activate TXT before enabling KVM\n");
                        return 1;
+               }
        }
 
        return 0;
@@ -1400,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
        return 0;
 }
 
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+       u32 vmx_msr_low, vmx_msr_high;
+
+       rdmsr(msr, vmx_msr_low, vmx_msr_high);
+       return vmx_msr_high & ctl;
+}
+
 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 {
        u32 vmx_msr_low, vmx_msr_high;
@@ -1416,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                                &_pin_based_exec_control) < 0)
                return -EIO;
 
-       min = CPU_BASED_HLT_EXITING |
+       min =
 #ifdef CONFIG_X86_64
              CPU_BASED_CR8_LOAD_EXITING |
              CPU_BASED_CR8_STORE_EXITING |
@@ -1429,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
              CPU_BASED_MWAIT_EXITING |
              CPU_BASED_MONITOR_EXITING |
              CPU_BASED_INVLPG_EXITING;
+
+       if (yield_on_hlt)
+               min |= CPU_BASED_HLT_EXITING;
+
        opt = CPU_BASED_TPR_SHADOW |
              CPU_BASED_USE_MSR_BITMAPS |
              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1510,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
        vmcs_conf->vmexit_ctrl         = _vmexit_control;
        vmcs_conf->vmentry_ctrl        = _vmentry_control;
 
+       cpu_has_load_ia32_efer =
+               allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+                               VM_ENTRY_LOAD_IA32_EFER)
+               && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+                                  VM_EXIT_LOAD_IA32_EFER);
+
        return 0;
 }
 
@@ -1683,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
        save->limit = vmcs_read32(sf->limit);
        save->ar = vmcs_read32(sf->ar_bytes);
        vmcs_write16(sf->selector, save->base >> 4);
-       vmcs_write32(sf->base, save->base & 0xfffff);
+       vmcs_write32(sf->base, save->base & 0xffff0);
        vmcs_write32(sf->limit, 0xffff);
        vmcs_write32(sf->ar_bytes, 0xf3);
+       if (save->base & 0xf)
+               printk_once(KERN_WARNING "kvm: segment base is not paragraph"
+                           " aligned when entering protected mode (seg=%d)",
+                           seg);
 }
 
 static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1814,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
        vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
 }
 
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+       if (enable_ept && is_paging(vcpu))
+               vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
        ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1857,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
                                        unsigned long cr0,
                                        struct kvm_vcpu *vcpu)
 {
+       vmx_decache_cr3(vcpu);
        if (!(cr0 & X86_CR0_PG)) {
                /* From paging/starting to nonpaging */
                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1937,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (enable_ept) {
                eptp = construct_eptp(cr3);
                vmcs_write64(EPT_POINTER, eptp);
-               guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+               guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
                        vcpu->kvm->arch.ept_identity_map_addr;
                ept_load_pdptrs(vcpu);
        }
@@ -2725,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmcs_writel(GUEST_IDTR_BASE, 0);
        vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
 
-       vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+       vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
@@ -2787,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
                return;
        }
 
+       if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+               enable_irq_window(vcpu);
+               return;
+       }
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2814,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
        } else
                intr |= INTR_TYPE_EXT_INTR;
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+       vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2841,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
        }
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+       vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2849,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
                return 0;
 
        return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-                       (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
+                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+                  | GUEST_INTR_STATE_NMI));
 }
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2910,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
         * Cause the #SS fault with 0 error code in VM86 mode.
         */
        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-               if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
+               if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
                        return 1;
        /*
         * Forward all other exceptions that are valid in real mode.
@@ -3007,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
        }
 
        if (is_invalid_opcode(intr_info)) {
-               er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
+               er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
                if (er != EMULATE_DONE)
                        kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
@@ -3026,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 
                if (kvm_event_needs_reinjection(vcpu))
                        kvm_mmu_unprotect_page_virt(vcpu, cr2);
-               return kvm_mmu_page_fault(vcpu, cr2, error_code);
+               return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
        }
 
        if (vmx->rmode.vm86_active &&
@@ -3098,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
        ++vcpu->stat.io_exits;
 
        if (string || in)
-               return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+               return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 
        port = exit_qualification >> 16;
        size = (exit_qualification & 7) + 1;
@@ -3118,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[2] = 0xc1;
 }
 
-static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
-       if (err)
-               kvm_inject_gp(vcpu, 0);
-       else
-               skip_emulated_instruction(vcpu);
-}
-
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification, val;
@@ -3143,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                switch (cr) {
                case 0:
                        err = kvm_set_cr0(vcpu, val);
-                       complete_insn_gp(vcpu, err);
+                       kvm_complete_insn_gp(vcpu, err);
                        return 1;
                case 3:
                        err = kvm_set_cr3(vcpu, val);
-                       complete_insn_gp(vcpu, err);
+                       kvm_complete_insn_gp(vcpu, err);
                        return 1;
                case 4:
                        err = kvm_set_cr4(vcpu, val);
-                       complete_insn_gp(vcpu, err);
+                       kvm_complete_insn_gp(vcpu, err);
                        return 1;
                case 8: {
                                u8 cr8_prev = kvm_get_cr8(vcpu);
                                u8 cr8 = kvm_register_read(vcpu, reg);
-                               kvm_set_cr8(vcpu, cr8);
-                               skip_emulated_instruction(vcpu);
+                               err = kvm_set_cr8(vcpu, cr8);
+                               kvm_complete_insn_gp(vcpu, err);
                                if (irqchip_in_kernel(vcpu->kvm))
                                        return 1;
                                if (cr8_prev <= cr8)
@@ -3176,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
        case 1: /*mov from cr*/
                switch (cr) {
                case 3:
-                       kvm_register_write(vcpu, reg, vcpu->arch.cr3);
-                       trace_kvm_cr_read(cr, vcpu->arch.cr3);
+                       val = kvm_read_cr3(vcpu);
+                       kvm_register_write(vcpu, reg, val);
+                       trace_kvm_cr_read(cr, val);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
@@ -3349,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+       return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3377,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
-       return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+       return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
 
 static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3476,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(gpa, exit_qualification);
-       return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+       return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
 }
 
 static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3592,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                    && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
                        return handle_interrupt_window(&vmx->vcpu);
 
-               err = emulate_instruction(vcpu, 0, 0, 0);
+               err = emulate_instruction(vcpu, 0);
 
                if (err == EMULATE_DO_MMIO) {
                        ret = 0;
@@ -3649,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
        [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = handle_halt,
+       [EXIT_REASON_INVD]                    = handle_invd,
        [EXIT_REASON_INVLPG]                  = handle_invlpg,
        [EXIT_REASON_VMCALL]                  = handle_vmcall,
        [EXIT_REASON_VMCLEAR]                 = handle_vmx_insn,
@@ -3676,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
        ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+       *info1 = vmcs_readl(EXIT_QUALIFICATION);
+       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -3686,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        u32 exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;
 
-       trace_kvm_exit(exit_reason, vcpu);
+       trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
 
        /* If guest state is invalid, start emulating */
        if (vmx->emulation_required && emulate_invalid_guest_state)
                return handle_invalid_guest_state(vcpu);
 
-       /* Access CR3 don't cause VMExit in paging mode, so we need
-        * to sync with guest real CR3. */
-       if (enable_ept && is_paging(vcpu))
-               vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4013,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
              );
 
        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
-                                 | (1 << VCPU_EXREG_PDPTR));
+                                 | (1 << VCPU_EXREG_PDPTR)
+                                 | (1 << VCPU_EXREG_CR3));
        vcpu->arch.regs_dirty = 0;
 
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4280,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .get_cpl = vmx_get_cpl,
        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
        .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+       .decache_cr3 = vmx_decache_cr3,
        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
        .set_cr3 = vmx_set_cr3,
@@ -4320,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .get_tdp_level = get_ept_level,
        .get_mt_mask = vmx_get_mt_mask,
 
+       .get_exit_info = vmx_get_exit_info,
        .exit_reasons_str = vmx_exit_reasons_str,
+
        .get_lpage_level = vmx_get_lpage_level,
 
        .cpuid_update = vmx_cpuid_update,
@@ -4396,8 +4472,6 @@ static int __init vmx_init(void)
 
        if (enable_ept) {
                bypass_guest_pf = 0;
-               kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
-                       VMX_EPT_WRITABLE_MASK);
                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
                                VMX_EPT_EXECUTABLE_MASK);
                kvm_enable_tdp();