arch/x86/kvm/x86.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * derived from drivers/kvm/kvm_main.c
   5  *
   6  * Copyright (C) 2006 Qumranet, Inc.
   7  * Copyright (C) 2008 Qumranet, Inc.
   8  * Copyright IBM Corporation, 2008
   9  *
  10  * Authors:
  11  *   Avi Kivity   <avi@qumranet.com>
  12  *   Yaniv Kamay  <yaniv@qumranet.com>
  13  *   Amit Shah    <amit.shah@qumranet.com>
  14  *   Ben-Ami Yassour <benami@il.ibm.com>
  15  *
  16  * This work is licensed under the terms of the GNU GPL, version 2.  See
  17  * the COPYING file in the top-level directory.
  18  *
  19  */
  20
  21 #include <linux/kvm_host.h>
  22 #include "irq.h"
  23 #include "mmu.h"
  24 #include "i8254.h"
  25 #include "tss.h"
  26 #include "kvm_cache_regs.h"
  27 #include "x86.h"
  28
  29 #include <linux/clocksource.h>
  30 #include <linux/interrupt.h>
  31 #include <linux/kvm.h>
  32 #include <linux/fs.h>
  33 #include <linux/vmalloc.h>
  34 #include <linux/module.h>
  35 #include <linux/mman.h>
  36 #include <linux/highmem.h>
  37 #include <linux/iommu.h>
  38 #include <linux/intel-iommu.h>
  39 #include <linux/cpufreq.h>
  40 #include <trace/events/kvm.h>
  41 #undef TRACE_INCLUDE_FILE
  42 #define CREATE_TRACE_POINTS
  43 #include "trace.h"
  44
  45 #include <asm/uaccess.h>
  46 #include <asm/msr.h>
  47 #include <asm/desc.h>
  48 #include <asm/mtrr.h>
  49 #include <asm/mce.h>
  50
  51 #define MAX_IO_MSRS 256
  52 #define CR0_RESERVED_BITS                                               \
  53         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  54                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  55                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  56 #define CR4_RESERVED_BITS                                               \
  57         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  58                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  59                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  60                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  61
  62 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  63
  64 #define KVM_MAX_MCE_BANKS 32
  65 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
  66
  67 /* EFER defaults:
  68  * - enable syscall per default because its emulated by KVM
  69  * - enable LME and LMA per default on 64 bit KVM
  70  */
  71 #ifdef CONFIG_X86_64
  72 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  73 #else
  74 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  75 #endif
  76
  77 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  78 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  79
  80 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  81 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  82                                     struct kvm_cpuid_entry2 __user *entries);
  83
  84 struct kvm_x86_ops *kvm_x86_ops;
  85 EXPORT_SYMBOL_GPL(kvm_x86_ops);
  86
  87 int ignore_msrs = 0;
  88 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
  89
  90 struct kvm_stats_debugfs_item debugfs_entries[] = {
  91         { "pf_fixed", VCPU_STAT(pf_fixed) },
  92         { "pf_guest", VCPU_STAT(pf_guest) },
  93         { "tlb_flush", VCPU_STAT(tlb_flush) },
  94         { "invlpg", VCPU_STAT(invlpg) },
  95         { "exits", VCPU_STAT(exits) },
  96         { "io_exits", VCPU_STAT(io_exits) },
  97         { "mmio_exits", VCPU_STAT(mmio_exits) },
  98         { "signal_exits", VCPU_STAT(signal_exits) },
  99         { "irq_window", VCPU_STAT(irq_window_exits) },
 100         { "nmi_window", VCPU_STAT(nmi_window_exits) },
 101         { "halt_exits", VCPU_STAT(halt_exits) },
 102         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
 103         { "hypercalls", VCPU_STAT(hypercalls) },
 104         { "request_irq", VCPU_STAT(request_irq_exits) },
 105         { "irq_exits", VCPU_STAT(irq_exits) },
 106         { "host_state_reload", VCPU_STAT(host_state_reload) },
 107         { "efer_reload", VCPU_STAT(efer_reload) },
 108         { "fpu_reload", VCPU_STAT(fpu_reload) },
 109         { "insn_emulation", VCPU_STAT(insn_emulation) },
 110         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 111         { "irq_injections", VCPU_STAT(irq_injections) },
 112         { "nmi_injections", VCPU_STAT(nmi_injections) },
 113         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 114         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
 115         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
 116         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
 117         { "mmu_flooded", VM_STAT(mmu_flooded) },
 118         { "mmu_recycled", VM_STAT(mmu_recycled) },
 119         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 120         { "mmu_unsync", VM_STAT(mmu_unsync) },
 121         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 122         { "largepages", VM_STAT(lpages) },
 123         { NULL }
 124 };
 125
 126 unsigned long segment_base(u16 selector)
 127 {
 128         struct descriptor_table gdt;
 129         struct desc_struct *d;
 130         unsigned long table_base;
 131         unsigned long v;
 132
 133         if (selector == 0)
 134                 return 0;
 135
 136         kvm_get_gdt(&gdt);
 137         table_base = gdt.base;
 138
 139         if (selector & 4) {           /* from ldt */
 140                 u16 ldt_selector = kvm_read_ldt();
 141
 142                 table_base = segment_base(ldt_selector);
 143         }
 144         d = (struct desc_struct *)(table_base + (selector & ~7));
 145         v = get_desc_base(d);
 146 #ifdef CONFIG_X86_64
 147         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 148                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 149 #endif
 150         return v;
 151 }
 152 EXPORT_SYMBOL_GPL(segment_base);
 153
 154 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 155 {
 156         if (irqchip_in_kernel(vcpu->kvm))
 157                 return vcpu->arch.apic_base;
 158         else
 159                 return vcpu->arch.apic_base;
 160 }
 161 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 162
 163 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 164 {
 165         /* TODO: reserve bits check */
 166         if (irqchip_in_kernel(vcpu->kvm))
 167                 kvm_lapic_set_base(vcpu, data);
 168         else
 169                 vcpu->arch.apic_base = data;
 170 }
 171 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 172
 173 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 174 {
 175         WARN_ON(vcpu->arch.exception.pending);
 176         vcpu->arch.exception.pending = true;
 177         vcpu->arch.exception.has_error_code = false;
 178         vcpu->arch.exception.nr = nr;
 179 }
 180 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 181
 182 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 183                            u32 error_code)
 184 {
 185         ++vcpu->stat.pf_guest;
 186
 187         if (vcpu->arch.exception.pending) {
 188                 switch(vcpu->arch.exception.nr) {
 189                 case DF_VECTOR:
 190                         /* triple fault -> shutdown */
 191                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 192                         return;
 193                 case PF_VECTOR:
 194                         vcpu->arch.exception.nr = DF_VECTOR;
 195                         vcpu->arch.exception.error_code = 0;
 196                         return;
 197                 default:
 198                         /* replace previous exception with a new one in a hope
 199                            that instruction re-execution will regenerate lost
 200                            exception */
 201                         vcpu->arch.exception.pending = false;
 202                         break;
 203                 }
 204         }
 205         vcpu->arch.cr2 = addr;
 206         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 207 }
 208
 209 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 210 {
 211         vcpu->arch.nmi_pending = 1;
 212 }
 213 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 214
 215 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 216 {
 217         WARN_ON(vcpu->arch.exception.pending);
 218         vcpu->arch.exception.pending = true;
 219         vcpu->arch.exception.has_error_code = true;
 220         vcpu->arch.exception.nr = nr;
 221         vcpu->arch.exception.error_code = error_code;
 222 }
 223 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 224
 225 /*
 226  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 227  * a #GP and return false.
 228  */
 229 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 230 {
 231         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
 232                 return true;
 233         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 234         return false;
 235 }
 236 EXPORT_SYMBOL_GPL(kvm_require_cpl);
 237
 238 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 239 {
 240         unsigned long rflags;
 241
 242         rflags = kvm_x86_ops->get_rflags(vcpu);
 243         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 244                 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
 245         return rflags;
 246 }
 247 EXPORT_SYMBOL_GPL(kvm_get_rflags);
 248
 249 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 250 {
 251         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 252                 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
 253         kvm_x86_ops->set_rflags(vcpu, rflags);
 254 }
 255 EXPORT_SYMBOL_GPL(kvm_set_rflags);
 256
 257 /*
 258  * Load the pae pdptrs.  Return true is they are all valid.
 259  */
 260 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 261 {
 262         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 263         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 264         int i;
 265         int ret;
 266         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 267
 268         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 269                                   offset * sizeof(u64), sizeof(pdpte));
 270         if (ret < 0) {
 271                 ret = 0;
 272                 goto out;
 273         }
 274         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 275                 if (is_present_gpte(pdpte[i]) &&
 276                     (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 277                         ret = 0;
 278                         goto out;
 279                 }
 280         }
 281         ret = 1;
 282
 283         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 284         __set_bit(VCPU_EXREG_PDPTR,
 285                   (unsigned long *)&vcpu->arch.regs_avail);
 286         __set_bit(VCPU_EXREG_PDPTR,
 287                   (unsigned long *)&vcpu->arch.regs_dirty);
 288 out:
 289
 290         return ret;
 291 }
 292 EXPORT_SYMBOL_GPL(load_pdptrs);
 293
 294 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 295 {
 296         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 297         bool changed = true;
 298         int r;
 299
 300         if (is_long_mode(vcpu) || !is_pae(vcpu))
 301                 return false;
 302
 303         if (!test_bit(VCPU_EXREG_PDPTR,
 304                       (unsigned long *)&vcpu->arch.regs_avail))
 305                 return true;
 306
 307         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 308         if (r < 0)
 309                 goto out;
 310         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 311 out:
 312
 313         return changed;
 314 }
 315
 316 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 317 {
 318         if (cr0 & CR0_RESERVED_BITS) {
 319                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 320                        cr0, vcpu->arch.cr0);
 321                 kvm_inject_gp(vcpu, 0);
 322                 return;
 323         }
 324
 325         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 326                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 327                 kvm_inject_gp(vcpu, 0);
 328                 return;
 329         }
 330
 331         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 332                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 333                        "and a clear PE flag\n");
 334                 kvm_inject_gp(vcpu, 0);
 335                 return;
 336         }
 337
 338         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 339 #ifdef CONFIG_X86_64
 340                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
 341                         int cs_db, cs_l;
 342
 343                         if (!is_pae(vcpu)) {
 344                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 345                                        "in long mode while PAE is disabled\n");
 346                                 kvm_inject_gp(vcpu, 0);
 347                                 return;
 348                         }
 349                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 350                         if (cs_l) {
 351                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 352                                        "in long mode while CS.L == 1\n");
 353                                 kvm_inject_gp(vcpu, 0);
 354                                 return;
 355
 356                         }
 357                 } else
 358 #endif
 359                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 360                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 361                                "reserved bits\n");
 362                         kvm_inject_gp(vcpu, 0);
 363                         return;
 364                 }
 365
 366         }
 367
 368         kvm_x86_ops->set_cr0(vcpu, cr0);
 369         vcpu->arch.cr0 = cr0;
 370
 371         kvm_mmu_reset_context(vcpu);
 372         return;
 373 }
 374 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 375
 376 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 377 {
 378         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 379 }
 380 EXPORT_SYMBOL_GPL(kvm_lmsw);
 381
 382 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 383 {
 384         unsigned long old_cr4 = vcpu->arch.cr4;
 385         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 386
 387         if (cr4 & CR4_RESERVED_BITS) {
 388                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 389                 kvm_inject_gp(vcpu, 0);
 390                 return;
 391         }
 392
 393         if (is_long_mode(vcpu)) {
 394                 if (!(cr4 & X86_CR4_PAE)) {
 395                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 396                                "in long mode\n");
 397                         kvm_inject_gp(vcpu, 0);
 398                         return;
 399                 }
 400         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 401                    && ((cr4 ^ old_cr4) & pdptr_bits)
 402                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 403                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 404                 kvm_inject_gp(vcpu, 0);
 405                 return;
 406         }
 407
 408         if (cr4 & X86_CR4_VMXE) {
 409                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 410                 kvm_inject_gp(vcpu, 0);
 411                 return;
 412         }
 413         kvm_x86_ops->set_cr4(vcpu, cr4);
 414         vcpu->arch.cr4 = cr4;
 415         vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
 416         kvm_mmu_reset_context(vcpu);
 417 }
 418 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 419
 420 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 421 {
 422         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 423                 kvm_mmu_sync_roots(vcpu);
 424                 kvm_mmu_flush_tlb(vcpu);
 425                 return;
 426         }
 427
 428         if (is_long_mode(vcpu)) {
 429                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 430                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 431                         kvm_inject_gp(vcpu, 0);
 432                         return;
 433                 }
 434         } else {
 435                 if (is_pae(vcpu)) {
 436                         if (cr3 & CR3_PAE_RESERVED_BITS) {
 437                                 printk(KERN_DEBUG
 438                                        "set_cr3: #GP, reserved bits\n");
 439                                 kvm_inject_gp(vcpu, 0);
 440                                 return;
 441                         }
 442                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 443                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 444                                        "reserved bits\n");
 445                                 kvm_inject_gp(vcpu, 0);
 446                                 return;
 447                         }
 448                 }
 449                 /*
 450                  * We don't check reserved bits in nonpae mode, because
 451                  * this isn't enforced, and VMware depends on this.
 452                  */
 453         }
 454
 455         /*
 456          * Does the new cr3 value map to physical memory? (Note, we
 457          * catch an invalid cr3 even in real-mode, because it would
 458          * cause trouble later on when we turn on paging anyway.)
 459          *
 460          * A real CPU would silently accept an invalid cr3 and would
 461          * attempt to use it - with largely undefined (and often hard
 462          * to debug) behavior on the guest side.
 463          */
 464         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 465                 kvm_inject_gp(vcpu, 0);
 466         else {
 467                 vcpu->arch.cr3 = cr3;
 468                 vcpu->arch.mmu.new_cr3(vcpu);
 469         }
 470 }
 471 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 472
 473 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 474 {
 475         if (cr8 & CR8_RESERVED_BITS) {
 476                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 477                 kvm_inject_gp(vcpu, 0);
 478                 return;
 479         }
 480         if (irqchip_in_kernel(vcpu->kvm))
 481                 kvm_lapic_set_tpr(vcpu, cr8);
 482         else
 483                 vcpu->arch.cr8 = cr8;
 484 }
 485 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 486
 487 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 488 {
 489         if (irqchip_in_kernel(vcpu->kvm))
 490                 return kvm_lapic_get_cr8(vcpu);
 491         else
 492                 return vcpu->arch.cr8;
 493 }
 494 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 495
 496 static inline u32 bit(int bitno)
 497 {
 498         return 1 << (bitno & 31);
 499 }
 500
 501 /*
 502  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 503  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 504  *
 505  * This list is modified at module load time to reflect the
 506  * capabilities of the host cpu. This capabilities test skips MSRs that are
 507  * kvm-specific. Those are put in the beginning of the list.
 508  */
 509
 510 #define KVM_SAVE_MSRS_BEGIN     2
 511 static u32 msrs_to_save[] = {
 512         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 513         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 514         MSR_K6_STAR,
 515 #ifdef CONFIG_X86_64
 516         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 517 #endif
 518         MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 519 };
 520
 521 static unsigned num_msrs_to_save;
 522
 523 static u32 emulated_msrs[] = {
 524         MSR_IA32_MISC_ENABLE,
 525 };
 526
 527 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 528 {
 529         if (efer & efer_reserved_bits) {
 530                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 531                        efer);
 532                 kvm_inject_gp(vcpu, 0);
 533                 return;
 534         }
 535
 536         if (is_paging(vcpu)
 537             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
 538                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
 539                 kvm_inject_gp(vcpu, 0);
 540                 return;
 541         }
 542
 543         if (efer & EFER_FFXSR) {
 544                 struct kvm_cpuid_entry2 *feat;
 545
 546                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 547                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
 548                         printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
 549                         kvm_inject_gp(vcpu, 0);
 550                         return;
 551                 }
 552         }
 553
 554         if (efer & EFER_SVME) {
 555                 struct kvm_cpuid_entry2 *feat;
 556
 557                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 558                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
 559                         printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
 560                         kvm_inject_gp(vcpu, 0);
 561                         return;
 562                 }
 563         }
 564
 565         kvm_x86_ops->set_efer(vcpu, efer);
 566
 567         efer &= ~EFER_LMA;
 568         efer |= vcpu->arch.shadow_efer & EFER_LMA;
 569
 570         vcpu->arch.shadow_efer = efer;
 571
 572         vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 573         kvm_mmu_reset_context(vcpu);
 574 }
 575
 576 void kvm_enable_efer_bits(u64 mask)
 577 {
 578        efer_reserved_bits &= ~mask;
 579 }
 580 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 581
 582
 583 /*
 584  * Writes msr value into into the appropriate "register".
 585  * Returns 0 on success, non-0 otherwise.
 586  * Assumes vcpu_load() was already called.
 587  */
 588 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 589 {
 590         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
 591 }
 592
 593 /*
 594  * Adapt set_msr() to msr_io()'s calling convention
 595  */
 596 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 597 {
 598         return kvm_set_msr(vcpu, index, *data);
 599 }
 600
 601 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 602 {
 603         static int version;
 604         struct pvclock_wall_clock wc;
 605         struct timespec now, sys, boot;
 606
 607         if (!wall_clock)
 608                 return;
 609
 610         version++;
 611
 612         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 613
 614         /*
 615          * The guest calculates current wall clock time by adding
 616          * system time (updated by kvm_write_guest_time below) to the
 617          * wall clock specified here.  guest system time equals host
 618          * system time for us, thus we must fill in host boot time here.
 619          */
 620         now = current_kernel_time();
 621         ktime_get_ts(&sys);
 622         boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
 623
 624         wc.sec = boot.tv_sec;
 625         wc.nsec = boot.tv_nsec;
 626         wc.version = version;
 627
 628         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 629
 630         version++;
 631         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 632 }
 633
 634 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 635 {
 636         uint32_t quotient, remainder;
 637
 638         /* Don't try to replace with do_div(), this one calculates
 639          * "(dividend << 32) / divisor" */
 640         __asm__ ( "divl %4"
 641                   : "=a" (quotient), "=d" (remainder)
 642                   : "0" (0), "1" (dividend), "r" (divisor) );
 643         return quotient;
 644 }
 645
 646 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
 647 {
 648         uint64_t nsecs = 1000000000LL;
 649         int32_t  shift = 0;
 650         uint64_t tps64;
 651         uint32_t tps32;
 652
 653         tps64 = tsc_khz * 1000LL;
 654         while (tps64 > nsecs*2) {
 655                 tps64 >>= 1;
 656                 shift--;
 657         }
 658
 659         tps32 = (uint32_t)tps64;
 660         while (tps32 <= (uint32_t)nsecs) {
 661                 tps32 <<= 1;
 662                 shift++;
 663         }
 664
 665         hv_clock->tsc_shift = shift;
 666         hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
 667
 668         pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
 669                  __func__, tsc_khz, hv_clock->tsc_shift,
 670                  hv_clock->tsc_to_system_mul);
 671 }
 672
 673 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 674
 675 static void kvm_write_guest_time(struct kvm_vcpu *v)
 676 {
 677         struct timespec ts;
 678         unsigned long flags;
 679         struct kvm_vcpu_arch *vcpu = &v->arch;
 680         void *shared_kaddr;
 681         unsigned long this_tsc_khz;
 682
 683         if ((!vcpu->time_page))
 684                 return;
 685
 686         this_tsc_khz = get_cpu_var(cpu_tsc_khz);
 687         if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
 688                 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
 689                 vcpu->hv_clock_tsc_khz = this_tsc_khz;
 690         }
 691         put_cpu_var(cpu_tsc_khz);
 692
 693         /* Keep irq disabled to prevent changes to the clock */
 694         local_irq_save(flags);
 695         kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
 696         ktime_get_ts(&ts);
 697         local_irq_restore(flags);
 698
 699         /* With all the info we got, fill in the values */
 700
 701         vcpu->hv_clock.system_time = ts.tv_nsec +
 702                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
 703         /*
 704          * The interface expects us to write an even number signaling that the
 705          * update is finished. Since the guest won't see the intermediate
 706          * state, we just increase by 2 at the end.
 707          */
 708         vcpu->hv_clock.version += 2;
 709
 710         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 711
 712         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 713                sizeof(vcpu->hv_clock));
 714
 715         kunmap_atomic(shared_kaddr, KM_USER0);
 716
 717         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 718 }
 719
 720 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 721 {
 722         struct kvm_vcpu_arch *vcpu = &v->arch;
 723
 724         if (!vcpu->time_page)
 725                 return 0;
 726         set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
 727         return 1;
 728 }
 729
 730 static bool msr_mtrr_valid(unsigned msr)
 731 {
 732         switch (msr) {
 733         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
 734         case MSR_MTRRfix64K_00000:
 735         case MSR_MTRRfix16K_80000:
 736         case MSR_MTRRfix16K_A0000:
 737         case MSR_MTRRfix4K_C0000:
 738         case MSR_MTRRfix4K_C8000:
 739         case MSR_MTRRfix4K_D0000:
 740         case MSR_MTRRfix4K_D8000:
 741         case MSR_MTRRfix4K_E0000:
 742         case MSR_MTRRfix4K_E8000:
 743         case MSR_MTRRfix4K_F0000:
 744         case MSR_MTRRfix4K_F8000:
 745         case MSR_MTRRdefType:
 746         case MSR_IA32_CR_PAT:
 747                 return true;
 748         case 0x2f8:
 749                 return true;
 750         }
 751         return false;
 752 }
 753
 754 static bool valid_pat_type(unsigned t)
 755 {
 756         return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
 757 }
 758
 759 static bool valid_mtrr_type(unsigned t)
 760 {
 761         return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
 762 }
 763
 764 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 765 {
 766         int i;
 767
 768         if (!msr_mtrr_valid(msr))
 769                 return false;
 770
 771         if (msr == MSR_IA32_CR_PAT) {
 772                 for (i = 0; i < 8; i++)
 773                         if (!valid_pat_type((data >> (i * 8)) & 0xff))
 774                                 return false;
 775                 return true;
 776         } else if (msr == MSR_MTRRdefType) {
 777                 if (data & ~0xcff)
 778                         return false;
 779                 return valid_mtrr_type(data & 0xff);
 780         } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
 781                 for (i = 0; i < 8 ; i++)
 782                         if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
 783                                 return false;
 784                 return true;
 785         }
 786
 787         /* variable MTRRs */
 788         return valid_mtrr_type(data & 0xff);
 789 }
 790
 791 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 792 {
 793         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 794
 795         if (!mtrr_valid(vcpu, msr, data))
 796                 return 1;
 797
 798         if (msr == MSR_MTRRdefType) {
 799                 vcpu->arch.mtrr_state.def_type = data;
 800                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
 801         } else if (msr == MSR_MTRRfix64K_00000)
 802                 p[0] = data;
 803         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 804                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
 805         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 806                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
 807         else if (msr == MSR_IA32_CR_PAT)
 808                 vcpu->arch.pat = data;
 809         else {  /* Variable MTRRs */
 810                 int idx, is_mtrr_mask;
 811                 u64 *pt;
 812
 813                 idx = (msr - 0x200) / 2;
 814                 is_mtrr_mask = msr - 0x200 - 2 * idx;
 815                 if (!is_mtrr_mask)
 816                         pt =
 817                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 818                 else
 819                         pt =
 820                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 821                 *pt = data;
 822         }
 823
 824         kvm_mmu_reset_context(vcpu);
 825         return 0;
 826 }
 827
 828 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 829 {
 830         u64 mcg_cap = vcpu->arch.mcg_cap;
 831         unsigned bank_num = mcg_cap & 0xff;
 832
 833         switch (msr) {
 834         case MSR_IA32_MCG_STATUS:
 835                 vcpu->arch.mcg_status = data;
 836                 break;
 837         case MSR_IA32_MCG_CTL:
 838                 if (!(mcg_cap & MCG_CTL_P))
 839                         return 1;
 840                 if (data != 0 && data != ~(u64)0)
 841                         return -1;
 842                 vcpu->arch.mcg_ctl = data;
 843                 break;
 844         default:
 845                 if (msr >= MSR_IA32_MC0_CTL &&
 846                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
 847                         u32 offset = msr - MSR_IA32_MC0_CTL;
 848                         /* only 0 or all 1s can be written to IA32_MCi_CTL */
 849                         if ((offset & 0x3) == 0 &&
 850                             data != 0 && data != ~(u64)0)
 851                                 return -1;
 852                         vcpu->arch.mce_banks[offset] = data;
 853                         break;
 854                 }
 855                 return 1;
 856         }
 857         return 0;
 858 }
 859
 860 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 861 {
 862         switch (msr) {
 863         case MSR_EFER:
 864                 set_efer(vcpu, data);
 865                 break;
 866         case MSR_K7_HWCR:
 867                 data &= ~(u64)0x40;     /* ignore flush filter disable */
 868                 if (data != 0) {
 869                         pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 870                                 data);
 871                         return 1;
 872                 }
 873                 break;
 874         case MSR_FAM10H_MMIO_CONF_BASE:
 875                 if (data != 0) {
 876                         pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
 877                                 "0x%llx\n", data);
 878                         return 1;
 879                 }
 880                 break;
 881         case MSR_AMD64_NB_CFG:
 882                 break;
 883         case MSR_IA32_DEBUGCTLMSR:
 884                 if (!data) {
 885                         /* We support the non-activated case already */
 886                         break;
 887                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
 888                         /* Values other than LBR and BTF are vendor-specific,
 889                            thus reserved and should throw a #GP */
 890                         return 1;
 891                 }
 892                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
 893                         __func__, data);
 894                 break;
 895         case MSR_IA32_UCODE_REV:
 896         case MSR_IA32_UCODE_WRITE:
 897         case MSR_VM_HSAVE_PA:
 898         case MSR_AMD64_PATCH_LOADER:
 899                 break;
 900         case 0x200 ... 0x2ff:
 901                 return set_msr_mtrr(vcpu, msr, data);
 902         case MSR_IA32_APICBASE:
 903                 kvm_set_apic_base(vcpu, data);
 904                 break;
 905         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 906                 return kvm_x2apic_msr_write(vcpu, msr, data);
 907         case MSR_IA32_MISC_ENABLE:
 908                 vcpu->arch.ia32_misc_enable_msr = data;
 909                 break;
 910         case MSR_KVM_WALL_CLOCK:
 911                 vcpu->kvm->arch.wall_clock = data;
 912                 kvm_write_wall_clock(vcpu->kvm, data);
 913                 break;
 914         case MSR_KVM_SYSTEM_TIME: {
 915                 if (vcpu->arch.time_page) {
 916                         kvm_release_page_dirty(vcpu->arch.time_page);
 917                         vcpu->arch.time_page = NULL;
 918                 }
 919
 920                 vcpu->arch.time = data;
 921
 922                 /* we verify if the enable bit is set... */
 923                 if (!(data & 1))
 924                         break;
 925
 926                 /* ...but clean it before doing the actual write */
 927                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 928
 929                 vcpu->arch.time_page =
 930                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 931
 932                 if (is_error_page(vcpu->arch.time_page)) {
 933                         kvm_release_page_clean(vcpu->arch.time_page);
 934                         vcpu->arch.time_page = NULL;
 935                 }
 936
 937                 kvm_request_guest_time_update(vcpu);
 938                 break;
 939         }
 940         case MSR_IA32_MCG_CTL:
 941         case MSR_IA32_MCG_STATUS:
 942         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
 943                 return set_msr_mce(vcpu, msr, data);
 944
 945         /* Performance counters are not protected by a CPUID bit,
 946          * so we should check all of them in the generic path for the sake of
 947          * cross vendor migration.
 948          * Writing a zero into the event select MSRs disables them,
 949          * which we perfectly emulate ;-). Any other value should be at least
 950          * reported, some guests depend on them.
 951          */
 952         case MSR_P6_EVNTSEL0:
 953         case MSR_P6_EVNTSEL1:
 954         case MSR_K7_EVNTSEL0:
 955         case MSR_K7_EVNTSEL1:
 956         case MSR_K7_EVNTSEL2:
 957         case MSR_K7_EVNTSEL3:
 958                 if (data != 0)
 959                         pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
 960                                 "0x%x data 0x%llx\n", msr, data);
 961                 break;
 962         /* at least RHEL 4 unconditionally writes to the perfctr registers,
 963          * so we ignore writes to make it happy.
 964          */
 965         case MSR_P6_PERFCTR0:
 966         case MSR_P6_PERFCTR1:
 967         case MSR_K7_PERFCTR0:
 968         case MSR_K7_PERFCTR1:
 969         case MSR_K7_PERFCTR2:
 970         case MSR_K7_PERFCTR3:
 971                 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
 972                         "0x%x data 0x%llx\n", msr, data);
 973                 break;
 974         default:
 975                 if (!ignore_msrs) {
 976                         pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
 977                                 msr, data);
 978                         return 1;
 979                 } else {
 980                         pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
 981                                 msr, data);
 982                         break;
 983                 }
 984         }
 985         return 0;
 986 }
 987 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 988
 989
 990 /*
 991  * Reads an msr value (of 'msr_index') into 'pdata'.
 992  * Returns 0 on success, non-0 otherwise.
 993  * Assumes vcpu_load() was already called.
 994  */
 995 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 996 {
 997         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
 998 }
 999
1000 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1001 {
1002         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1003
1004         if (!msr_mtrr_valid(msr))
1005                 return 1;
1006
1007         if (msr == MSR_MTRRdefType)
1008                 *pdata = vcpu->arch.mtrr_state.def_type +
1009                          (vcpu->arch.mtrr_state.enabled << 10);
1010         else if (msr == MSR_MTRRfix64K_00000)
1011                 *pdata = p[0];
1012         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1013                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
1014         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1015                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
1016         else if (msr == MSR_IA32_CR_PAT)
1017                 *pdata = vcpu->arch.pat;
1018         else {  /* Variable MTRRs */
1019                 int idx, is_mtrr_mask;
1020                 u64 *pt;
1021
1022                 idx = (msr - 0x200) / 2;
1023                 is_mtrr_mask = msr - 0x200 - 2 * idx;
1024                 if (!is_mtrr_mask)
1025                         pt =
1026                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1027                 else
1028                         pt =
1029                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1030                 *pdata = *pt;
1031         }
1032
1033         return 0;
1034 }
1035
1036 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1037 {
1038         u64 data;
1039         u64 mcg_cap = vcpu->arch.mcg_cap;
1040         unsigned bank_num = mcg_cap & 0xff;
1041
1042         switch (msr) {
1043         case MSR_IA32_P5_MC_ADDR:
1044         case MSR_IA32_P5_MC_TYPE:
1045                 data = 0;
1046                 break;
1047         case MSR_IA32_MCG_CAP:
1048                 data = vcpu->arch.mcg_cap;
1049                 break;
1050         case MSR_IA32_MCG_CTL:
1051                 if (!(mcg_cap & MCG_CTL_P))
1052                         return 1;
1053                 data = vcpu->arch.mcg_ctl;
1054                 break;
1055         case MSR_IA32_MCG_STATUS:
1056                 data = vcpu->arch.mcg_status;
1057                 break;
1058         default:
1059                 if (msr >= MSR_IA32_MC0_CTL &&
1060                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1061                         u32 offset = msr - MSR_IA32_MC0_CTL;
1062                         data = vcpu->arch.mce_banks[offset];
1063                         break;
1064                 }
1065                 return 1;
1066         }
1067         *pdata = data;
1068         return 0;
1069 }
1070
1071 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1072 {
1073         u64 data;
1074
1075         switch (msr) {
1076         case MSR_IA32_PLATFORM_ID:
1077         case MSR_IA32_UCODE_REV:
1078         case MSR_IA32_EBL_CR_POWERON:
1079         case MSR_IA32_DEBUGCTLMSR:
1080         case MSR_IA32_LASTBRANCHFROMIP:
1081         case MSR_IA32_LASTBRANCHTOIP:
1082         case MSR_IA32_LASTINTFROMIP:
1083         case MSR_IA32_LASTINTTOIP:
1084         case MSR_K8_SYSCFG:
1085         case MSR_K7_HWCR:
1086         case MSR_VM_HSAVE_PA:
1087         case MSR_P6_PERFCTR0:
1088         case MSR_P6_PERFCTR1:
1089         case MSR_P6_EVNTSEL0:
1090         case MSR_P6_EVNTSEL1:
1091         case MSR_K7_EVNTSEL0:
1092         case MSR_K7_PERFCTR0:
1093         case MSR_K8_INT_PENDING_MSG:
1094         case MSR_AMD64_NB_CFG:
1095         case MSR_FAM10H_MMIO_CONF_BASE:
1096                 data = 0;
1097                 break;
1098         case MSR_MTRRcap:
1099                 data = 0x500 | KVM_NR_VAR_MTRR;
1100                 break;
1101         case 0x200 ... 0x2ff:
1102                 return get_msr_mtrr(vcpu, msr, pdata);
1103         case 0xcd: /* fsb frequency */
1104                 data = 3;
1105                 break;
1106         case MSR_IA32_APICBASE:
1107                 data = kvm_get_apic_base(vcpu);
1108                 break;
1109         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1110                 return kvm_x2apic_msr_read(vcpu, msr, pdata);
1111                 break;
1112         case MSR_IA32_MISC_ENABLE:
1113                 data = vcpu->arch.ia32_misc_enable_msr;
1114                 break;
1115         case MSR_IA32_PERF_STATUS:
1116                 /* TSC increment by tick */
1117                 data = 1000ULL;
1118                 /* CPU multiplier */
1119                 data |= (((uint64_t)4ULL) << 40);
1120                 break;
1121         case MSR_EFER:
1122                 data = vcpu->arch.shadow_efer;
1123                 break;
1124         case MSR_KVM_WALL_CLOCK:
1125                 data = vcpu->kvm->arch.wall_clock;
1126                 break;
1127         case MSR_KVM_SYSTEM_TIME:
1128                 data = vcpu->arch.time;
1129                 break;
1130         case MSR_IA32_P5_MC_ADDR:
1131         case MSR_IA32_P5_MC_TYPE:
1132         case MSR_IA32_MCG_CAP:
1133         case MSR_IA32_MCG_CTL:
1134         case MSR_IA32_MCG_STATUS:
1135         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1136                 return get_msr_mce(vcpu, msr, pdata);
1137         default:
1138                 if (!ignore_msrs) {
1139                         pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1140                         return 1;
1141                 } else {
1142                         pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
1143                         data = 0;
1144                 }
1145                 break;
1146         }
1147         *pdata = data;
1148         return 0;
1149 }
1150 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1151
1152 /*
1153  * Read or write a bunch of msrs. All parameters are kernel addresses.
1154  *
1155  * @return number of msrs set successfully.
1156  */
1157 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1158                     struct kvm_msr_entry *entries,
1159                     int (*do_msr)(struct kvm_vcpu *vcpu,
1160                                   unsigned index, u64 *data))
1161 {
1162         int i;
1163
1164         vcpu_load(vcpu);
1165
1166         down_read(&vcpu->kvm->slots_lock);
1167         for (i = 0; i < msrs->nmsrs; ++i)
1168                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1169                         break;
1170         up_read(&vcpu->kvm->slots_lock);
1171
1172         vcpu_put(vcpu);
1173
1174         return i;
1175 }
1176
1177 /*
1178  * Read or write a bunch of msrs. Parameters are user addresses.
1179  *
1180  * @return number of msrs set successfully.
1181  */
1182 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1183                   int (*do_msr)(struct kvm_vcpu *vcpu,
1184                                 unsigned index, u64 *data),
1185                   int writeback)
1186 {
1187         struct kvm_msrs msrs;
1188         struct kvm_msr_entry *entries;
1189         int r, n;
1190         unsigned size;
1191
1192         r = -EFAULT;
1193         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1194                 goto out;
1195
1196         r = -E2BIG;
1197         if (msrs.nmsrs >= MAX_IO_MSRS)
1198                 goto out;
1199
1200         r = -ENOMEM;
1201         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1202         entries = vmalloc(size);
1203         if (!entries)
1204                 goto out;
1205
1206         r = -EFAULT;
1207         if (copy_from_user(entries, user_msrs->entries, size))
1208                 goto out_free;
1209
1210         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1211         if (r < 0)
1212                 goto out_free;
1213
1214         r = -EFAULT;
1215         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1216                 goto out_free;
1217
1218         r = n;
1219
1220 out_free:
1221         vfree(entries);
1222 out:
1223         return r;
1224 }
1225
1226 int kvm_dev_ioctl_check_extension(long ext)
1227 {
1228         int r;
1229
1230         switch (ext) {
1231         case KVM_CAP_IRQCHIP:
1232         case KVM_CAP_HLT:
1233         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1234         case KVM_CAP_SET_TSS_ADDR:
1235         case KVM_CAP_EXT_CPUID:
1236         case KVM_CAP_CLOCKSOURCE:
1237         case KVM_CAP_PIT:
1238         case KVM_CAP_NOP_IO_DELAY:
1239         case KVM_CAP_MP_STATE:
1240         case KVM_CAP_SYNC_MMU:
1241         case KVM_CAP_REINJECT_CONTROL:
1242         case KVM_CAP_IRQ_INJECT_STATUS:
1243         case KVM_CAP_ASSIGN_DEV_IRQ:
1244         case KVM_CAP_IRQFD:
1245         case KVM_CAP_IOEVENTFD:
1246         case KVM_CAP_PIT2:
1247         case KVM_CAP_PIT_STATE2:
1248         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1249                 r = 1;
1250                 break;
1251         case KVM_CAP_COALESCED_MMIO:
1252                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1253                 break;
1254         case KVM_CAP_VAPIC:
1255                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1256                 break;
1257         case KVM_CAP_NR_VCPUS:
1258                 r = KVM_MAX_VCPUS;
1259                 break;
1260         case KVM_CAP_NR_MEMSLOTS:
1261                 r = KVM_MEMORY_SLOTS;
1262                 break;
1263         case KVM_CAP_PV_MMU:    /* obsolete */
1264                 r = 0;
1265                 break;
1266         case KVM_CAP_IOMMU:
1267                 r = iommu_found();
1268                 break;
1269         case KVM_CAP_MCE:
1270                 r = KVM_MAX_MCE_BANKS;
1271                 break;
1272         default:
1273                 r = 0;
1274                 break;
1275         }
1276         return r;
1277
1278 }
1279
1280 long kvm_arch_dev_ioctl(struct file *filp,
1281                         unsigned int ioctl, unsigned long arg)
1282 {
1283         void __user *argp = (void __user *)arg;
1284         long r;
1285
1286         switch (ioctl) {
1287         case KVM_GET_MSR_INDEX_LIST: {
1288                 struct kvm_msr_list __user *user_msr_list = argp;
1289                 struct kvm_msr_list msr_list;
1290                 unsigned n;
1291
1292                 r = -EFAULT;
1293                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1294                         goto out;
1295                 n = msr_list.nmsrs;
1296                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1297                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1298                         goto out;
1299                 r = -E2BIG;
1300                 if (n < msr_list.nmsrs)
1301                         goto out;
1302                 r = -EFAULT;
1303                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1304                                  num_msrs_to_save * sizeof(u32)))
1305                         goto out;
1306                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1307                                  &emulated_msrs,
1308                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1309                         goto out;
1310                 r = 0;
1311                 break;
1312         }
1313         case KVM_GET_SUPPORTED_CPUID: {
1314                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1315                 struct kvm_cpuid2 cpuid;
1316
1317                 r = -EFAULT;
1318                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1319                         goto out;
1320                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1321                                                       cpuid_arg->entries);
1322                 if (r)
1323                         goto out;
1324
1325                 r = -EFAULT;
1326                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1327                         goto out;
1328                 r = 0;
1329                 break;
1330         }
1331         case KVM_X86_GET_MCE_CAP_SUPPORTED: {
1332                 u64 mce_cap;
1333
1334                 mce_cap = KVM_MCE_CAP_SUPPORTED;
1335                 r = -EFAULT;
1336                 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
1337                         goto out;
1338                 r = 0;
1339                 break;
1340         }
1341         default:
1342                 r = -EINVAL;
1343         }
1344 out:
1345         return r;
1346 }
1347
1348 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1349 {
1350         kvm_x86_ops->vcpu_load(vcpu, cpu);
1351         if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1352                 unsigned long khz = cpufreq_quick_get(cpu);
1353                 if (!khz)
1354                         khz = tsc_khz;
1355                 per_cpu(cpu_tsc_khz, cpu) = khz;
1356         }
1357         kvm_request_guest_time_update(vcpu);
1358 }
1359
1360 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1361 {
1362         kvm_x86_ops->vcpu_put(vcpu);
1363         kvm_put_guest_fpu(vcpu);
1364 }
1365
1366 static int is_efer_nx(void)
1367 {
1368         unsigned long long efer = 0;
1369
1370         rdmsrl_safe(MSR_EFER, &efer);
1371         return efer & EFER_NX;
1372 }
1373
1374 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1375 {
1376         int i;
1377         struct kvm_cpuid_entry2 *e, *entry;
1378
1379         entry = NULL;
1380         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1381                 e = &vcpu->arch.cpuid_entries[i];
1382                 if (e->function == 0x80000001) {
1383                         entry = e;
1384                         break;
1385                 }
1386         }
1387         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1388                 entry->edx &= ~(1 << 20);
1389                 printk(KERN_INFO "kvm: guest NX capability removed\n");
1390         }
1391 }
1392
1393 /* when an old userspace process fills a new kernel module */
1394 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1395                                     struct kvm_cpuid *cpuid,
1396                                     struct kvm_cpuid_entry __user *entries)
1397 {
1398         int r, i;
1399         struct kvm_cpuid_entry *cpuid_entries;
1400
1401         r = -E2BIG;
1402         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1403                 goto out;
1404         r = -ENOMEM;
1405         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1406         if (!cpuid_entries)
1407                 goto out;
1408         r = -EFAULT;
1409         if (copy_from_user(cpuid_entries, entries,
1410                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1411                 goto out_free;
1412         for (i = 0; i < cpuid->nent; i++) {
1413                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1414                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1415                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1416                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1417                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1418                 vcpu->arch.cpuid_entries[i].index = 0;
1419                 vcpu->arch.cpuid_entries[i].flags = 0;
1420                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1421                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1422                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1423         }
1424         vcpu->arch.cpuid_nent = cpuid->nent;
1425         cpuid_fix_nx_cap(vcpu);
1426         r = 0;
1427         kvm_apic_set_version(vcpu);
1428
1429 out_free:
1430         vfree(cpuid_entries);
1431 out:
1432         return r;
1433 }
1434
1435 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1436                                      struct kvm_cpuid2 *cpuid,
1437                                      struct kvm_cpuid_entry2 __user *entries)
1438 {
1439         int r;
1440
1441         r = -E2BIG;
1442         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1443                 goto out;
1444         r = -EFAULT;
1445         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1446                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1447                 goto out;
1448         vcpu->arch.cpuid_nent = cpuid->nent;
1449         kvm_apic_set_version(vcpu);
1450         return 0;
1451
1452 out:
1453         return r;
1454 }
1455
1456 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1457                                      struct kvm_cpuid2 *cpuid,
1458                                      struct kvm_cpuid_entry2 __user *entries)
1459 {
1460         int r;
1461
1462         r = -E2BIG;
1463         if (cpuid->nent < vcpu->arch.cpuid_nent)
1464                 goto out;
1465         r = -EFAULT;
1466         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1467                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1468                 goto out;
1469         return 0;
1470
1471 out:
1472         cpuid->nent = vcpu->arch.cpuid_nent;
1473         return r;
1474 }
1475
1476 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1477                            u32 index)
1478 {
1479         entry->function = function;
1480         entry->index = index;
1481         cpuid_count(entry->function, entry->index,
1482                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1483         entry->flags = 0;
1484 }
1485
1486 #define F(x) bit(X86_FEATURE_##x)
1487
1488 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1489                          u32 index, int *nent, int maxnent)
1490 {
1491         unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1492         unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1493 #ifdef CONFIG_X86_64
1494         unsigned f_lm = F(LM);
1495 #else
1496         unsigned f_lm = 0;
1497 #endif
1498
1499         /* cpuid 1.edx */
1500         const u32 kvm_supported_word0_x86_features =
1501                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1502                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1503                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1504                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1505                 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1506                 0 /* Reserved, DS, ACPI */ | F(MMX) |
1507                 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1508                 0 /* HTT, TM, Reserved, PBE */;
1509         /* cpuid 0x80000001.edx */
1510         const u32 kvm_supported_word1_x86_features =
1511                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1512                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1513                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1514                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1515                 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1516                 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1517                 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
1518                 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1519         /* cpuid 1.ecx */
1520         const u32 kvm_supported_word4_x86_features =
1521                 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1522                 0 /* DS-CPL, VMX, SMX, EST */ |
1523                 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1524                 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1525                 0 /* Reserved, DCA */ | F(XMM4_1) |
1526                 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1527                 0 /* Reserved, XSAVE, OSXSAVE */;
1528         /* cpuid 0x80000001.ecx */
1529         const u32 kvm_supported_word6_x86_features =
1530                 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1531                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1532                 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1533                 0 /* SKINIT */ | 0 /* WDT */;
1534
1535         /* all calls to cpuid_count() should be made on the same cpu */
1536         get_cpu();
1537         do_cpuid_1_ent(entry, function, index);
1538         ++*nent;
1539
1540         switch (function) {
1541         case 0:
1542                 entry->eax = min(entry->eax, (u32)0xb);
1543                 break;
1544         case 1:
1545                 entry->edx &= kvm_supported_word0_x86_features;
1546                 entry->ecx &= kvm_supported_word4_x86_features;
1547                 /* we support x2apic emulation even if host does not support
1548                  * it since we emulate x2apic in software */
1549                 entry->ecx |= F(X2APIC);
1550                 break;
1551         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1552          * may return different values. This forces us to get_cpu() before
1553          * issuing the first command, and also to emulate this annoying behavior
1554          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1555         case 2: {
1556                 int t, times = entry->eax & 0xff;
1557
1558                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1559                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1560                 for (t = 1; t < times && *nent < maxnent; ++t) {
1561                         do_cpuid_1_ent(&entry[t], function, 0);
1562                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1563                         ++*nent;
1564                 }
1565                 break;
1566         }
1567         /* function 4 and 0xb have additional index. */
1568         case 4: {
1569                 int i, cache_type;
1570
1571                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1572                 /* read more entries until cache_type is zero */
1573                 for (i = 1; *nent < maxnent; ++i) {
1574                         cache_type = entry[i - 1].eax & 0x1f;
1575                         if (!cache_type)
1576                                 break;
1577                         do_cpuid_1_ent(&entry[i], function, i);
1578                         entry[i].flags |=
1579                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1580                         ++*nent;
1581                 }
1582                 break;
1583         }
1584         case 0xb: {
1585                 int i, level_type;
1586
1587                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1588                 /* read more entries until level_type is zero */
1589                 for (i = 1; *nent < maxnent; ++i) {
1590                         level_type = entry[i - 1].ecx & 0xff00;
1591                         if (!level_type)
1592                                 break;
1593                         do_cpuid_1_ent(&entry[i], function, i);
1594                         entry[i].flags |=
1595                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1596                         ++*nent;
1597                 }
1598                 break;
1599         }
1600         case 0x80000000:
1601                 entry->eax = min(entry->eax, 0x8000001a);
1602                 break;
1603         case 0x80000001:
1604                 entry->edx &= kvm_supported_word1_x86_features;
1605                 entry->ecx &= kvm_supported_word6_x86_features;
1606                 break;
1607         }
1608         put_cpu();
1609 }
1610
1611 #undef F
1612
1613 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1614                                      struct kvm_cpuid_entry2 __user *entries)
1615 {
1616         struct kvm_cpuid_entry2 *cpuid_entries;
1617         int limit, nent = 0, r = -E2BIG;
1618         u32 func;
1619
1620         if (cpuid->nent < 1)
1621                 goto out;
1622         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1623                 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1624         r = -ENOMEM;
1625         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1626         if (!cpuid_entries)
1627                 goto out;
1628
1629         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1630         limit = cpuid_entries[0].eax;
1631         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1632                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1633                              &nent, cpuid->nent);
1634         r = -E2BIG;
1635         if (nent >= cpuid->nent)
1636                 goto out_free;
1637
1638         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1639         limit = cpuid_entries[nent - 1].eax;
1640         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1641                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1642                              &nent, cpuid->nent);
1643         r = -E2BIG;
1644         if (nent >= cpuid->nent)
1645                 goto out_free;
1646
1647         r = -EFAULT;
1648         if (copy_to_user(entries, cpuid_entries,
1649                          nent * sizeof(struct kvm_cpuid_entry2)))
1650                 goto out_free;
1651         cpuid->nent = nent;
1652         r = 0;
1653
1654 out_free:
1655         vfree(cpuid_entries);
1656 out:
1657         return r;
1658 }
1659
1660 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1661                                     struct kvm_lapic_state *s)
1662 {
1663         vcpu_load(vcpu);
1664         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1665         vcpu_put(vcpu);
1666
1667         return 0;
1668 }
1669
1670 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1671                                     struct kvm_lapic_state *s)
1672 {
1673         vcpu_load(vcpu);
1674         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1675         kvm_apic_post_state_restore(vcpu);
1676         update_cr8_intercept(vcpu);
1677         vcpu_put(vcpu);
1678
1679         return 0;
1680 }
1681
1682 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1683                                     struct kvm_interrupt *irq)
1684 {
1685         if (irq->irq < 0 || irq->irq >= 256)
1686                 return -EINVAL;
1687         if (irqchip_in_kernel(vcpu->kvm))
1688                 return -ENXIO;
1689         vcpu_load(vcpu);
1690
1691         kvm_queue_interrupt(vcpu, irq->irq, false);
1692
1693         vcpu_put(vcpu);
1694
1695         return 0;
1696 }
1697
/*
 * Inject an NMI into @vcpu with the vcpu loaded around the injection.
 * Always returns 0.
 */
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
	const int r = 0;

	vcpu_load(vcpu);
	kvm_inject_nmi(vcpu);
	vcpu_put(vcpu);

	return r;
}
1706
1707 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1708                                            struct kvm_tpr_access_ctl *tac)
1709 {
1710         if (tac->flags)
1711                 return -EINVAL;
1712         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1713         return 0;
1714 }
1715
1716 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
1717                                         u64 mcg_cap)
1718 {
1719         int r;
1720         unsigned bank_num = mcg_cap & 0xff, bank;
1721
1722         r = -EINVAL;
1723         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
1724                 goto out;
1725         if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1726                 goto out;
1727         r = 0;
1728         vcpu->arch.mcg_cap = mcg_cap;
1729         /* Init IA32_MCG_CTL to all 1s */
1730         if (mcg_cap & MCG_CTL_P)
1731                 vcpu->arch.mcg_ctl = ~(u64)0;
1732         /* Init IA32_MCi_CTL to all 1s */
1733         for (bank = 0; bank < bank_num; bank++)
1734                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
1735 out:
1736         return r;
1737 }
1738
1739 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1740                                       struct kvm_x86_mce *mce)
1741 {
1742         u64 mcg_cap = vcpu->arch.mcg_cap;
1743         unsigned bank_num = mcg_cap & 0xff;
1744         u64 *banks = vcpu->arch.mce_banks;
1745
1746         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
1747                 return -EINVAL;
1748         /*
1749          * if IA32_MCG_CTL is not all 1s, the uncorrected error
1750          * reporting is disabled
1751          */
1752         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
1753             vcpu->arch.mcg_ctl != ~(u64)0)
1754                 return 0;
1755         banks += 4 * mce->bank;
1756         /*
1757          * if IA32_MCi_CTL is not all 1s, the uncorrected error
1758          * reporting is disabled for the bank
1759          */
1760         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
1761                 return 0;
1762         if (mce->status & MCI_STATUS_UC) {
1763                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1764                     !(vcpu->arch.cr4 & X86_CR4_MCE)) {
1765                         printk(KERN_DEBUG "kvm: set_mce: "
1766                                "injects mce exception while "
1767                                "previous one is in progress!\n");
1768                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1769                         return 0;
1770                 }
1771                 if (banks[1] & MCI_STATUS_VAL)
1772                         mce->status |= MCI_STATUS_OVER;
1773                 banks[2] = mce->addr;
1774                 banks[3] = mce->misc;
1775                 vcpu->arch.mcg_status = mce->mcg_status;
1776                 banks[1] = mce->status;
1777                 kvm_queue_exception(vcpu, MC_VECTOR);
1778         } else if (!(banks[1] & MCI_STATUS_VAL)
1779                    || !(banks[1] & MCI_STATUS_UC)) {
1780                 if (banks[1] & MCI_STATUS_VAL)
1781                         mce->status |= MCI_STATUS_OVER;
1782                 banks[2] = mce->addr;
1783                 banks[3] = mce->misc;
1784                 banks[1] = mce->status;
1785         } else
1786                 banks[1] |= MCI_STATUS_OVER;
1787         return 0;
1788 }
1789
1790 long kvm_arch_vcpu_ioctl(struct file *filp,
1791                          unsigned int ioctl, unsigned long arg)
1792 {
1793         struct kvm_vcpu *vcpu = filp->private_data;
1794         void __user *argp = (void __user *)arg;
1795         int r;
1796         struct kvm_lapic_state *lapic = NULL;
1797
1798         switch (ioctl) {
1799         case KVM_GET_LAPIC: {
1800                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1801
1802                 r = -ENOMEM;
1803                 if (!lapic)
1804                         goto out;
1805                 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1806                 if (r)
1807                         goto out;
1808                 r = -EFAULT;
1809                 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1810                         goto out;
1811                 r = 0;
1812                 break;
1813         }
1814         case KVM_SET_LAPIC: {
1815                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1816                 r = -ENOMEM;
1817                 if (!lapic)
1818                         goto out;
1819                 r = -EFAULT;
1820                 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1821                         goto out;
1822                 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1823                 if (r)
1824                         goto out;
1825                 r = 0;
1826                 break;
1827         }
1828         case KVM_INTERRUPT: {
1829                 struct kvm_interrupt irq;
1830
1831                 r = -EFAULT;
1832                 if (copy_from_user(&irq, argp, sizeof irq))
1833                         goto out;
1834                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1835                 if (r)
1836                         goto out;
1837                 r = 0;
1838                 break;
1839         }
1840         case KVM_NMI: {
1841                 r = kvm_vcpu_ioctl_nmi(vcpu);
1842                 if (r)
1843                         goto out;
1844                 r = 0;
1845                 break;
1846         }
1847         case KVM_SET_CPUID: {
1848                 struct kvm_cpuid __user *cpuid_arg = argp;
1849                 struct kvm_cpuid cpuid;
1850
1851                 r = -EFAULT;
1852                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1853                         goto out;
1854                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1855                 if (r)
1856                         goto out;
1857                 break;
1858         }
1859         case KVM_SET_CPUID2: {
1860                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1861                 struct kvm_cpuid2 cpuid;
1862
1863                 r = -EFAULT;
1864                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1865                         goto out;
1866                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1867                                               cpuid_arg->entries);
1868                 if (r)
1869                         goto out;
1870                 break;
1871         }
1872         case KVM_GET_CPUID2: {
1873                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1874                 struct kvm_cpuid2 cpuid;
1875
1876                 r = -EFAULT;
1877                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1878                         goto out;
1879                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1880                                               cpuid_arg->entries);
1881                 if (r)
1882                         goto out;
1883                 r = -EFAULT;
1884                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1885                         goto out;
1886                 r = 0;
1887                 break;
1888         }
1889         case KVM_GET_MSRS:
1890                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1891                 break;
1892         case KVM_SET_MSRS:
1893                 r = msr_io(vcpu, argp, do_set_msr, 0);
1894                 break;
1895         case KVM_TPR_ACCESS_REPORTING: {
1896                 struct kvm_tpr_access_ctl tac;
1897
1898                 r = -EFAULT;
1899                 if (copy_from_user(&tac, argp, sizeof tac))
1900                         goto out;
1901                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1902                 if (r)
1903                         goto out;
1904                 r = -EFAULT;
1905                 if (copy_to_user(argp, &tac, sizeof tac))
1906                         goto out;
1907                 r = 0;
1908                 break;
1909         };
1910         case KVM_SET_VAPIC_ADDR: {
1911                 struct kvm_vapic_addr va;
1912
1913                 r = -EINVAL;
1914                 if (!irqchip_in_kernel(vcpu->kvm))
1915                         goto out;
1916                 r = -EFAULT;
1917                 if (copy_from_user(&va, argp, sizeof va))
1918                         goto out;
1919                 r = 0;
1920                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1921                 break;
1922         }
1923         case KVM_X86_SETUP_MCE: {
1924                 u64 mcg_cap;
1925
1926                 r = -EFAULT;
1927                 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
1928                         goto out;
1929                 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
1930                 break;
1931         }
1932         case KVM_X86_SET_MCE: {
1933                 struct kvm_x86_mce mce;
1934
1935                 r = -EFAULT;
1936                 if (copy_from_user(&mce, argp, sizeof mce))
1937                         goto out;
1938                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1939                 break;
1940         }
1941         default:
1942                 r = -EINVAL;
1943         }
1944 out:
1945         kfree(lapic);
1946         return r;
1947 }
1948
1949 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1950 {
1951         int ret;
1952
1953         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1954                 return -1;
1955         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1956         return ret;
1957 }
1958
1959 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
1960                                               u64 ident_addr)
1961 {
1962         kvm->arch.ept_identity_map_addr = ident_addr;
1963         return 0;
1964 }
1965
1966 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1967                                           u32 kvm_nr_mmu_pages)
1968 {
1969         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1970                 return -EINVAL;
1971
1972         down_write(&kvm->slots_lock);
1973         spin_lock(&kvm->mmu_lock);
1974
1975         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1976         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1977
1978         spin_unlock(&kvm->mmu_lock);
1979         up_write(&kvm->slots_lock);
1980         return 0;
1981 }
1982
1983 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1984 {
1985         return kvm->arch.n_alloc_mmu_pages;
1986 }
1987
1988 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1989 {
1990         int i;
1991         struct kvm_mem_alias *alias;
1992
1993         for (i = 0; i < kvm->arch.naliases; ++i) {
1994                 alias = &kvm->arch.aliases[i];
1995                 if (gfn >= alias->base_gfn
1996                     && gfn < alias->base_gfn + alias->npages)
1997                         return alias->target_gfn + gfn - alias->base_gfn;
1998         }
1999         return gfn;
2000 }
2001
2002 /*
2003  * Set a new alias region.  Aliases map a portion of physical memory into
2004  * another portion.  This is useful for memory windows, for example the PC
2005  * VGA region.
2006  */
2007 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2008                                          struct kvm_memory_alias *alias)
2009 {
2010         int r, n;
2011         struct kvm_mem_alias *p;
2012
2013         r = -EINVAL;
2014         /* General sanity checks */
2015         if (alias->memory_size & (PAGE_SIZE - 1))
2016                 goto out;
2017         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
2018                 goto out;
2019         if (alias->slot >= KVM_ALIAS_SLOTS)
2020                 goto out;
2021         if (alias->guest_phys_addr + alias->memory_size
2022             < alias->guest_phys_addr)
2023                 goto out;
2024         if (alias->target_phys_addr + alias->memory_size
2025             < alias->target_phys_addr)
2026                 goto out;
2027
2028         down_write(&kvm->slots_lock);
2029         spin_lock(&kvm->mmu_lock);
2030
2031         p = &kvm->arch.aliases[alias->slot];
2032         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2033         p->npages = alias->memory_size >> PAGE_SHIFT;
2034         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2035
2036         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2037                 if (kvm->arch.aliases[n - 1].npages)
2038                         break;
2039         kvm->arch.naliases = n;
2040
2041         spin_unlock(&kvm->mmu_lock);
2042         kvm_mmu_zap_all(kvm);
2043
2044         up_write(&kvm->slots_lock);
2045
2046         return 0;
2047
2048 out:
2049         return r;
2050 }
2051
2052 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2053 {
2054         int r;
2055
2056         r = 0;
2057         switch (chip->chip_id) {
2058         case KVM_IRQCHIP_PIC_MASTER:
2059                 memcpy(&chip->chip.pic,
2060                         &pic_irqchip(kvm)->pics[0],
2061                         sizeof(struct kvm_pic_state));
2062                 break;
2063         case KVM_IRQCHIP_PIC_SLAVE:
2064                 memcpy(&chip->chip.pic,
2065                         &pic_irqchip(kvm)->pics[1],
2066                         sizeof(struct kvm_pic_state));
2067                 break;
2068         case KVM_IRQCHIP_IOAPIC:
2069                 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2070                 break;
2071         default:
2072                 r = -EINVAL;
2073                 break;
2074         }
2075         return r;
2076 }
2077
2078 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2079 {
2080         int r;
2081
2082         r = 0;
2083         switch (chip->chip_id) {
2084         case KVM_IRQCHIP_PIC_MASTER:
2085                 spin_lock(&pic_irqchip(kvm)->lock);
2086                 memcpy(&pic_irqchip(kvm)->pics[0],
2087                         &chip->chip.pic,
2088                         sizeof(struct kvm_pic_state));
2089                 spin_unlock(&pic_irqchip(kvm)->lock);
2090                 break;
2091         case KVM_IRQCHIP_PIC_SLAVE:
2092                 spin_lock(&pic_irqchip(kvm)->lock);
2093                 memcpy(&pic_irqchip(kvm)->pics[1],
2094                         &chip->chip.pic,
2095                         sizeof(struct kvm_pic_state));
2096                 spin_unlock(&pic_irqchip(kvm)->lock);
2097                 break;
2098         case KVM_IRQCHIP_IOAPIC:
2099                 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2100                 break;
2101         default:
2102                 r = -EINVAL;
2103                 break;
2104         }
2105         kvm_pic_update_irq(pic_irqchip(kvm));
2106         return r;
2107 }
2108
2109 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2110 {
2111         int r = 0;
2112
2113         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2114         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
2115         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2116         return r;
2117 }
2118
2119 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2120 {
2121         int r = 0;
2122
2123         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2124         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
2125         kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
2126         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2127         return r;
2128 }
2129
2130 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2131 {
2132         int r = 0;
2133
2134         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2135         memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
2136                 sizeof(ps->channels));
2137         ps->flags = kvm->arch.vpit->pit_state.flags;
2138         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2139         return r;
2140 }
2141
2142 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2143 {
2144         int r = 0, start = 0;
2145         u32 prev_legacy, cur_legacy;
2146         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2147         prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
2148         cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
2149         if (!prev_legacy && cur_legacy)
2150                 start = 1;
2151         memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
2152                sizeof(kvm->arch.vpit->pit_state.channels));
2153         kvm->arch.vpit->pit_state.flags = ps->flags;
2154         kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
2155         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2156         return r;
2157 }
2158
2159 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2160                                  struct kvm_reinject_control *control)
2161 {
2162         if (!kvm->arch.vpit)
2163                 return -ENXIO;
2164         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2165         kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
2166         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2167         return 0;
2168 }
2169
2170 /*
2171  * Get (and clear) the dirty memory log for a memory slot.
2172  */
2173 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2174                                       struct kvm_dirty_log *log)
2175 {
2176         int r;
2177         int n;
2178         struct kvm_memory_slot *memslot;
2179         int is_dirty = 0;
2180
2181         down_write(&kvm->slots_lock);
2182
2183         r = kvm_get_dirty_log(kvm, log, &is_dirty);
2184         if (r)
2185                 goto out;
2186
2187         /* If nothing is dirty, don't bother messing with page tables. */
2188         if (is_dirty) {
2189                 spin_lock(&kvm->mmu_lock);
2190                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2191                 spin_unlock(&kvm->mmu_lock);
2192                 memslot = &kvm->memslots[log->slot];
2193                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
2194                 memset(memslot->dirty_bitmap, 0, n);
2195         }
2196         r = 0;
2197 out:
2198         up_write(&kvm->slots_lock);
2199         return r;
2200 }
2201
2202 long kvm_arch_vm_ioctl(struct file *filp,
2203                        unsigned int ioctl, unsigned long arg)
2204 {
2205         struct kvm *kvm = filp->private_data;
2206         void __user *argp = (void __user *)arg;
2207         int r = -ENOTTY;
2208         /*
2209          * This union makes it completely explicit to gcc-3.x
2210          * that these two variables' stack usage should be
2211          * combined, not added together.
2212          */
2213         union {
2214                 struct kvm_pit_state ps;
2215                 struct kvm_pit_state2 ps2;
2216                 struct kvm_memory_alias alias;
2217                 struct kvm_pit_config pit_config;
2218         } u;
2219
2220         switch (ioctl) {
2221         case KVM_SET_TSS_ADDR:
2222                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
2223                 if (r < 0)
2224                         goto out;
2225                 break;
2226         case KVM_SET_IDENTITY_MAP_ADDR: {
2227                 u64 ident_addr;
2228
2229                 r = -EFAULT;
2230                 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
2231                         goto out;
2232                 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
2233                 if (r < 0)
2234                         goto out;
2235                 break;
2236         }
2237         case KVM_SET_MEMORY_REGION: {
2238                 struct kvm_memory_region kvm_mem;
2239                 struct kvm_userspace_memory_region kvm_userspace_mem;
2240
2241                 r = -EFAULT;
2242                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2243                         goto out;
2244                 kvm_userspace_mem.slot = kvm_mem.slot;
2245                 kvm_userspace_mem.flags = kvm_mem.flags;
2246                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2247                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2248                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2249                 if (r)
2250                         goto out;
2251                 break;
2252         }
2253         case KVM_SET_NR_MMU_PAGES:
2254                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2255                 if (r)
2256                         goto out;
2257                 break;
2258         case KVM_GET_NR_MMU_PAGES:
2259                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2260                 break;
2261         case KVM_SET_MEMORY_ALIAS:
2262                 r = -EFAULT;
2263                 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2264                         goto out;
2265                 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2266                 if (r)
2267                         goto out;
2268                 break;
2269         case KVM_CREATE_IRQCHIP:
2270                 r = -ENOMEM;
2271                 kvm->arch.vpic = kvm_create_pic(kvm);
2272                 if (kvm->arch.vpic) {
2273                         r = kvm_ioapic_init(kvm);
2274                         if (r) {
2275                                 kfree(kvm->arch.vpic);
2276                                 kvm->arch.vpic = NULL;
2277                                 goto out;
2278                         }
2279                 } else
2280                         goto out;
2281                 r = kvm_setup_default_irq_routing(kvm);
2282                 if (r) {
2283                         kfree(kvm->arch.vpic);
2284                         kfree(kvm->arch.vioapic);
2285                         goto out;
2286                 }
2287                 break;
2288         case KVM_CREATE_PIT:
2289                 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2290                 goto create_pit;
2291         case KVM_CREATE_PIT2:
2292                 r = -EFAULT;
2293                 if (copy_from_user(&u.pit_config, argp,
2294                                    sizeof(struct kvm_pit_config)))
2295                         goto out;
2296         create_pit:
2297                 down_write(&kvm->slots_lock);
2298                 r = -EEXIST;
2299                 if (kvm->arch.vpit)
2300                         goto create_pit_unlock;
2301                 r = -ENOMEM;
2302                 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
2303                 if (kvm->arch.vpit)
2304                         r = 0;
2305         create_pit_unlock:
2306                 up_write(&kvm->slots_lock);
2307                 break;
2308         case KVM_IRQ_LINE_STATUS:
2309         case KVM_IRQ_LINE: {
2310                 struct kvm_irq_level irq_event;
2311
2312                 r = -EFAULT;
2313                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2314                         goto out;
2315                 if (irqchip_in_kernel(kvm)) {
2316                         __s32 status;
2317                         status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2318                                         irq_event.irq, irq_event.level);
2319                         if (ioctl == KVM_IRQ_LINE_STATUS) {
2320                                 irq_event.status = status;
2321                                 if (copy_to_user(argp, &irq_event,
2322                                                         sizeof irq_event))
2323                                         goto out;
2324                         }
2325                         r = 0;
2326                 }
2327                 break;
2328         }
2329         case KVM_GET_IRQCHIP: {
2330                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2331                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2332
2333                 r = -ENOMEM;
2334                 if (!chip)
2335                         goto out;
2336                 r = -EFAULT;
2337                 if (copy_from_user(chip, argp, sizeof *chip))
2338                         goto get_irqchip_out;
2339                 r = -ENXIO;
2340                 if (!irqchip_in_kernel(kvm))
2341                         goto get_irqchip_out;
2342                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
2343                 if (r)
2344                         goto get_irqchip_out;
2345                 r = -EFAULT;
2346                 if (copy_to_user(argp, chip, sizeof *chip))
2347                         goto get_irqchip_out;
2348                 r = 0;
2349         get_irqchip_out:
2350                 kfree(chip);
2351                 if (r)
2352                         goto out;
2353                 break;
2354         }
2355         case KVM_SET_IRQCHIP: {
2356                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2357                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2358
2359                 r = -ENOMEM;
2360                 if (!chip)
2361                         goto out;
2362                 r = -EFAULT;
2363                 if (copy_from_user(chip, argp, sizeof *chip))
2364                         goto set_irqchip_out;
2365                 r = -ENXIO;
2366                 if (!irqchip_in_kernel(kvm))
2367                         goto set_irqchip_out;
2368                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
2369                 if (r)
2370                         goto set_irqchip_out;
2371                 r = 0;
2372         set_irqchip_out:
2373                 kfree(chip);
2374                 if (r)
2375                         goto out;
2376                 break;
2377         }
2378         case KVM_GET_PIT: {
2379                 r = -EFAULT;
2380                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
2381                         goto out;
2382                 r = -ENXIO;
2383                 if (!kvm->arch.vpit)
2384                         goto out;
2385                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
2386                 if (r)
2387                         goto out;
2388                 r = -EFAULT;
2389                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
2390                         goto out;
2391                 r = 0;
2392                 break;
2393         }
2394         case KVM_SET_PIT: {
2395                 r = -EFAULT;
2396                 if (copy_from_user(&u.ps, argp, sizeof u.ps))
2397                         goto out;
2398                 r = -ENXIO;
2399                 if (!kvm->arch.vpit)
2400                         goto out;
2401                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
2402                 if (r)
2403                         goto out;
2404                 r = 0;
2405                 break;
2406         }
2407         case KVM_GET_PIT2: {
2408                 r = -ENXIO;
2409                 if (!kvm->arch.vpit)
2410                         goto out;
2411                 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
2412                 if (r)
2413                         goto out;
2414                 r = -EFAULT;
2415                 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
2416                         goto out;
2417                 r = 0;
2418                 break;
2419         }
2420         case KVM_SET_PIT2: {
2421                 r = -EFAULT;
2422                 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
2423                         goto out;
2424                 r = -ENXIO;
2425                 if (!kvm->arch.vpit)
2426                         goto out;
2427                 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
2428                 if (r)
2429                         goto out;
2430                 r = 0;
2431                 break;
2432         }
2433         case KVM_REINJECT_CONTROL: {
2434                 struct kvm_reinject_control control;
2435                 r =  -EFAULT;
2436                 if (copy_from_user(&control, argp, sizeof(control)))
2437                         goto out;
2438                 r = kvm_vm_ioctl_reinject(kvm, &control);
2439                 if (r)
2440                         goto out;
2441                 r = 0;
2442                 break;
2443         }
2444         default:
2445                 ;
2446         }
2447 out:
2448         return r;
2449 }
2450
2451 static void kvm_init_msr_list(void)
2452 {
2453         u32 dummy[2];
2454         unsigned i, j;
2455
2456         /* skip the first msrs in the list. KVM-specific */
2457         for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2458                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2459                         continue;
2460                 if (j < i)
2461                         msrs_to_save[j] = msrs_to_save[i];
2462                 j++;
2463         }
2464         num_msrs_to_save = j;
2465 }
2466
2467 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2468                            const void *v)
2469 {
2470         if (vcpu->arch.apic &&
2471             !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2472                 return 0;
2473
2474         return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
2475 }
2476
2477 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2478 {
2479         if (vcpu->arch.apic &&
2480             !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2481                 return 0;
2482
2483         return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
2484 }
2485
2486 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2487                                struct kvm_vcpu *vcpu)
2488 {
2489         void *data = val;
2490         int r = X86EMUL_CONTINUE;
2491
2492         while (bytes) {
2493                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2494                 unsigned offset = addr & (PAGE_SIZE-1);
2495                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2496                 int ret;
2497
2498                 if (gpa == UNMAPPED_GVA) {
2499                         r = X86EMUL_PROPAGATE_FAULT;
2500                         goto out;
2501                 }
2502                 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2503                 if (ret < 0) {
2504                         r = X86EMUL_UNHANDLEABLE;
2505                         goto out;
2506                 }
2507
2508                 bytes -= toread;
2509                 data += toread;
2510                 addr += toread;
2511         }
2512 out:
2513         return r;
2514 }
2515
2516 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2517                                 struct kvm_vcpu *vcpu)
2518 {
2519         void *data = val;
2520         int r = X86EMUL_CONTINUE;
2521
2522         while (bytes) {
2523                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2524                 unsigned offset = addr & (PAGE_SIZE-1);
2525                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2526                 int ret;
2527
2528                 if (gpa == UNMAPPED_GVA) {
2529                         r = X86EMUL_PROPAGATE_FAULT;
2530                         goto out;
2531                 }
2532                 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2533                 if (ret < 0) {
2534                         r = X86EMUL_UNHANDLEABLE;
2535                         goto out;
2536                 }
2537
2538                 bytes -= towrite;
2539                 data += towrite;
2540                 addr += towrite;
2541         }
2542 out:
2543         return r;
2544 }
2545
2546
2547 static int emulator_read_emulated(unsigned long addr,
2548                                   void *val,
2549                                   unsigned int bytes,
2550                                   struct kvm_vcpu *vcpu)
2551 {
2552         gpa_t                 gpa;
2553
2554         if (vcpu->mmio_read_completed) {
2555                 memcpy(val, vcpu->mmio_data, bytes);
2556                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
2557                                vcpu->mmio_phys_addr, *(u64 *)val);
2558                 vcpu->mmio_read_completed = 0;
2559                 return X86EMUL_CONTINUE;
2560         }
2561
2562         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2563
2564         /* For APIC access vmexit */
2565         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2566                 goto mmio;
2567
2568         if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2569                                 == X86EMUL_CONTINUE)
2570                 return X86EMUL_CONTINUE;
2571         if (gpa == UNMAPPED_GVA)
2572                 return X86EMUL_PROPAGATE_FAULT;
2573
2574 mmio:
2575         /*
2576          * Is this MMIO handled locally?
2577          */
2578         if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
2579                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
2580                 return X86EMUL_CONTINUE;
2581         }
2582
2583         trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
2584
2585         vcpu->mmio_needed = 1;
2586         vcpu->mmio_phys_addr = gpa;
2587         vcpu->mmio_size = bytes;
2588         vcpu->mmio_is_write = 0;
2589
2590         return X86EMUL_UNHANDLEABLE;
2591 }
2592
2593 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2594                           const void *val, int bytes)
2595 {
2596         int ret;
2597
2598         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2599         if (ret < 0)
2600                 return 0;
2601         kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2602         return 1;
2603 }
2604
2605 static int emulator_write_emulated_onepage(unsigned long addr,
2606                                            const void *val,
2607                                            unsigned int bytes,
2608                                            struct kvm_vcpu *vcpu)
2609 {
2610         gpa_t                 gpa;
2611
2612         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2613
2614         if (gpa == UNMAPPED_GVA) {
2615                 kvm_inject_page_fault(vcpu, addr, 2);
2616                 return X86EMUL_PROPAGATE_FAULT;
2617         }
2618
2619         /* For APIC access vmexit */
2620         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2621                 goto mmio;
2622
2623         if (emulator_write_phys(vcpu, gpa, val, bytes))
2624                 return X86EMUL_CONTINUE;
2625
2626 mmio:
2627         trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
2628         /*
2629          * Is this MMIO handled locally?
2630          */
2631         if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
2632                 return X86EMUL_CONTINUE;
2633
2634         vcpu->mmio_needed = 1;
2635         vcpu->mmio_phys_addr = gpa;
2636         vcpu->mmio_size = bytes;
2637         vcpu->mmio_is_write = 1;
2638         memcpy(vcpu->mmio_data, val, bytes);
2639
2640         return X86EMUL_CONTINUE;
2641 }
2642
2643 int emulator_write_emulated(unsigned long addr,
2644                                    const void *val,
2645                                    unsigned int bytes,
2646                                    struct kvm_vcpu *vcpu)
2647 {
2648         /* Crossing a page boundary? */
2649         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2650                 int rc, now;
2651
2652                 now = -addr & ~PAGE_MASK;
2653                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2654                 if (rc != X86EMUL_CONTINUE)
2655                         return rc;
2656                 addr += now;
2657                 val += now;
2658                 bytes -= now;
2659         }
2660         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2661 }
2662 EXPORT_SYMBOL_GPL(emulator_write_emulated);
2663
/*
 * Emulator callback for a guest compare-and-exchange.
 *
 * The exchange is not actually performed as a compare-and-exchange:
 * @old is never examined and @new is simply written with
 * emulator_write_emulated() (a one-time warning says so).
 *
 * When CONFIG_X86_64 is not defined, an 8-byte operand that is mapped,
 * is outside the page at APIC_DEFAULT_PHYS_BASE and does not cross a
 * page boundary is first stored with set_64bit() through a temporary
 * kernel mapping of the guest page.  The regular emulated write below
 * is still performed afterwards.
 *
 * Returns whatever emulator_write_emulated() returns.
 */
static int emulator_cmpxchg_emulated(unsigned long addr,
                                     const void *old,
                                     const void *new,
                                     unsigned int bytes,
                                     struct kvm_vcpu *vcpu)
{
        printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
#ifndef CONFIG_X86_64
        /* guests cmpxchg8b have to be emulated atomically */
        if (bytes == 8) {
                gpa_t gpa;
                struct page *page;
                char *kaddr;
                u64 val;

                gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

                /* Unmapped or APIC page: leave it to the generic path. */
                if (gpa == UNMAPPED_GVA ||
                   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
                        goto emul_write;

                /* The operand must not straddle two pages. */
                if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
                        goto emul_write;

                val = *(u64 *)new;

                page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);

                /* Store all 64 bits in one go through a temporary mapping. */
                kaddr = kmap_atomic(page, KM_USER0);
                set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
                kunmap_atomic(kaddr, KM_USER0);
                kvm_release_page_dirty(page);
        }
emul_write:
#endif

        return emulator_write_emulated(addr, new, bytes, vcpu);
}
2702
2703 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2704 {
2705         return kvm_x86_ops->get_segment_base(vcpu, seg);
2706 }
2707
2708 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2709 {
2710         kvm_mmu_invlpg(vcpu, address);
2711         return X86EMUL_CONTINUE;
2712 }
2713
2714 int emulate_clts(struct kvm_vcpu *vcpu)
2715 {
2716         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2717         return X86EMUL_CONTINUE;
2718 }
2719
2720 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2721 {
2722         struct kvm_vcpu *vcpu = ctxt->vcpu;
2723
2724         switch (dr) {
2725         case 0 ... 3:
2726                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2727                 return X86EMUL_CONTINUE;
2728         default:
2729                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2730                 return X86EMUL_UNHANDLEABLE;
2731         }
2732 }
2733
2734 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2735 {
2736         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2737         int exception;
2738
2739         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2740         if (exception) {
2741                 /* FIXME: better handling */
2742                 return X86EMUL_UNHANDLEABLE;
2743         }
2744         return X86EMUL_CONTINUE;
2745 }
2746
2747 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2748 {
2749         u8 opcodes[4];
2750         unsigned long rip = kvm_rip_read(vcpu);
2751         unsigned long rip_linear;
2752
2753         if (!printk_ratelimit())
2754                 return;
2755
2756         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2757
2758         kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2759
2760         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2761                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2762 }
2763 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2764
/*
 * Memory-access callbacks passed to x86_decode_insn() and
 * x86_emulate_insn() by emulate_instruction().
 */
static struct x86_emulate_ops emulate_ops = {
        .read_std            = kvm_read_guest_virt,
        .read_emulated       = emulator_read_emulated,
        .write_emulated      = emulator_write_emulated,
        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
};
2771
2772 static void cache_all_regs(struct kvm_vcpu *vcpu)
2773 {
2774         kvm_register_read(vcpu, VCPU_REGS_RAX);
2775         kvm_register_read(vcpu, VCPU_REGS_RSP);
2776         kvm_register_read(vcpu, VCPU_REGS_RIP);
2777         vcpu->arch.regs_dirty = ~0;
2778 }
2779
/*
 * Decode (unless EMULTYPE_NO_DECODE is set) and emulate one guest
 * instruction.
 *
 * @cr2:            stored in vcpu->arch.mmio_fault_cr2 and passed to
 *                  kvm_mmu_unprotect_page_virt() when decoding or
 *                  emulation fails
 * @error_code:     not used by this function
 * @emulation_type: EMULTYPE_* flags (NO_DECODE, TRAP_UD, SKIP)
 *
 * Returns EMULATE_DONE, EMULATE_DO_MMIO or EMULATE_FAIL.
 */
int emulate_instruction(struct kvm_vcpu *vcpu,
                        unsigned long cr2,
                        u16 error_code,
                        int emulation_type)
{
        int r, shadow_mask;
        struct decode_cache *c;
        struct kvm_run *run = vcpu->run;

        kvm_clear_exception_queue(vcpu);
        vcpu->arch.mmio_fault_cr2 = cr2;
        /*
         * TODO: fix emulate.c to use guest_read/write_register
         * instead of direct ->regs accesses, can save hundred cycles
         * on Intel for instructions that don't read/change RSP, for
         * example.
         */
        cache_all_regs(vcpu);

        vcpu->mmio_is_write = 0;
        vcpu->arch.pio.string = 0;

        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                int cs_db, cs_l;
                kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

                vcpu->arch.emulate_ctxt.vcpu = vcpu;
                vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
                /*
                 * Mode selection: EFLAGS.VM set -> REAL, else cs_l ->
                 * PROT64, else cs_db -> PROT32, else PROT16.
                 */
                vcpu->arch.emulate_ctxt.mode =
                        (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
                        ? X86EMUL_MODE_REAL : cs_l
                        ? X86EMUL_MODE_PROT64 : cs_db
                        ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;

                r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);

                /* Only allow emulation of specific instructions on #UD
                 * (namely VMMCALL, sysenter, sysexit, syscall)*/
                c = &vcpu->arch.emulate_ctxt.decode;
                if (emulation_type & EMULTYPE_TRAP_UD) {
                        if (!c->twobyte)
                                return EMULATE_FAIL;
                        switch (c->b) {
                        case 0x01: /* VMMCALL */
                                if (c->modrm_mod != 3 || c->modrm_rm != 1)
                                        return EMULATE_FAIL;
                                break;
                        case 0x34: /* sysenter */
                        case 0x35: /* sysexit */
                                if (c->modrm_mod != 0 || c->modrm_rm != 0)
                                        return EMULATE_FAIL;
                                break;
                        case 0x05: /* syscall */
                                if (c->modrm_mod != 0 || c->modrm_rm != 0)
                                        return EMULATE_FAIL;
                                break;
                        default:
                                return EMULATE_FAIL;
                        }

                        if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
                                return EMULATE_FAIL;
                }

                ++vcpu->stat.insn_emulation;
                /* Decode failed: count it, then try unprotecting the page at cr2. */
                if (r)  {
                        ++vcpu->stat.insn_emulation_fail;
                        if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
                                return EMULATE_DONE;
                        return EMULATE_FAIL;
                }
        }

        /* EMULTYPE_SKIP: only advance RIP to the decoded eip, do not execute. */
        if (emulation_type & EMULTYPE_SKIP) {
                kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
                return EMULATE_DONE;
        }

        r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
        shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;

        if (r == 0)
                kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);

        if (vcpu->arch.pio.string)
                return EMULATE_DO_MMIO;

        /*
         * Emulation failed or a write was recorded: fill in the MMIO
         * exit information in the run structure.
         */
        if ((r || vcpu->mmio_is_write) && run) {
                run->exit_reason = KVM_EXIT_MMIO;
                run->mmio.phys_addr = vcpu->mmio_phys_addr;
                memcpy(run->mmio.data, vcpu->mmio_data, 8);
                run->mmio.len = vcpu->mmio_size;
                run->mmio.is_write = vcpu->mmio_is_write;
        }

        if (r) {
                if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
                        return EMULATE_DONE;
                /* A failure without a pending MMIO request is a real failure. */
                if (!vcpu->mmio_needed) {
                        kvm_report_emulation_failure(vcpu, "mmio");
                        return EMULATE_FAIL;
                }
                return EMULATE_DO_MMIO;
        }

        /* Success: write back the flags produced by the emulator. */
        kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);

        if (vcpu->mmio_is_write) {
                vcpu->mmio_needed = 0;
                return EMULATE_DO_MMIO;
        }

        return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(emulate_instruction);
2895
2896 static int pio_copy_data(struct kvm_vcpu *vcpu)
2897 {
2898         void *p = vcpu->arch.pio_data;
2899         gva_t q = vcpu->arch.pio.guest_gva;
2900         unsigned bytes;
2901         int ret;
2902
2903         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2904         if (vcpu->arch.pio.in)
2905                 ret = kvm_write_guest_virt(q, p, bytes, vcpu);
2906         else
2907                 ret = kvm_read_guest_virt(q, p, bytes, vcpu);
2908         return ret;
2909 }
2910
2911 int complete_pio(struct kvm_vcpu *vcpu)
2912 {
2913         struct kvm_pio_request *io = &vcpu->arch.pio;
2914         long delta;
2915         int r;
2916         unsigned long val;
2917
2918         if (!io->string) {
2919                 if (io->in) {
2920                         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2921                         memcpy(&val, vcpu->arch.pio_data, io->size);
2922                         kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2923                 }
2924         } else {
2925                 if (io->in) {
2926                         r = pio_copy_data(vcpu);
2927                         if (r)
2928                                 return r;
2929                 }
2930
2931                 delta = 1;
2932                 if (io->rep) {
2933                         delta *= io->cur_count;
2934                         /*
2935                          * The size of the register should really depend on
2936                          * current address size.
2937                          */
2938                         val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2939                         val -= delta;
2940                         kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2941                 }
2942                 if (io->down)
2943                         delta = -delta;
2944                 delta *= io->size;
2945                 if (io->in) {
2946                         val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2947                         val += delta;
2948                         kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2949                 } else {
2950                         val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2951                         val += delta;
2952                         kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2953                 }
2954         }
2955
2956         io->count -= io->cur_count;
2957         io->cur_count = 0;
2958
2959         return 0;
2960 }
2961
2962 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
2963 {
2964         /* TODO: String I/O for in kernel device */
2965         int r;
2966
2967         if (vcpu->arch.pio.in)
2968                 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
2969                                     vcpu->arch.pio.size, pd);
2970         else
2971                 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
2972                                      vcpu->arch.pio.size, pd);
2973         return r;
2974 }
2975
2976 static int pio_string_write(struct kvm_vcpu *vcpu)
2977 {
2978         struct kvm_pio_request *io = &vcpu->arch.pio;
2979         void *pd = vcpu->arch.pio_data;
2980         int i, r = 0;
2981
2982         for (i = 0; i < io->cur_count; i++) {
2983                 if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
2984                                      io->port, io->size, pd)) {
2985                         r = -EOPNOTSUPP;
2986                         break;
2987                 }
2988                 pd += io->size;
2989         }
2990         return r;
2991 }
2992
2993 int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2994 {
2995         unsigned long val;
2996
2997         vcpu->run->exit_reason = KVM_EXIT_IO;
2998         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2999         vcpu->run->io.size = vcpu->arch.pio.size = size;
3000         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3001         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
3002         vcpu->run->io.port = vcpu->arch.pio.port = port;
3003         vcpu->arch.pio.in = in;
3004         vcpu->arch.pio.string = 0;
3005         vcpu->arch.pio.down = 0;
3006         vcpu->arch.pio.rep = 0;
3007
3008         trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3009                       size, 1);
3010
3011         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3012         memcpy(vcpu->arch.pio_data, &val, 4);
3013
3014         if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3015                 complete_pio(vcpu);
3016                 return 1;
3017         }
3018         return 0;
3019 }
3020 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3021
/*
 * Emulate a string port I/O instruction.
 *
 * @in:      non-zero for port-to-memory, zero for memory-to-port
 * @size:    size of one element in bytes
 * @count:   number of elements requested
 * @down:    non-zero when the string runs towards lower addresses
 * @address: guest virtual address of the string
 * @rep:     non-zero when the instruction carries a rep prefix
 * @port:    I/O port number
 *
 * The vcpu's pio request and the KVM_EXIT_IO fields of the run
 * structure are filled in.  At most the elements that fit in the
 * current page (at least one) are handled per call.  Backwards strings
 * are not supported and inject #GP.  Only writes are attempted against
 * in-kernel devices.
 *
 * Returns 1 when nothing is left for the caller to do (zero count,
 * fault injected, or a write fully completed in the kernel); otherwise
 * returns 0 or the non-zero result of pio_copy_data().
 */
int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
                  int size, unsigned long count, int down,
                  gva_t address, int rep, unsigned port)
{
        unsigned now, in_page;
        int ret = 0;

        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
        vcpu->run->io.size = vcpu->arch.pio.size = size;
        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
        vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
        vcpu->run->io.port = vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = in;
        vcpu->arch.pio.string = 1;
        vcpu->arch.pio.down = down;
        vcpu->arch.pio.rep = rep;

        trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
                      size, count);

        /* Nothing to transfer: just step over the instruction. */
        if (!count) {
                kvm_x86_ops->skip_emulated_instruction(vcpu);
                return 1;
        }

        /* Bytes available in the current page in the direction of travel. */
        if (!down)
                in_page = PAGE_SIZE - offset_in_page(address);
        else
                in_page = offset_in_page(address) + size;
        /* Elements for this batch; always make progress by at least one. */
        now = min(count, (unsigned long)in_page / size);
        if (!now)
                now = 1;
        if (down) {
                /*
                 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
                 */
                pr_unimpl(vcpu, "guest string pio down\n");
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
        vcpu->run->io.count = now;
        vcpu->arch.pio.cur_count = now;

        /* Skip the instruction only when this batch covers the whole request. */
        if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
                kvm_x86_ops->skip_emulated_instruction(vcpu);

        vcpu->arch.pio.guest_gva = address;

        if (!vcpu->arch.pio.in) {
                /* string PIO write */
                ret = pio_copy_data(vcpu);
                if (ret == X86EMUL_PROPAGATE_FAULT) {
                        kvm_inject_gp(vcpu, 0);
                        return 1;
                }
                /* Data fetched: try the in-kernel devices before giving up. */
                if (ret == 0 && !pio_string_write(vcpu)) {
                        complete_pio(vcpu);
                        if (vcpu->arch.pio.count == 0)
                                ret = 1;
                }
        }
        /* no string PIO read support yet */

        return ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
3089
/*
 * Intentionally empty callback for smp_call_function_single(): the
 * cpufreq notifier below uses it only for the side effect of
 * interrupting the target cpu.
 */
static void bounce_off(void *info)
{
        /* nothing */
}
3094
3095 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3096                                      void *data)
3097 {
3098         struct cpufreq_freqs *freq = data;
3099         struct kvm *kvm;
3100         struct kvm_vcpu *vcpu;
3101         int i, send_ipi = 0;
3102
3103         if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3104                 return 0;
3105         if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3106                 return 0;
3107         per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3108
3109         spin_lock(&kvm_lock);
3110         list_for_each_entry(kvm, &vm_list, vm_list) {
3111                 kvm_for_each_vcpu(i, vcpu, kvm) {
3112                         if (vcpu->cpu != freq->cpu)
3113                                 continue;
3114                         if (!kvm_request_guest_time_update(vcpu))
3115                                 continue;
3116                         if (vcpu->cpu != smp_processor_id())
3117                                 send_ipi++;
3118                 }
3119         }
3120         spin_unlock(&kvm_lock);
3121
3122         if (freq->old < freq->new && send_ipi) {
3123                 /*
3124                  * We upscale the frequency.  Must make the guest
3125                  * doesn't see old kvmclock values while running with
3126                  * the new frequency, otherwise we risk the guest sees
3127                  * time go backwards.
3128                  *
3129                  * In case we update the frequency for another cpu
3130                  * (which might be in guest context) send an interrupt
3131                  * to kick the cpu out of guest context.  Next time
3132                  * guest context is entered kvmclock will be updated,
3133                  * so the guest will not see stale values.
3134                  */
3135                 smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
3136         }
3137         return 0;
3138 }
3139
/*
 * Notifier block registered by kvm_timer_init() and unregistered by
 * kvm_arch_exit() when the boot cpu lacks X86_FEATURE_CONSTANT_TSC.
 */
static struct notifier_block kvmclock_cpufreq_notifier_block = {
        .notifier_call  = kvmclock_cpufreq_notifier
};
3143
3144 static void kvm_timer_init(void)
3145 {
3146         int cpu;
3147
3148         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3149                 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3150                                           CPUFREQ_TRANSITION_NOTIFIER);
3151                 for_each_online_cpu(cpu) {
3152                         unsigned long khz = cpufreq_get(cpu);
3153                         if (!khz)
3154                                 khz = tsc_khz;
3155                         per_cpu(cpu_tsc_khz, cpu) = khz;
3156                 }
3157         } else {
3158                 for_each_possible_cpu(cpu)
3159                         per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3160         }
3161 }
3162
3163 int kvm_arch_init(void *opaque)
3164 {
3165         int r;
3166         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3167
3168         if (kvm_x86_ops) {
3169                 printk(KERN_ERR "kvm: already loaded the other module\n");
3170                 r = -EEXIST;
3171                 goto out;
3172         }
3173
3174         if (!ops->cpu_has_kvm_support()) {
3175                 printk(KERN_ERR "kvm: no hardware support\n");
3176                 r = -EOPNOTSUPP;
3177                 goto out;
3178         }
3179         if (ops->disabled_by_bios()) {
3180                 printk(KERN_ERR "kvm: disabled by bios\n");
3181                 r = -EOPNOTSUPP;
3182                 goto out;
3183         }
3184
3185         r = kvm_mmu_module_init();
3186         if (r)
3187                 goto out;
3188
3189         kvm_init_msr_list();
3190
3191         kvm_x86_ops = ops;
3192         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
3193         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
3194         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3195                         PT_DIRTY_MASK, PT64_NX_MASK, 0);
3196
3197         kvm_timer_init();
3198
3199         return 0;
3200
3201 out:
3202         return r;
3203 }
3204
3205 void kvm_arch_exit(void)
3206 {
3207         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
3208                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
3209                                             CPUFREQ_TRANSITION_NOTIFIER);
3210         kvm_x86_ops = NULL;
3211         kvm_mmu_module_exit();
3212 }
3213
3214 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
3215 {
3216         ++vcpu->stat.halt_exits;
3217         if (irqchip_in_kernel(vcpu->kvm)) {
3218                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
3219                 return 1;
3220         } else {
3221                 vcpu->run->exit_reason = KVM_EXIT_HLT;
3222                 return 0;
3223         }
3224 }
3225 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
3226
3227 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3228                            unsigned long a1)
3229 {
3230         if (is_long_mode(vcpu))
3231                 return a0;
3232         else
3233                 return a0 | ((gpa_t)a1 << 32);
3234 }
3235
3236 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3237 {
3238         unsigned long nr, a0, a1, a2, a3, ret;
3239         int r = 1;
3240
3241         nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3242         a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3243         a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
3244         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
3245         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
3246
3247         trace_kvm_hypercall(nr, a0, a1, a2, a3);
3248
3249         if (!is_long_mode(vcpu)) {
3250                 nr &= 0xFFFFFFFF;
3251                 a0 &= 0xFFFFFFFF;
3252                 a1 &= 0xFFFFFFFF;
3253                 a2 &= 0xFFFFFFFF;
3254                 a3 &= 0xFFFFFFFF;
3255         }
3256
3257         if (kvm_x86_ops->get_cpl(vcpu) != 0) {
3258                 ret = -KVM_EPERM;
3259                 goto out;
3260         }
3261
3262         switch (nr) {
3263         case KVM_HC_VAPIC_POLL_IRQ:
3264                 ret = 0;
3265                 break;
3266         case KVM_HC_MMU_OP:
3267                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
3268                 break;
3269         default:
3270                 ret = -KVM_ENOSYS;
3271                 break;
3272         }
3273 out:
3274         kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3275         ++vcpu->stat.hypercalls;
3276         return r;
3277 }
3278 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3279
3280 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3281 {
3282         char instruction[3];
3283         int ret = 0;
3284         unsigned long rip = kvm_rip_read(vcpu);
3285
3286
3287         /*
3288          * Blow out the MMU to ensure that no other VCPU has an active mapping
3289          * to ensure that the updated hypercall appears atomically across all
3290          * VCPUs.
3291          */
3292         kvm_mmu_zap_all(vcpu->kvm);
3293
3294         kvm_x86_ops->patch_hypercall(vcpu, instruction);
3295         if (emulator_write_emulated(rip, instruction, 3, vcpu)
3296             != X86EMUL_CONTINUE)
3297                 ret = -EFAULT;
3298
3299         return ret;
3300 }
3301
3302 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3303 {
3304         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3305 }
3306
3307 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3308 {
3309         struct descriptor_table dt = { limit, base };
3310
3311         kvm_x86_ops->set_gdt(vcpu, &dt);
3312 }
3313
3314 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3315 {
3316         struct descriptor_table dt = { limit, base };
3317
3318         kvm_x86_ops->set_idt(vcpu, &dt);
3319 }
3320
/*
 * Emulate LMSW with machine status word @msw, then store the guest's
 * current rflags in *@rflags.
 */
void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
                   unsigned long *rflags)
{
        kvm_lmsw(vcpu, msw);

        *rflags = kvm_get_rflags(vcpu);
}
3327
3328 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3329 {
3330         unsigned long value;
3331
3332         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3333         switch (cr) {
3334         case 0:
3335                 value = vcpu->arch.cr0;
3336                 break;
3337         case 2:
3338                 value = vcpu->arch.cr2;
3339                 break;
3340         case 3:
3341                 value = vcpu->arch.cr3;
3342                 break;
3343         case 4:
3344                 value = vcpu->arch.cr4;
3345                 break;
3346         case 8:
3347                 value = kvm_get_cr8(vcpu);
3348                 break;
3349         default:
3350                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3351                 return 0;
3352         }
3353
3354         return value;
3355 }
3356
3357 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3358                      unsigned long *rflags)
3359 {
3360         switch (cr) {
3361         case 0:
3362                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3363                 *rflags = kvm_get_rflags(vcpu);
3364                 break;
3365         case 2:
3366                 vcpu->arch.cr2 = val;
3367                 break;
3368         case 3:
3369                 kvm_set_cr3(vcpu, val);
3370                 break;
3371         case 4:
3372                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
3373                 break;
3374         case 8:
3375                 kvm_set_cr8(vcpu, val & 0xfUL);
3376                 break;
3377         default:
3378                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3379         }
3380 }
3381
3382 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
3383 {
3384         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
3385         int j, nent = vcpu->arch.cpuid_nent;
3386
3387         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
3388         /* when no next entry is found, the current entry[i] is reselected */
3389         for (j = i + 1; ; j = (j + 1) % nent) {
3390                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
3391                 if (ej->function == e->function) {
3392                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3393                         return j;
3394                 }
3395         }
3396         return 0; /* silence gcc, even though control never reaches here */
3397 }
3398
3399 /* find an entry with matching function, matching index (if needed), and that
3400  * should be read next (if it's stateful) */
3401 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3402         u32 function, u32 index)
3403 {
3404         if (e->function != function)
3405                 return 0;
3406         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3407                 return 0;
3408         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3409             !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3410                 return 0;
3411         return 1;
3412 }
3413
3414 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3415                                               u32 function, u32 index)
3416 {
3417         int i;
3418         struct kvm_cpuid_entry2 *best = NULL;
3419
3420         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3421                 struct kvm_cpuid_entry2 *e;
3422
3423                 e = &vcpu->arch.cpuid_entries[i];
3424                 if (is_matching_cpuid_entry(e, function, index)) {
3425                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3426                                 move_to_next_stateful_cpuid_entry(vcpu, i);
3427                         best = e;
3428                         break;
3429                 }
3430                 /*
3431                  * Both basic or both extended?
3432                  */
3433                 if (((e->function ^ function) & 0x80000000) == 0)
3434                         if (!best || e->function > best->function)
3435                                 best = e;
3436         }
3437         return best;
3438 }
3439
3440 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3441 {
3442         struct kvm_cpuid_entry2 *best;
3443
3444         best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3445         if (best)
3446                 return best->eax & 0xff;
3447         return 36;
3448 }
3449
3450 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3451 {
3452         u32 function, index;
3453         struct kvm_cpuid_entry2 *best;
3454
3455         function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3456         index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3457         kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3458         kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3459         kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3460         kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3461         best = kvm_find_cpuid_entry(vcpu, function, index);
3462         if (best) {
3463                 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3464                 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3465                 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3466                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3467         }
3468         kvm_x86_ops->skip_emulated_instruction(vcpu);
3469         trace_kvm_cpuid(function,
3470                         kvm_register_read(vcpu, VCPU_REGS_RAX),
3471                         kvm_register_read(vcpu, VCPU_REGS_RBX),
3472                         kvm_register_read(vcpu, VCPU_REGS_RCX),
3473                         kvm_register_read(vcpu, VCPU_REGS_RDX));
3474 }
3475 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3476
3477 /*
3478  * Check if userspace requested an interrupt window, and that the
3479  * interrupt window is open.
3480  *
3481  * No need to exit to userspace if we already have an interrupt queued.
3482  */
3483 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3484 {
3485         return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3486                 vcpu->run->request_interrupt_window &&
3487                 kvm_arch_interrupt_allowed(vcpu));
3488 }
3489
3490 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3491 {
3492         struct kvm_run *kvm_run = vcpu->run;
3493
3494         kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3495         kvm_run->cr8 = kvm_get_cr8(vcpu);
3496         kvm_run->apic_base = kvm_get_apic_base(vcpu);
3497         if (irqchip_in_kernel(vcpu->kvm))
3498                 kvm_run->ready_for_interrupt_injection = 1;
3499         else
3500                 kvm_run->ready_for_interrupt_injection =
3501                         kvm_arch_interrupt_allowed(vcpu) &&
3502                         !kvm_cpu_has_interrupt(vcpu) &&
3503                         !kvm_event_needs_reinjection(vcpu);
3504 }
3505
3506 static void vapic_enter(struct kvm_vcpu *vcpu)
3507 {
3508         struct kvm_lapic *apic = vcpu->arch.apic;
3509         struct page *page;
3510
3511         if (!apic || !apic->vapic_addr)
3512                 return;
3513
3514         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3515
3516         vcpu->arch.apic->vapic_page = page;
3517 }
3518
3519 static void vapic_exit(struct kvm_vcpu *vcpu)
3520 {
3521         struct kvm_lapic *apic = vcpu->arch.apic;
3522
3523         if (!apic || !apic->vapic_addr)
3524                 return;
3525
3526         down_read(&vcpu->kvm->slots_lock);
3527         kvm_release_page_dirty(apic->vapic_page);
3528         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3529         up_read(&vcpu->kvm->slots_lock);
3530 }
3531
3532 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3533 {
3534         int max_irr, tpr;
3535
3536         if (!kvm_x86_ops->update_cr8_intercept)
3537                 return;
3538
3539         if (!vcpu->arch.apic)
3540                 return;
3541
3542         if (!vcpu->arch.apic->vapic_addr)
3543                 max_irr = kvm_lapic_find_highest_irr(vcpu);
3544         else
3545                 max_irr = -1;
3546
3547         if (max_irr != -1)
3548                 max_irr >>= 4;
3549
3550         tpr = kvm_lapic_get_cr8(vcpu);
3551
3552         kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3553 }
3554
3555 static void inject_pending_event(struct kvm_vcpu *vcpu)
3556 {
3557         /* try to reinject previous events if any */
3558         if (vcpu->arch.exception.pending) {
3559                 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
3560                                           vcpu->arch.exception.has_error_code,
3561                                           vcpu->arch.exception.error_code);
3562                 return;
3563         }
3564
3565         if (vcpu->arch.nmi_injected) {
3566                 kvm_x86_ops->set_nmi(vcpu);
3567                 return;
3568         }
3569
3570         if (vcpu->arch.interrupt.pending) {
3571                 kvm_x86_ops->set_irq(vcpu);
3572                 return;
3573         }
3574
3575         /* try to inject new event if pending */
3576         if (vcpu->arch.nmi_pending) {
3577                 if (kvm_x86_ops->nmi_allowed(vcpu)) {
3578                         vcpu->arch.nmi_pending = false;
3579                         vcpu->arch.nmi_injected = true;
3580                         kvm_x86_ops->set_nmi(vcpu);
3581                 }
3582         } else if (kvm_cpu_has_interrupt(vcpu)) {
3583                 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3584                         kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3585                                             false);
3586                         kvm_x86_ops->set_irq(vcpu);
3587                 }
3588         }
3589 }
3590
/*
 * Run one guest entry/exit cycle for @vcpu.
 *
 * Services the pending vcpu->requests bits, injects a pending event,
 * calls the backend's run() hook and finally its handle_exit() hook.
 * slots_lock is dropped (up_read) just before kvm_guest_enter() and
 * taken again (down_read) after the guest has exited; __vcpu_run()
 * acquires it before calling here.
 *
 * Returns:
 *   - the non-zero result of kvm_mmu_reload() if that fails;
 *   - 0 after setting run->exit_reason for a TPR access report or a
 *     triple fault;
 *   - 1 if the entry was abandoned because of pending requests, a
 *     needed reschedule or a pending signal;
 *   - otherwise whatever the backend's handle_exit() returns.
 */
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
        int r;
        /* userspace irqchip and userspace asked for an interrupt window */
        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
                vcpu->run->request_interrupt_window;

        /* An MMU reload request is handled before kvm_mmu_reload(). */
        if (vcpu->requests)
                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
                        kvm_mmu_unload(vcpu);

        r = kvm_mmu_reload(vcpu);
        if (unlikely(r))
                goto out;

        /* Service the remaining request bits; two of them end the run. */
        if (vcpu->requests) {
                if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
                        __kvm_migrate_timers(vcpu);
                if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
                        kvm_write_guest_time(vcpu);
                if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
                        kvm_mmu_sync_roots(vcpu);
                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
                        kvm_x86_ops->tlb_flush(vcpu);
                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
                                       &vcpu->requests)) {
                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
                        r = 0;
                        goto out;
                }
                if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
                        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
                        r = 0;
                        goto out;
                }
        }

        preempt_disable();

        kvm_x86_ops->prepare_guest_switch(vcpu);
        kvm_load_guest_fpu(vcpu);

        local_irq_disable();

        /* KVM_REQ_KICK stays clear from here until after run() returns. */
        clear_bit(KVM_REQ_KICK, &vcpu->requests);
        smp_mb__after_clear_bit();

        /*
         * Final check with interrupts off: if anything needs attention,
         * undo the setup above and return 1 without entering the guest.
         * slots_lock is still held on this path.
         */
        if (vcpu->requests || need_resched() || signal_pending(current)) {
                set_bit(KVM_REQ_KICK, &vcpu->requests);
                local_irq_enable();
                preempt_enable();
                r = 1;
                goto out;
        }

        inject_pending_event(vcpu);

        /* enable NMI/IRQ window open exits if needed */
        if (vcpu->arch.nmi_pending)
                kvm_x86_ops->enable_nmi_window(vcpu);
        else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
                kvm_x86_ops->enable_irq_window(vcpu);

        if (kvm_lapic_enabled(vcpu)) {
                update_cr8_intercept(vcpu);
                kvm_lapic_sync_to_vapic(vcpu);
        }

        up_read(&vcpu->kvm->slots_lock);

        kvm_guest_enter();

        /*
         * Write 0 to debug register 7 first, then load registers 0-3
         * from the vcpu's eff_db[] array.
         */
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
                set_debugreg(vcpu->arch.eff_db[0], 0);
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
        }

        trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu);

        /* Put the current thread's saved debug register values back. */
        if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
                set_debugreg(current->thread.debugreg0, 0);
                set_debugreg(current->thread.debugreg1, 1);
                set_debugreg(current->thread.debugreg2, 2);
                set_debugreg(current->thread.debugreg3, 3);
                set_debugreg(current->thread.debugreg6, 6);
                set_debugreg(current->thread.debugreg7, 7);
        }

        set_bit(KVM_REQ_KICK, &vcpu->requests);
        local_irq_enable();

        ++vcpu->stat.exits;

        /*
         * We must have an instruction between local_irq_enable() and
         * kvm_guest_exit(), so the timer interrupt isn't delayed by
         * the interrupt shadow.  The stat.exits increment will do nicely.
         * But we need to prevent reordering, hence this barrier():
         */
        barrier();

        kvm_guest_exit();

        preempt_enable();

        /* Re-take the lock dropped before guest entry. */
        down_read(&vcpu->kvm->slots_lock);

        /*
         * Profile KVM exit RIPs:
         */
        if (unlikely(prof_on == KVM_PROFILING)) {
                unsigned long rip = kvm_rip_read(vcpu);
                profile_hit(KVM_PROFILING, (void *)rip);
        }


        kvm_lapic_sync_from_vapic(vcpu);

        r = kvm_x86_ops->handle_exit(vcpu);
out:
        return r;
}
3716
3717
/*
 * Main run loop for @vcpu, called from kvm_arch_vcpu_ioctl_run().
 *
 * A vcpu in SIPI_RECEIVED state is reset and made runnable first.  The
 * loop then either enters the guest (when runnable) or blocks until
 * woken, and repeats for as long as r stays positive.  slots_lock is
 * held for reading around the loop and released while the vcpu blocks
 * or reschedules.
 *
 * Returns the final value of r: a value <= 0 from vcpu_enter_guest(),
 * -EINTR when interrupted (signal, userspace interrupt window request,
 * or unexpected mp_state after wake-up), or the error from
 * kvm_arch_vcpu_reset().
 */
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
        int r;

        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
                pr_debug("vcpu %d received sipi with vector # %x\n",
                         vcpu->vcpu_id, vcpu->arch.sipi_vector);
                kvm_lapic_reset(vcpu);
                r = kvm_arch_vcpu_reset(vcpu);
                if (r)
                        return r;
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        }

        down_read(&vcpu->kvm->slots_lock);
        vapic_enter(vcpu);

        r = 1;
        while (r > 0) {
                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
                        r = vcpu_enter_guest(vcpu);
                else {
                        /* Not runnable: block without holding slots_lock. */
                        up_read(&vcpu->kvm->slots_lock);
                        kvm_vcpu_block(vcpu);
                        down_read(&vcpu->kvm->slots_lock);
                        if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
                        {
                                switch(vcpu->arch.mp_state) {
                                case KVM_MP_STATE_HALTED:
                                        vcpu->arch.mp_state =
                                                KVM_MP_STATE_RUNNABLE;
                                        /* fall through */
                                case KVM_MP_STATE_RUNNABLE:
                                        break;
                                case KVM_MP_STATE_SIPI_RECEIVED:
                                default:
                                        r = -EINTR;
                                        break;
                                }
                        }
                }

                if (r <= 0)
                        break;

                clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
                if (kvm_cpu_has_pending_timer(vcpu))
                        kvm_inject_pending_timer_irqs(vcpu);

                /*
                 * The two checks below only set r; the loop ends at the
                 * while condition after the reschedule check has run.
                 */
                if (dm_request_for_irq_injection(vcpu)) {
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.request_irq_exits;
                }
                if (signal_pending(current)) {
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.signal_exits;
                }
                if (need_resched()) {
                        up_read(&vcpu->kvm->slots_lock);
                        kvm_resched(vcpu);
                        down_read(&vcpu->kvm->slots_lock);
                }
        }

        up_read(&vcpu->kvm->slots_lock);
        /* Export the state userspace reads from kvm_run. */
        post_kvm_run_save(vcpu);

        vapic_exit(vcpu);

        return r;
}
3790
/*
 * Arch entry point for running @vcpu, with @kvm_run as the area shared
 * with userspace.
 *
 * Installs the vcpu's signal mask if one is active, finishes any I/O
 * that userspace was asked to complete (PIO, MMIO, hypercall return
 * value) and then enters __vcpu_run().
 *
 * Returns -EAGAIN if the vcpu is still uninitialized (after blocking
 * once), the non-zero result of complete_pio(), 0 when a completed MMIO
 * turns out to need another round trip to userspace, and otherwise the
 * result of __vcpu_run().
 */
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
        int r;
        sigset_t sigsaved;

        vcpu_load(vcpu);

        /* sigsaved is only written, and later only read, when this is set. */
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);

        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
                kvm_vcpu_block(vcpu);
                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                r = -EAGAIN;
                goto out;
        }

        /* re-sync apic's tpr */
        if (!irqchip_in_kernel(vcpu->kvm))
                kvm_set_cr8(vcpu, kvm_run->cr8);

        /* A port I/O operation is still outstanding: finish it first. */
        if (vcpu->arch.pio.cur_count) {
                r = complete_pio(vcpu);
                if (r)
                        goto out;
        }
        /*
         * NOTE(review): this is "#if", not "#ifdef", so it relies on
         * CONFIG_HAS_IOMEM expanding to a non-zero value when enabled.
         */
#if CONFIG_HAS_IOMEM
        if (vcpu->mmio_needed) {
                /* Take the 8 data bytes from kvm_run and re-run emulation. */
                memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
                vcpu->mmio_read_completed = 1;
                vcpu->mmio_needed = 0;

                down_read(&vcpu->kvm->slots_lock);
                r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
                                        EMULTYPE_NO_DECODE);
                up_read(&vcpu->kvm->slots_lock);
                if (r == EMULATE_DO_MMIO) {
                        /*
                         * Read-modify-write.  Back to userspace.
                         */
                        r = 0;
                        goto out;
                }
        }
#endif
        /* Userspace handled a hypercall: its return value goes into RAX. */
        if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
                kvm_register_write(vcpu, VCPU_REGS_RAX,
                                     kvm_run->hypercall.ret);

        r = __vcpu_run(vcpu);

out:
        /* Restore the signal mask that was saved on entry. */
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);

        vcpu_put(vcpu);
        return r;
}
3849
/*
 * Copy the general purpose registers, RIP and RFLAGS of @vcpu into
 * @regs.  R8-R15 are only copied on CONFIG_X86_64 builds.
 *
 * The vcpu is loaded for the duration of the copy.  Always returns 0.
 */
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
        vcpu_load(vcpu);

        regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
        regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
        regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
        regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
        regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
        regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
        regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
        regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
#ifdef CONFIG_X86_64
        regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
        regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
        regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
        regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
        regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
        regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
        regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
        regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
#endif

        regs->rip = kvm_rip_read(vcpu);
        regs->rflags = kvm_get_rflags(vcpu);

        vcpu_put(vcpu);

        return 0;
}
3880
/*
 * Load the general purpose registers, RIP and RFLAGS of @vcpu from
 * @regs.  R8-R15 are only written on CONFIG_X86_64 builds.
 *
 * Any exception that was pending on the vcpu is dropped afterwards.
 * The vcpu is loaded for the duration of the update.  Always returns 0.
 */
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
        vcpu_load(vcpu);

        kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
        kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
        kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
        kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
        kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
        kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
        kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
        kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
#ifdef CONFIG_X86_64
        kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
        kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
        kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
        kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
        kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
        kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
        kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
        kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
#endif

        kvm_rip_write(vcpu, regs->rip);
        kvm_set_rflags(vcpu, regs->rflags);

        /* Discard any exception that was queued before this update. */
        vcpu->arch.exception.pending = false;

        vcpu_put(vcpu);

        return 0;
}
3913
/*
 * Read segment register @seg of @vcpu into @var by delegating to the
 * backend's get_segment hook.
 */
void kvm_get_segment(struct kvm_vcpu *vcpu,
                     struct kvm_segment *var, int seg)
{
        kvm_x86_ops->get_segment(vcpu, var, seg);
}
3919
3920 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3921 {
3922         struct kvm_segment cs;
3923
3924         kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3925         *db = cs.db;
3926         *l = cs.l;
3927 }
3928 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3929
3930 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3931                                   struct kvm_sregs *sregs)
3932 {
3933         struct descriptor_table dt;
3934
3935         vcpu_load(vcpu);
3936
3937         kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3938         kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3939         kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3940         kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3941         kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3942         kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3943
3944         kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3945         kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3946
3947         kvm_x86_ops->get_idt(vcpu, &dt);
3948         sregs->idt.limit = dt.limit;
3949         sregs->idt.base = dt.base;
3950         kvm_x86_ops->get_gdt(vcpu, &dt);
3951         sregs->gdt.limit = dt.limit;
3952         sregs->gdt.base = dt.base;
3953
3954         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3955         sregs->cr0 = vcpu->arch.cr0;
3956         sregs->cr2 = vcpu->arch.cr2;
3957         sregs->cr3 = vcpu->arch.cr3;
3958         sregs->cr4 = vcpu->arch.cr4;
3959         sregs->cr8 = kvm_get_cr8(vcpu);
3960         sregs->efer = vcpu->arch.shadow_efer;
3961         sregs->apic_base = kvm_get_apic_base(vcpu);
3962
3963         memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
3964
3965         if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
3966                 set_bit(vcpu->arch.interrupt.nr,
3967                         (unsigned long *)sregs->interrupt_bitmap);
3968
3969         vcpu_put(vcpu);
3970
3971         return 0;
3972 }
3973
/*
 * Copy the vcpu's current mp_state into @mp_state while the vcpu is
 * loaded.  Always returns 0.
 */
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
{
        vcpu_load(vcpu);
        mp_state->mp_state = vcpu->arch.mp_state;
        vcpu_put(vcpu);
        return 0;
}
3982
/*
 * Set the vcpu's mp_state from @mp_state while the vcpu is loaded.
 * Always returns 0.
 *
 * NOTE(review): the value is stored as given; no range check against
 * the known KVM_MP_STATE_* values is done here.
 */
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
{
        vcpu_load(vcpu);
        vcpu->arch.mp_state = mp_state->mp_state;
        vcpu_put(vcpu);
        return 0;
}
3991
/*
 * Load segment register @seg of @vcpu from @var by delegating to the
 * backend's set_segment hook.
 */
static void kvm_set_segment(struct kvm_vcpu *vcpu,
                        struct kvm_segment *var, int seg)
{
        kvm_x86_ops->set_segment(vcpu, var, seg);
}
3997
3998 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3999                                    struct kvm_segment *kvm_desct)
4000 {
4001         kvm_desct->base = get_desc_base(seg_desc);
4002         kvm_desct->limit = get_desc_limit(seg_desc);
4003         if (seg_desc->g) {
4004                 kvm_desct->limit <<= 12;
4005                 kvm_desct->limit |= 0xfff;
4006         }
4007         kvm_desct->selector = selector;
4008         kvm_desct->type = seg_desc->type;
4009         kvm_desct->present = seg_desc->p;
4010         kvm_desct->dpl = seg_desc->dpl;
4011         kvm_desct->db = seg_desc->d;
4012         kvm_desct->s = seg_desc->s;
4013         kvm_desct->l = seg_desc->l;
4014         kvm_desct->g = seg_desc->g;
4015         kvm_desct->avl = seg_desc->avl;
4016         if (!selector)
4017                 kvm_desct->unusable = 1;
4018         else
4019                 kvm_desct->unusable = 0;
4020         kvm_desct->padding = 0;
4021 }
4022
4023 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
4024                                           u16 selector,
4025                                           struct descriptor_table *dtable)
4026 {
4027         if (selector & 1 << 2) {
4028                 struct kvm_segment kvm_seg;
4029
4030                 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
4031
4032                 if (kvm_seg.unusable)
4033                         dtable->limit = 0;
4034                 else
4035                         dtable->limit = kvm_seg.limit;
4036                 dtable->base = kvm_seg.base;
4037         }
4038         else
4039                 kvm_x86_ops->get_gdt(vcpu, dtable);
4040 }
4041
4042 /* allowed just for 8 bytes segments */
4043 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4044                                          struct desc_struct *seg_desc)
4045 {
4046         struct descriptor_table dtable;
4047         u16 index = selector >> 3;
4048
4049         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4050
4051         if (dtable.limit < index * 8 + 7) {
4052                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4053                 return 1;
4054         }
4055         return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4056 }
4057
4058 /* allowed just for 8 bytes segments */
4059 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4060                                          struct desc_struct *seg_desc)
4061 {
4062         struct descriptor_table dtable;
4063         u16 index = selector >> 3;
4064
4065         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4066
4067         if (dtable.limit < index * 8 + 7)
4068                 return 1;
4069         return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4070 }
4071
4072 static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
4073                              struct desc_struct *seg_desc)
4074 {
4075         u32 base_addr = get_desc_base(seg_desc);
4076
4077         return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
4078 }
4079
4080 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4081 {
4082         struct kvm_segment kvm_seg;
4083
4084         kvm_get_segment(vcpu, &kvm_seg, seg);
4085         return kvm_seg.selector;
4086 }
4087
4088 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4089                                                 u16 selector,
4090                                                 struct kvm_segment *kvm_seg)
4091 {
4092         struct desc_struct seg_desc;
4093
4094         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4095                 return 1;
4096         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4097         return 0;
4098 }
4099
4100 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4101 {
4102         struct kvm_segment segvar = {
4103                 .base = selector << 4,
4104                 .limit = 0xffff,
4105                 .selector = selector,
4106                 .type = 3,
4107                 .present = 1,
4108                 .dpl = 3,
4109                 .db = 0,
4110                 .s = 1,
4111                 .l = 0,
4112                 .g = 0,
4113                 .avl = 0,
4114                 .unusable = 0,
4115         };
4116         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4117         return 0;
4118 }
4119
4120 static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4121 {
4122         return (seg != VCPU_SREG_LDTR) &&
4123                 (seg != VCPU_SREG_TR) &&
4124                 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4125 }
4126
4127 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4128                                 int type_bits, int seg)
4129 {
4130         struct kvm_segment kvm_seg;
4131
4132         if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
4133                 return kvm_load_realmode_segment(vcpu, selector, seg);
4134         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4135                 return 1;
4136         kvm_seg.type |= type_bits;
4137
4138         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
4139             seg != VCPU_SREG_LDTR)
4140                 if (!kvm_seg.s)
4141                         kvm_seg.unusable = 1;
4142
4143         kvm_set_segment(vcpu, &kvm_seg, seg);
4144         return 0;
4145 }
4146
/*
 * Snapshot the vcpu's current state into the 32-bit TSS image @tss:
 * CR3, EIP, EFLAGS, the eight general purpose registers, the six
 * segment selectors and the LDT selector.
 */
static void save_state_to_tss32(struct kvm_vcpu *vcpu,
                                struct tss_segment_32 *tss)
{
        tss->cr3 = vcpu->arch.cr3;
        tss->eip = kvm_rip_read(vcpu);
        tss->eflags = kvm_get_rflags(vcpu);
        tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
        tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
        tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
        tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
        tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
        tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
        tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
        tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
        /* Only the selectors are saved, not the cached descriptors. */
        tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
        tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
        tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
        tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
        tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
        tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
        tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
}
4169
4170 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4171                                   struct tss_segment_32 *tss)
4172 {
4173         kvm_set_cr3(vcpu, tss->cr3);
4174
4175         kvm_rip_write(vcpu, tss->eip);
4176         kvm_set_rflags(vcpu, tss->eflags | 2);
4177
4178         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4179         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
4180         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
4181         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
4182         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
4183         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
4184         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4185         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4186
4187         if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
4188                 return 1;
4189
4190         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4191                 return 1;
4192
4193         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4194                 return 1;
4195
4196         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4197                 return 1;
4198
4199         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4200                 return 1;
4201
4202         if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
4203                 return 1;
4204
4205         if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
4206                 return 1;
4207         return 0;
4208 }
4209
4210 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4211                                 struct tss_segment_16 *tss)
4212 {
4213         tss->ip = kvm_rip_read(vcpu);
4214         tss->flag = kvm_get_rflags(vcpu);
4215         tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4216         tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4217         tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4218         tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4219         tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4220         tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4221         tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
4222         tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
4223
4224         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4225         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4226         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4227         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4228         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4229 }
4230
4231 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4232                                  struct tss_segment_16 *tss)
4233 {
4234         kvm_rip_write(vcpu, tss->ip);
4235         kvm_set_rflags(vcpu, tss->flag | 2);
4236         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4237         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4238         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
4239         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
4240         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
4241         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
4242         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4243         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4244
4245         if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
4246                 return 1;
4247
4248         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4249                 return 1;
4250
4251         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4252                 return 1;
4253
4254         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4255                 return 1;
4256
4257         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4258                 return 1;
4259         return 0;
4260 }
4261
4262 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4263                               u16 old_tss_sel, u32 old_tss_base,
4264                               struct desc_struct *nseg_desc)
4265 {
4266         struct tss_segment_16 tss_segment_16;
4267         int ret = 0;
4268
4269         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4270                            sizeof tss_segment_16))
4271                 goto out;
4272
4273         save_state_to_tss16(vcpu, &tss_segment_16);
4274
4275         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4276                             sizeof tss_segment_16))
4277                 goto out;
4278
4279         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4280                            &tss_segment_16, sizeof tss_segment_16))
4281                 goto out;
4282
4283         if (old_tss_sel != 0xffff) {
4284                 tss_segment_16.prev_task_link = old_tss_sel;
4285
4286                 if (kvm_write_guest(vcpu->kvm,
4287                                     get_tss_base_addr(vcpu, nseg_desc),
4288                                     &tss_segment_16.prev_task_link,
4289                                     sizeof tss_segment_16.prev_task_link))
4290                         goto out;
4291         }
4292
4293         if (load_state_from_tss16(vcpu, &tss_segment_16))
4294                 goto out;
4295
4296         ret = 1;
4297 out:
4298         return ret;
4299 }
4300
4301 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4302                        u16 old_tss_sel, u32 old_tss_base,
4303                        struct desc_struct *nseg_desc)
4304 {
4305         struct tss_segment_32 tss_segment_32;
4306         int ret = 0;
4307
4308         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4309                            sizeof tss_segment_32))
4310                 goto out;
4311
4312         save_state_to_tss32(vcpu, &tss_segment_32);
4313
4314         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4315                             sizeof tss_segment_32))
4316                 goto out;
4317
4318         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4319                            &tss_segment_32, sizeof tss_segment_32))
4320                 goto out;
4321
4322         if (old_tss_sel != 0xffff) {
4323                 tss_segment_32.prev_task_link = old_tss_sel;
4324
4325                 if (kvm_write_guest(vcpu->kvm,
4326                                     get_tss_base_addr(vcpu, nseg_desc),
4327                                     &tss_segment_32.prev_task_link,
4328                                     sizeof tss_segment_32.prev_task_link))
4329                         goto out;
4330         }
4331
4332         if (load_state_from_tss32(vcpu, &tss_segment_32))
4333                 goto out;
4334
4335         ret = 1;
4336 out:
4337         return ret;
4338 }
4339
4340 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4341 {
4342         struct kvm_segment tr_seg;
4343         struct desc_struct cseg_desc;
4344         struct desc_struct nseg_desc;
4345         int ret = 0;
4346         u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4347         u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
4348
4349         old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
4350
4351         /* FIXME: Handle errors. Failure to read either TSS or their
4352          * descriptors should generate a pagefault.
4353          */
4354         if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
4355                 goto out;
4356
4357         if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
4358                 goto out;
4359
4360         if (reason != TASK_SWITCH_IRET) {
4361                 int cpl;
4362
4363                 cpl = kvm_x86_ops->get_cpl(vcpu);
4364                 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
4365                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4366                         return 1;
4367                 }
4368         }
4369
4370         if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
4371                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4372                 return 1;
4373         }
4374
4375         if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
4376                 cseg_desc.type &= ~(1 << 1); //clear the B flag
4377                 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
4378         }
4379
4380         if (reason == TASK_SWITCH_IRET) {
4381                 u32 eflags = kvm_get_rflags(vcpu);
4382                 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4383         }
4384
4385         /* set back link to prev task only if NT bit is set in eflags
4386            note that old_tss_sel is not used afetr this point */
4387         if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4388                 old_tss_sel = 0xffff;
4389
4390         /* set back link to prev task only if NT bit is set in eflags
4391            note that old_tss_sel is not used afetr this point */
4392         if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4393                 old_tss_sel = 0xffff;
4394
4395         if (nseg_desc.type & 8)
4396                 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4397                                          old_tss_base, &nseg_desc);
4398         else
4399                 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
4400                                          old_tss_base, &nseg_desc);
4401
4402         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4403                 u32 eflags = kvm_get_rflags(vcpu);
4404                 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4405         }
4406
4407         if (reason != TASK_SWITCH_IRET) {
4408                 nseg_desc.type |= (1 << 1);
4409                 save_guest_segment_descriptor(vcpu, tss_selector,
4410                                               &nseg_desc);
4411         }
4412
4413         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
4414         seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4415         tr_seg.type = 11;
4416         kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
4417 out:
4418         return ret;
4419 }
4420 EXPORT_SYMBOL_GPL(kvm_task_switch);
4421
4422 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4423                                   struct kvm_sregs *sregs)
4424 {
4425         int mmu_reset_needed = 0;
4426         int pending_vec, max_bits;
4427         struct descriptor_table dt;
4428
4429         vcpu_load(vcpu);
4430
4431         dt.limit = sregs->idt.limit;
4432         dt.base = sregs->idt.base;
4433         kvm_x86_ops->set_idt(vcpu, &dt);
4434         dt.limit = sregs->gdt.limit;
4435         dt.base = sregs->gdt.base;
4436         kvm_x86_ops->set_gdt(vcpu, &dt);
4437
4438         vcpu->arch.cr2 = sregs->cr2;
4439         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4440         vcpu->arch.cr3 = sregs->cr3;
4441
4442         kvm_set_cr8(vcpu, sregs->cr8);
4443
4444         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
4445         kvm_x86_ops->set_efer(vcpu, sregs->efer);
4446         kvm_set_apic_base(vcpu, sregs->apic_base);
4447
4448         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4449
4450         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4451         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4452         vcpu->arch.cr0 = sregs->cr0;
4453
4454         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4455         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4456         if (!is_long_mode(vcpu) && is_pae(vcpu))
4457                 load_pdptrs(vcpu, vcpu->arch.cr3);
4458
4459         if (mmu_reset_needed)
4460                 kvm_mmu_reset_context(vcpu);
4461
4462         max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4463         pending_vec = find_first_bit(
4464                 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
4465         if (pending_vec < max_bits) {
4466                 kvm_queue_interrupt(vcpu, pending_vec, false);
4467                 pr_debug("Set back pending irq %d\n", pending_vec);
4468                 if (irqchip_in_kernel(vcpu->kvm))
4469                         kvm_pic_clear_isr_ack(vcpu->kvm);
4470         }
4471
4472         kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4473         kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4474         kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4475         kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4476         kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4477         kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4478
4479         kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4480         kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4481
4482         update_cr8_intercept(vcpu);
4483
4484         /* Older userspace won't unhalt the vcpu on reset. */
4485         if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4486             sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4487             !(vcpu->arch.cr0 & X86_CR0_PE))
4488                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4489
4490         vcpu_put(vcpu);
4491
4492         return 0;
4493 }
4494
4495 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4496                                         struct kvm_guest_debug *dbg)
4497 {
4498         unsigned long rflags;
4499         int i;
4500
4501         vcpu_load(vcpu);
4502
4503         /*
4504          * Read rflags as long as potentially injected trace flags are still
4505          * filtered out.
4506          */
4507         rflags = kvm_get_rflags(vcpu);
4508
4509         vcpu->guest_debug = dbg->control;
4510         if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4511                 vcpu->guest_debug = 0;
4512
4513         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4514                 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4515                         vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4516                 vcpu->arch.switch_db_regs =
4517                         (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4518         } else {
4519                 for (i = 0; i < KVM_NR_DB_REGS; i++)
4520                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4521                 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4522         }
4523
4524         /*
4525          * Trigger an rflags update that will inject or remove the trace
4526          * flags.
4527          */
4528         kvm_set_rflags(vcpu, rflags);
4529
4530         kvm_x86_ops->set_guest_debug(vcpu, dbg);
4531
4532         if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_DB)
4533                 kvm_queue_exception(vcpu, DB_VECTOR);
4534         else if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_BP)
4535                 kvm_queue_exception(vcpu, BP_VECTOR);
4536
4537         vcpu_put(vcpu);
4538
4539         return 0;
4540 }
4541
4542 /*
4543  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
4544  * we have asm/x86/processor.h
4545  */
4546 struct fxsave {
4547         u16     cwd;
4548         u16     swd;
4549         u16     twd;
4550         u16     fop;
4551         u64     rip;
4552         u64     rdp;
4553         u32     mxcsr;
4554         u32     mxcsr_mask;
4555         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
4556 #ifdef CONFIG_X86_64
4557         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
4558 #else
4559         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
4560 #endif
4561 };
4562
4563 /*
4564  * Translate a guest virtual address to a guest physical address.
4565  */
4566 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4567                                     struct kvm_translation *tr)
4568 {
4569         unsigned long vaddr = tr->linear_address;
4570         gpa_t gpa;
4571
4572         vcpu_load(vcpu);
4573         down_read(&vcpu->kvm->slots_lock);
4574         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4575         up_read(&vcpu->kvm->slots_lock);
4576         tr->physical_address = gpa;
4577         tr->valid = gpa != UNMAPPED_GVA;
4578         tr->writeable = 1;
4579         tr->usermode = 0;
4580         vcpu_put(vcpu);
4581
4582         return 0;
4583 }
4584
4585 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4586 {
4587         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4588
4589         vcpu_load(vcpu);
4590
4591         memcpy(fpu->fpr, fxsave->st_space, 128);
4592         fpu->fcw = fxsave->cwd;
4593         fpu->fsw = fxsave->swd;
4594         fpu->ftwx = fxsave->twd;
4595         fpu->last_opcode = fxsave->fop;
4596         fpu->last_ip = fxsave->rip;
4597         fpu->last_dp = fxsave->rdp;
4598         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4599
4600         vcpu_put(vcpu);
4601
4602         return 0;
4603 }
4604
4605 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4606 {
4607         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4608
4609         vcpu_load(vcpu);
4610
4611         memcpy(fxsave->st_space, fpu->fpr, 128);
4612         fxsave->cwd = fpu->fcw;
4613         fxsave->swd = fpu->fsw;
4614         fxsave->twd = fpu->ftwx;
4615         fxsave->fop = fpu->last_opcode;
4616         fxsave->rip = fpu->last_ip;
4617         fxsave->rdp = fpu->last_dp;
4618         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4619
4620         vcpu_put(vcpu);
4621
4622         return 0;
4623 }
4624
4625 void fx_init(struct kvm_vcpu *vcpu)
4626 {
4627         unsigned after_mxcsr_mask;
4628
4629         /*
4630          * Touch the fpu the first time in non atomic context as if
4631          * this is the first fpu instruction the exception handler
4632          * will fire before the instruction returns and it'll have to
4633          * allocate ram with GFP_KERNEL.
4634          */
4635         if (!used_math())
4636                 kvm_fx_save(&vcpu->arch.host_fx_image);
4637
4638         /* Initialize guest FPU by resetting ours and saving into guest's */
4639         preempt_disable();
4640         kvm_fx_save(&vcpu->arch.host_fx_image);
4641         kvm_fx_finit();
4642         kvm_fx_save(&vcpu->arch.guest_fx_image);
4643         kvm_fx_restore(&vcpu->arch.host_fx_image);
4644         preempt_enable();
4645
4646         vcpu->arch.cr0 |= X86_CR0_ET;
4647         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
4648         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
4649         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
4650                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
4651 }
4652 EXPORT_SYMBOL_GPL(fx_init);
4653
4654 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4655 {
4656         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4657                 return;
4658
4659         vcpu->guest_fpu_loaded = 1;
4660         kvm_fx_save(&vcpu->arch.host_fx_image);
4661         kvm_fx_restore(&vcpu->arch.guest_fx_image);
4662 }
4663 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4664
4665 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4666 {
4667         if (!vcpu->guest_fpu_loaded)
4668                 return;
4669
4670         vcpu->guest_fpu_loaded = 0;
4671         kvm_fx_save(&vcpu->arch.guest_fx_image);
4672         kvm_fx_restore(&vcpu->arch.host_fx_image);
4673         ++vcpu->stat.fpu_reload;
4674 }
4675 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4676
4677 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4678 {
4679         if (vcpu->arch.time_page) {
4680                 kvm_release_page_dirty(vcpu->arch.time_page);
4681                 vcpu->arch.time_page = NULL;
4682         }
4683
4684         kvm_x86_ops->vcpu_free(vcpu);
4685 }
4686
4687 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4688                                                 unsigned int id)
4689 {
4690         return kvm_x86_ops->vcpu_create(kvm, id);
4691 }
4692
4693 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4694 {
4695         int r;
4696
4697         /* We do fxsave: this must be aligned. */
4698         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4699
4700         vcpu->arch.mtrr_state.have_fixed = 1;
4701         vcpu_load(vcpu);
4702         r = kvm_arch_vcpu_reset(vcpu);
4703         if (r == 0)
4704                 r = kvm_mmu_setup(vcpu);
4705         vcpu_put(vcpu);
4706         if (r < 0)
4707                 goto free_vcpu;
4708
4709         return 0;
4710 free_vcpu:
4711         kvm_x86_ops->vcpu_free(vcpu);
4712         return r;
4713 }
4714
4715 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4716 {
4717         vcpu_load(vcpu);
4718         kvm_mmu_unload(vcpu);
4719         vcpu_put(vcpu);
4720
4721         kvm_x86_ops->vcpu_free(vcpu);
4722 }
4723
4724 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4725 {
4726         vcpu->arch.nmi_pending = false;
4727         vcpu->arch.nmi_injected = false;
4728
4729         vcpu->arch.switch_db_regs = 0;
4730         memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4731         vcpu->arch.dr6 = DR6_FIXED_1;
4732         vcpu->arch.dr7 = DR7_FIXED_1;
4733
4734         return kvm_x86_ops->vcpu_reset(vcpu);
4735 }
4736
4737 int kvm_arch_hardware_enable(void *garbage)
4738 {
4739         /*
4740          * Since this may be called from a hotplug notifcation,
4741          * we can't get the CPU frequency directly.
4742          */
4743         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
4744                 int cpu = raw_smp_processor_id();
4745                 per_cpu(cpu_tsc_khz, cpu) = 0;
4746         }
4747         return kvm_x86_ops->hardware_enable(garbage);
4748 }
4749
4750 void kvm_arch_hardware_disable(void *garbage)
4751 {
4752         kvm_x86_ops->hardware_disable(garbage);
4753 }
4754
4755 int kvm_arch_hardware_setup(void)
4756 {
4757         return kvm_x86_ops->hardware_setup();
4758 }
4759
4760 void kvm_arch_hardware_unsetup(void)
4761 {
4762         kvm_x86_ops->hardware_unsetup();
4763 }
4764
4765 void kvm_arch_check_processor_compat(void *rtn)
4766 {
4767         kvm_x86_ops->check_processor_compatibility(rtn);
4768 }
4769
4770 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4771 {
4772         struct page *page;
4773         struct kvm *kvm;
4774         int r;
4775
4776         BUG_ON(vcpu->kvm == NULL);
4777         kvm = vcpu->kvm;
4778
4779         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4780         if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
4781                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4782         else
4783                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4784
4785         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4786         if (!page) {
4787                 r = -ENOMEM;
4788                 goto fail;
4789         }
4790         vcpu->arch.pio_data = page_address(page);
4791
4792         r = kvm_mmu_create(vcpu);
4793         if (r < 0)
4794                 goto fail_free_pio_data;
4795
4796         if (irqchip_in_kernel(kvm)) {
4797                 r = kvm_create_lapic(vcpu);
4798                 if (r < 0)
4799                         goto fail_mmu_destroy;
4800         }
4801
4802         vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
4803                                        GFP_KERNEL);
4804         if (!vcpu->arch.mce_banks) {
4805                 r = -ENOMEM;
4806                 goto fail_mmu_destroy;
4807         }
4808         vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
4809
4810         return 0;
4811
4812 fail_mmu_destroy:
4813         kvm_mmu_destroy(vcpu);
4814 fail_free_pio_data:
4815         free_page((unsigned long)vcpu->arch.pio_data);
4816 fail:
4817         return r;
4818 }
4819
4820 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4821 {
4822         kvm_free_lapic(vcpu);
4823         down_read(&vcpu->kvm->slots_lock);
4824         kvm_mmu_destroy(vcpu);
4825         up_read(&vcpu->kvm->slots_lock);
4826         free_page((unsigned long)vcpu->arch.pio_data);
4827 }
4828
4829 struct  kvm *kvm_arch_create_vm(void)
4830 {
4831         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4832
4833         if (!kvm)
4834                 return ERR_PTR(-ENOMEM);
4835
4836         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4837         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4838
4839         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4840         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4841
4842         rdtscll(kvm->arch.vm_init_tsc);
4843
4844         return kvm;
4845 }
4846
/* Unload @vcpu's MMU, with the vcpu loaded for the duration of the call. */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
	return;
}
4853
4854 static void kvm_free_vcpus(struct kvm *kvm)
4855 {
4856         unsigned int i;
4857         struct kvm_vcpu *vcpu;
4858
4859         /*
4860          * Unpin any mmu pages first.
4861          */
4862         kvm_for_each_vcpu(i, vcpu, kvm)
4863                 kvm_unload_vcpu_mmu(vcpu);
4864         kvm_for_each_vcpu(i, vcpu, kvm)
4865                 kvm_arch_vcpu_free(vcpu);
4866
4867         mutex_lock(&kvm->lock);
4868         for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
4869                 kvm->vcpus[i] = NULL;
4870
4871         atomic_set(&kvm->online_vcpus, 0);
4872         mutex_unlock(&kvm->lock);
4873 }
4874
/* Arch sync-events hook: on x86 this frees all devices assigned to @kvm. */
void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
	return;
}
4879
4880 void kvm_arch_destroy_vm(struct kvm *kvm)
4881 {
4882         kvm_iommu_unmap_guest(kvm);
4883         kvm_free_pit(kvm);
4884         kfree(kvm->arch.vpic);
4885         kfree(kvm->arch.vioapic);
4886         kvm_free_vcpus(kvm);
4887         kvm_free_physmem(kvm);
4888         if (kvm->arch.apic_access_page)
4889                 put_page(kvm->arch.apic_access_page);
4890         if (kvm->arch.ept_identity_pagetable)
4891                 put_page(kvm->arch.ept_identity_pagetable);
4892         kfree(kvm);
4893 }
4894
4895 int kvm_arch_set_memory_region(struct kvm *kvm,
4896                                 struct kvm_userspace_memory_region *mem,
4897                                 struct kvm_memory_slot old,
4898                                 int user_alloc)
4899 {
4900         int npages = mem->memory_size >> PAGE_SHIFT;
4901         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4902
4903         /*To keep backward compatibility with older userspace,
4904          *x86 needs to hanlde !user_alloc case.
4905          */
4906         if (!user_alloc) {
4907                 if (npages && !old.rmap) {
4908                         unsigned long userspace_addr;
4909
4910                         down_write(&current->mm->mmap_sem);
4911                         userspace_addr = do_mmap(NULL, 0,
4912                                                  npages * PAGE_SIZE,
4913                                                  PROT_READ | PROT_WRITE,
4914                                                  MAP_PRIVATE | MAP_ANONYMOUS,
4915                                                  0);
4916                         up_write(&current->mm->mmap_sem);
4917
4918                         if (IS_ERR((void *)userspace_addr))
4919                                 return PTR_ERR((void *)userspace_addr);
4920
4921                         /* set userspace_addr atomically for kvm_hva_to_rmapp */
4922                         spin_lock(&kvm->mmu_lock);
4923                         memslot->userspace_addr = userspace_addr;
4924                         spin_unlock(&kvm->mmu_lock);
4925                 } else {
4926                         if (!old.user_alloc && old.rmap) {
4927                                 int ret;
4928
4929                                 down_write(&current->mm->mmap_sem);
4930                                 ret = do_munmap(current->mm, old.userspace_addr,
4931                                                 old.npages * PAGE_SIZE);
4932                                 up_write(&current->mm->mmap_sem);
4933                                 if (ret < 0)
4934                                         printk(KERN_WARNING
4935                                        "kvm_vm_ioctl_set_memory_region: "
4936                                        "failed to munmap memory\n");
4937                         }
4938                 }
4939         }
4940
4941         spin_lock(&kvm->mmu_lock);
4942         if (!kvm->arch.n_requested_mmu_pages) {
4943                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4944                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4945         }
4946
4947         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4948         spin_unlock(&kvm->mmu_lock);
4949
4950         return 0;
4951 }
4952
/*
 * Flush the shadow MMU state of @kvm: zap all MMU pages, then request an
 * MMU reload on the remote vcpus.
 */
void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
	kvm_reload_remote_mmus(kvm);
	return;
}
4958
4959 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4960 {
4961         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4962                 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4963                 || vcpu->arch.nmi_pending ||
4964                 (kvm_arch_interrupt_allowed(vcpu) &&
4965                  kvm_cpu_has_interrupt(vcpu));
4966 }
4967
4968 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4969 {
4970         int me;
4971         int cpu = vcpu->cpu;
4972
4973         if (waitqueue_active(&vcpu->wq)) {
4974                 wake_up_interruptible(&vcpu->wq);
4975                 ++vcpu->stat.halt_wakeup;
4976         }
4977
4978         me = get_cpu();
4979         if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
4980                 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
4981                         smp_send_reschedule(cpu);
4982         put_cpu();
4983 }
4984
4985 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4986 {
4987         return kvm_x86_ops->interrupt_allowed(vcpu);
4988 }
4989
/* Export the KVM tracepoint symbols listed below (GPL-only export macro). */
4990 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4991 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4992 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4993 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4994 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
4995 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
4996 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
4997 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
4998 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
4999 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5000 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);