git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge remote-tracking branch 'kvm/linux-next'
author Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 7 Mar 2016 04:54:35 +0000 (15:54 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 7 Mar 2016 04:54:35 +0000 (15:54 +1100)
17 files changed:
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/smp.h
arch/powerpc/include/asm/xics.h
arch/powerpc/kernel/smp.c
arch/powerpc/kvm/book3s_64_vio.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/interrupt.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
virt/kvm/async_pf.c
virt/kvm/kvm_main.c

index c98afa538b3aeca91901e858c02884e820ca3dfa,2e7c79101652ef863ee049da91b6047a193a440b..d7b343170453df82b4f31429020c8680e1f949bf
@@@ -182,7 -182,10 +182,10 @@@ struct kvmppc_spapr_tce_table 
        struct list_head list;
        struct kvm *kvm;
        u64 liobn;
-       u32 window_size;
+       struct rcu_head rcu;
+       u32 page_shift;
+       u64 offset;             /* in pages */
+       u64 size;               /* window size in pages */
        struct page *pages[0];
  };
  
@@@ -289,7 -292,7 +292,7 @@@ struct kvmppc_vcore 
        struct list_head runnable_threads;
        struct list_head preempt_list;
        spinlock_t lock;
 -      wait_queue_head_t wq;
 +      struct swait_queue_head wq;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
        u64 stolen_tb;
        u64 preempt_tb;
@@@ -629,7 -632,7 +632,7 @@@ struct kvm_vcpu_arch 
        u8 prodded;
        u32 last_inst;
  
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
        struct kvmppc_vcore *vcore;
        int ret;
        int trap;
index 174271ef2767c0c748daae4db37eb464dec874e2,78083ed20792f63844ad82941168c1f9d005f1b5..e1afd4c4f695f37037dfe3eac8c3730a87dfbc9d
@@@ -67,9 -67,6 +67,9 @@@ void generic_cpu_die(unsigned int cpu)
  void generic_set_cpu_dead(unsigned int cpu);
  void generic_set_cpu_up(unsigned int cpu);
  int generic_check_cpu_restart(unsigned int cpu);
 +int is_cpu_dead(unsigned int cpu);
 +#else
 +#define generic_set_cpu_up(i) do { } while (0)
  #endif
  
  #ifdef CONFIG_PPC64
@@@ -117,6 -114,9 +117,9 @@@ extern int cpu_to_core_id(int cpu)
  #define PPC_MSG_TICK_BROADCAST        2
  #define PPC_MSG_DEBUGGER_BREAK  3
  
+ /* This is only used by the powernv kernel */
+ #define PPC_MSG_RM_HOST_ACTION        4
  /* for irq controllers that have dedicated ipis per message (4) */
  extern int smp_request_message_ipi(int virq, int message);
  extern const char *smp_ipi_name[];
  /* for irq controllers with only a single ipi */
  extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
  extern void smp_muxed_ipi_message_pass(int cpu, int msg);
+ extern void smp_muxed_ipi_set_message(int cpu, int msg);
  extern irqreturn_t smp_ipi_demux(void);
  
  void smp_init_pSeries(void);
@@@ -200,7 -201,6 +204,7 @@@ extern void generic_secondary_thread_in
  extern unsigned long __secondary_hold_spinloop;
  extern unsigned long __secondary_hold_acknowledge;
  extern char __secondary_hold;
 +extern unsigned int booting_thread_hwid;
  
  extern void __early_start(void);
  #endif /* __ASSEMBLY__ */
index 5d61bbced6a11d67e387fc30b76e47d8f0bc5dfd,254604856e69e1cb1ced217b4191b21b664834ba..04ef3ae511da85104ba01570deb9050c5f2f4dd0
@@@ -1,5 -1,5 +1,5 @@@
  /*
 - * Common definitions accross all variants of ICP and ICS interrupt
 + * Common definitions across all variants of ICP and ICS interrupt
   * controllers.
   */
  
@@@ -30,6 -30,7 +30,7 @@@
  #ifdef CONFIG_PPC_ICP_NATIVE
  extern int icp_native_init(void);
  extern void icp_native_flush_interrupt(void);
+ extern void icp_native_cause_ipi_rm(int cpu);
  #else
  static inline int icp_native_init(void) { return -ENODEV; }
  #endif
index a3cc75baddccb0ecf70c496b6231459ca0e5980b,cb8be5dc118a72876dc0e93c5bb510bf4e307a49..8cac1eb414661ad6e3340a8469361fe9a885d117
@@@ -206,7 -206,7 +206,7 @@@ int smp_request_message_ipi(int virq, i
  
  #ifdef CONFIG_PPC_SMP_MUXED_IPI
  struct cpu_messages {
-       int messages;                   /* current messages */
+       long messages;                  /* current messages */
        unsigned long data;             /* data for cause ipi */
  };
  static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
@@@ -218,7 -218,7 +218,7 @@@ void smp_muxed_ipi_set_data(int cpu, un
        info->data = data;
  }
  
- void smp_muxed_ipi_message_pass(int cpu, int msg)
+ void smp_muxed_ipi_set_message(int cpu, int msg)
  {
        struct cpu_messages *info = &per_cpu(ipi_message, cpu);
        char *message = (char *)&info->messages;
         */
        smp_mb();
        message[msg] = 1;
+ }
+ void smp_muxed_ipi_message_pass(int cpu, int msg)
+ {
+       struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+       smp_muxed_ipi_set_message(cpu, msg);
        /*
         * cause_ipi functions are required to include a full barrier
         * before doing whatever causes the IPI.
  }
  
  #ifdef __BIG_ENDIAN__
- #define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
+ #define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
  #else
- #define IPI_MESSAGE(A) (1 << (8 * (A)))
+ #define IPI_MESSAGE(A) (1uL << (8 * (A)))
  #endif
  
  irqreturn_t smp_ipi_demux(void)
  {
        struct cpu_messages *info = this_cpu_ptr(&ipi_message);
-       unsigned int all;
+       unsigned long all;
  
        mb();   /* order any irq clear */
  
        do {
                all = xchg(&info->messages, 0);
+ #if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+               /*
+                * Must check for PPC_MSG_RM_HOST_ACTION messages
+                * before PPC_MSG_CALL_FUNCTION messages because when
+                * a VM is destroyed, we call kick_all_cpus_sync()
+                * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+                * messages have completed before we free any VCPUs.
+                */
+               if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+                       kvmppc_xics_ipi_action();
+ #endif
                if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
                        generic_smp_call_function_interrupt();
                if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
@@@ -427,7 -445,7 +445,7 @@@ void generic_cpu_die(unsigned int cpu
  
        for (i = 0; i < 100; i++) {
                smp_rmb();
 -              if (per_cpu(cpu_state, cpu) == CPU_DEAD)
 +              if (is_cpu_dead(cpu))
                        return;
                msleep(100);
        }
@@@ -454,11 -472,6 +472,11 @@@ int generic_check_cpu_restart(unsigned 
        return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE;
  }
  
 +int is_cpu_dead(unsigned int cpu)
 +{
 +      return per_cpu(cpu_state, cpu) == CPU_DEAD;
 +}
 +
  static bool secondaries_inhibited(void)
  {
        return kvm_hv_mode_active();
@@@ -732,7 -745,7 +750,7 @@@ void start_secondary(void *unused
  
        local_irq_enable();
  
 -      cpu_startup_entry(CPUHP_ONLINE);
 +      cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
  
        BUG();
  }
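
The smp.c hunks above widen the per-CPU message word from int to long and rebuild IPI_MESSAGE() so that PPC_MSG_RM_HOST_ACTION (message number 4) gets its own byte alongside the original four messages. A minimal stand-alone sketch of that byte-per-message layout and of the demux order (host-action checked before call-function, as the new comment in smp_ipi_demux() requires); the message numbers come from the asm/smp.h hunk above, the rest is illustrative:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Message numbers as in arch/powerpc/include/asm/smp.h. */
enum { PPC_MSG_CALL_FUNCTION, PPC_MSG_RESCHEDULE, PPC_MSG_TICK_BROADCAST,
       PPC_MSG_DEBUGGER_BREAK, PPC_MSG_RM_HOST_ACTION };

/* Each message owns one byte of the long word that smp_muxed_ipi_set_message()
 * pokes with "message[msg] = 1"; the shift selects that byte. */
#ifdef __BIG_ENDIAN__
#define IPI_MESSAGE(A) (1UL << ((BITS_PER_LONG - 8) - 8 * (A)))
#else
#define IPI_MESSAGE(A) (1UL << (8 * (A)))
#endif

int main(void)
{
	unsigned long pending = IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION) |
				IPI_MESSAGE(PPC_MSG_CALL_FUNCTION);

	/* Demux in the same order as the patched smp_ipi_demux(). */
	if (pending & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
		printf("real-mode host action first\n");
	if (pending & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
		printf("then the call-function IPI\n");
	return 0;
}
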
index 9c3b76bb69d93ad647c693fa94703237603446ef,2c2d1030843acf5736e4ac0dd08c87b3522f2b17..82970042295eb6ca30b78c57f2b1ba650cebb6f1
@@@ -14,6 -14,7 +14,7 @@@
   *
   * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+  * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
   */
  
  #include <linux/types.h>
  #include <asm/tlbflush.h>
  #include <asm/kvm_ppc.h>
  #include <asm/kvm_book3s.h>
 -#include <asm/mmu-hash64.h>
 +#include <asm/book3s/64/mmu-hash.h>
  #include <asm/hvcall.h>
  #include <asm/synch.h>
  #include <asm/ppc-opcode.h>
  #include <asm/kvm_host.h>
  #include <asm/udbg.h>
+ #include <asm/iommu.h>
+ #include <asm/tce.h>
  
- #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
+ static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
+ {
+       return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+ }
  
- static long kvmppc_stt_npages(unsigned long window_size)
+ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
  {
-       return ALIGN((window_size >> SPAPR_TCE_SHIFT)
-                    * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+       unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) +
+                       (tce_pages * sizeof(struct page *));
+       return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
  }
  
- static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
  {
-       struct kvm *kvm = stt->kvm;
-       int i;
+       long ret = 0;
  
-       mutex_lock(&kvm->lock);
-       list_del(&stt->list);
-       for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+       if (!current || !current->mm)
+               return ret; /* process exited */
+       down_write(&current->mm->mmap_sem);
+       if (inc) {
+               unsigned long locked, lock_limit;
+               locked = current->mm->locked_vm + stt_pages;
+               lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+               if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+                       ret = -ENOMEM;
+               else
+                       current->mm->locked_vm += stt_pages;
+       } else {
+               if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
+                       stt_pages = current->mm->locked_vm;
+               current->mm->locked_vm -= stt_pages;
+       }
+       pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
+                       inc ? '+' : '-',
+                       stt_pages << PAGE_SHIFT,
+                       current->mm->locked_vm << PAGE_SHIFT,
+                       rlimit(RLIMIT_MEMLOCK),
+                       ret ? " - exceeded" : "");
+       up_write(&current->mm->mmap_sem);
+       return ret;
+ }
+ static void release_spapr_tce_table(struct rcu_head *head)
+ {
+       struct kvmppc_spapr_tce_table *stt = container_of(head,
+                       struct kvmppc_spapr_tce_table, rcu);
+       unsigned long i, npages = kvmppc_tce_pages(stt->size);
+       for (i = 0; i < npages; i++)
                __free_page(stt->pages[i]);
-       kfree(stt);
-       mutex_unlock(&kvm->lock);
  
-       kvm_put_kvm(kvm);
+       kfree(stt);
  }
  
  static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
        struct page *page;
  
-       if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+       if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
                return VM_FAULT_SIGBUS;
  
        page = stt->pages[vmf->pgoff];
@@@ -88,7 -130,14 +130,14 @@@ static int kvm_spapr_tce_release(struc
  {
        struct kvmppc_spapr_tce_table *stt = filp->private_data;
  
-       release_spapr_tce_table(stt);
+       list_del_rcu(&stt->list);
+       kvm_put_kvm(stt->kvm);
+       kvmppc_account_memlimit(
+               kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
+       call_rcu(&stt->rcu, release_spapr_tce_table);
        return 0;
  }
  
@@@ -98,20 -147,29 +147,29 @@@ static const struct file_operations kvm
  };
  
  long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-                                  struct kvm_create_spapr_tce *args)
+                                  struct kvm_create_spapr_tce_64 *args)
  {
        struct kvmppc_spapr_tce_table *stt = NULL;
-       long npages;
+       unsigned long npages, size;
        int ret = -ENOMEM;
        int i;
  
+       if (!args->size)
+               return -EINVAL;
        /* Check this LIOBN hasn't been previously allocated */
        list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
                if (stt->liobn == args->liobn)
                        return -EBUSY;
        }
  
-       npages = kvmppc_stt_npages(args->window_size);
+       size = args->size;
+       npages = kvmppc_tce_pages(size);
+       ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+       if (ret) {
+               stt = NULL;
+               goto fail;
+       }
  
        stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
                      GFP_KERNEL);
                goto fail;
  
        stt->liobn = args->liobn;
-       stt->window_size = args->window_size;
+       stt->page_shift = args->page_shift;
+       stt->offset = args->offset;
+       stt->size = size;
        stt->kvm = kvm;
  
        for (i = 0; i < npages; i++) {
        kvm_get_kvm(kvm);
  
        mutex_lock(&kvm->lock);
-       list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+       list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
  
        mutex_unlock(&kvm->lock);
  
@@@ -148,3 -208,59 +208,59 @@@ fail
        }
        return ret;
  }
+ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_list, unsigned long npages)
+ {
+       struct kvmppc_spapr_tce_table *stt;
+       long i, ret = H_SUCCESS, idx;
+       unsigned long entry, ua = 0;
+       u64 __user *tces, tce;
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+       entry = ioba >> stt->page_shift;
+       /*
+        * SPAPR spec says that the maximum size of the list is 512 TCEs
+        * so the whole table fits in 4K page
+        */
+       if (npages > 512)
+               return H_PARAMETER;
+       if (tce_list & (SZ_4K - 1))
+               return H_PARAMETER;
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+       if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+               ret = H_TOO_HARD;
+               goto unlock_exit;
+       }
+       tces = (u64 __user *) ua;
+       for (i = 0; i < npages; ++i) {
+               if (get_user(tce, tces + i)) {
+                       ret = H_TOO_HARD;
+                       goto unlock_exit;
+               }
+               tce = be64_to_cpu(tce);
+               ret = kvmppc_tce_validate(stt, tce);
+               if (ret != H_SUCCESS)
+                       goto unlock_exit;
+               kvmppc_tce_put(stt, entry + i, tce);
+       }
+ unlock_exit:
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);
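
The accounting introduced above charges a guest TCE table against the owner's RLIMIT_MEMLOCK: kvmppc_tce_pages() counts the pages needed for the u64 entries and kvmppc_stt_pages() adds the descriptor with its page-pointer array. A small stand-alone check of that arithmetic; the 64 KiB page size, the 128-byte descriptor header and the example window are assumptions for illustration, not values from the patch:

#include <stdio.h>

#define EX_PAGE_SIZE	65536UL				/* assumed 64 KiB host pages */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* Pages holding the TCE entries themselves (one u64 per IOMMU page). */
static unsigned long tce_pages(unsigned long iommu_pages)
{
	return ALIGN_UP(iommu_pages * sizeof(unsigned long long), EX_PAGE_SIZE)
			/ EX_PAGE_SIZE;
}

/* Pages charged to locked_vm: the TCE pages plus the table descriptor and
 * its struct page pointer array (the header size is a stand-in for
 * sizeof(struct kvmppc_spapr_tce_table)). */
static unsigned long stt_pages(unsigned long tces)
{
	unsigned long desc_bytes = 128 + tces * sizeof(void *);

	return tces + ALIGN_UP(desc_bytes, EX_PAGE_SIZE) / EX_PAGE_SIZE;
}

int main(void)
{
	unsigned long window = 1UL << 21;	/* example: 2M IOMMU pages, 16 MiB of TCEs */
	unsigned long tces = tce_pages(window);

	printf("TCE pages: %lu, pages charged to RLIMIT_MEMLOCK: %lu\n",
	       tces, stt_pages(tces));
	return 0;
}
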
index 039028d3ccb5ab38a39ea52dfa1bad00a6052d74,44be73e6aa26b6563d1cc4cc91e1a332a84271c3..f88b859af53b5c85d71a35e3d11b4dace211674e
@@@ -14,6 -14,7 +14,7 @@@
   *
   * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+  * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
   */
  
  #include <linux/types.h>
  #include <asm/tlbflush.h>
  #include <asm/kvm_ppc.h>
  #include <asm/kvm_book3s.h>
 -#include <asm/mmu-hash64.h>
 +#include <asm/book3s/64/mmu-hash.h>
+ #include <asm/mmu_context.h>
  #include <asm/hvcall.h>
  #include <asm/synch.h>
  #include <asm/ppc-opcode.h>
  #include <asm/kvm_host.h>
  #include <asm/udbg.h>
+ #include <asm/iommu.h>
+ #include <asm/tce.h>
+ #include <asm/iommu.h>
  
  #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
  
- /* WARNING: This will be called in real-mode on HV KVM and virtual
+ /*
+  * Finds a TCE table descriptor by LIOBN.
+  *
+  * WARNING: This will be called in real or virtual mode on HV KVM and virtual
   *          mode on PR KVM
   */
- long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-                     unsigned long ioba, unsigned long tce)
+ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu,
+               unsigned long liobn)
  {
        struct kvm *kvm = vcpu->kvm;
        struct kvmppc_spapr_tce_table *stt;
  
+       list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list)
+               if (stt->liobn == liobn)
+                       return stt;
+       return NULL;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_find_table);
+ /*
+  * Validates IO address.
+  *
+  * WARNING: This will be called in real-mode on HV KVM and virtual
+  *          mode on PR KVM
+  */
+ long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+               unsigned long ioba, unsigned long npages)
+ {
+       unsigned long mask = (1ULL << stt->page_shift) - 1;
+       unsigned long idx = ioba >> stt->page_shift;
+       if ((ioba & mask) || (idx < stt->offset) ||
+                       (idx - stt->offset + npages > stt->size) ||
+                       (idx + npages < idx))
+               return H_PARAMETER;
+       return H_SUCCESS;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);
+ /*
+  * Validates TCE address.
+  * At the moment flags and page mask are validated.
+  * As the host kernel does not access those addresses (just puts them
+  * to the table and user space is supposed to process them), we can skip
+  * checking other things (such as TCE is a guest RAM address or the page
+  * was actually allocated).
+  *
+  * WARNING: This will be called in real-mode on HV KVM and virtual
+  *          mode on PR KVM
+  */
+ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+ {
+       unsigned long page_mask = ~((1ULL << stt->page_shift) - 1);
+       unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ);
+       if (tce & mask)
+               return H_PARAMETER;
+       return H_SUCCESS;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+ /* Note on the use of page_address() in real mode,
+  *
+  * It is safe to use page_address() in real mode on ppc64 because
+  * page_address() is always defined as lowmem_page_address()
+  * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic
+  * operation and does not access page struct.
+  *
+  * Theoretically page_address() could be defined different
+  * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
+  * would have to be enabled.
+  * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
+  * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
+  * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
+  * is not expected to be enabled on ppc32, page_address()
+  * is safe for ppc32 as well.
+  *
+  * WARNING: This will be called in real-mode on HV KVM and virtual
+  *          mode on PR KVM
+  */
+ static u64 *kvmppc_page_address(struct page *page)
+ {
+ #if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
+ #error TODO: fix to avoid page_address() here
+ #endif
+       return (u64 *) page_address(page);
+ }
+ /*
+  * Handles TCE requests for emulated devices.
+  * Puts guest TCE values to the table and expects user space to convert them.
+  * Called in both real and virtual modes.
+  * Cannot fail so kvmppc_tce_validate must be called before it.
+  *
+  * WARNING: This will be called in real-mode on HV KVM and virtual
+  *          mode on PR KVM
+  */
+ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+               unsigned long idx, unsigned long tce)
+ {
+       struct page *page;
+       u64 *tbl;
+       idx -= stt->offset;
+       page = stt->pages[idx / TCES_PER_PAGE];
+       tbl = kvmppc_page_address(page);
+       tbl[idx % TCES_PER_PAGE] = tce;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_tce_put);
+ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+               unsigned long *ua, unsigned long **prmap)
+ {
+       unsigned long gfn = gpa >> PAGE_SHIFT;
+       struct kvm_memory_slot *memslot;
+       memslot = search_memslots(kvm_memslots(kvm), gfn);
+       if (!memslot)
+               return -EINVAL;
+       *ua = __gfn_to_hva_memslot(memslot, gfn) |
+               (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+       if (prmap)
+               *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+ #endif
+       return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
+ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba, unsigned long tce)
+ {
+       struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+       long ret;
        /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
        /*          liobn, ioba, tce); */
  
-       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-               if (stt->liobn == liobn) {
-                       unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-                       struct page *page;
-                       u64 *tbl;
-                       /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
-                       /*          liobn, stt, stt->window_size); */
-                       if (ioba >= stt->window_size)
-                               return H_PARAMETER;
-                       page = stt->pages[idx / TCES_PER_PAGE];
-                       tbl = (u64 *)page_address(page);
-                       /* FIXME: Need to validate the TCE itself */
-                       /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
-                       tbl[idx % TCES_PER_PAGE] = tce;
-                       return H_SUCCESS;
-               }
-       }
+       if (!stt)
+               return H_TOO_HARD;
+       ret = kvmppc_ioba_validate(stt, ioba, 1);
+       if (ret != H_SUCCESS)
+               return ret;
  
-       /* Didn't find the liobn, punt it to userspace */
-       return H_TOO_HARD;
+       ret = kvmppc_tce_validate(stt, tce);
+       if (ret != H_SUCCESS)
+               return ret;
+       kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+       return H_SUCCESS;
  }
  EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
  
- long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-                     unsigned long ioba)
+ static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
+               unsigned long ua, unsigned long *phpa)
+ {
+       pte_t *ptep, pte;
+       unsigned shift = 0;
+       ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift);
+       if (!ptep || !pte_present(*ptep))
+               return -ENXIO;
+       pte = *ptep;
+       if (!shift)
+               shift = PAGE_SHIFT;
+       /* Avoid handling anything potentially complicated in realmode */
+       if (shift > PAGE_SHIFT)
+               return -EAGAIN;
+       if (!pte_young(pte))
+               return -EAGAIN;
+       *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) |
+                       (ua & ~PAGE_MASK);
+       return 0;
+ }
+ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_list, unsigned long npages)
  {
-       struct kvm *kvm = vcpu->kvm;
        struct kvmppc_spapr_tce_table *stt;
+       long i, ret = H_SUCCESS;
+       unsigned long tces, entry, ua = 0;
+       unsigned long *rmap = NULL;
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+       entry = ioba >> stt->page_shift;
+       /*
+        * The spec says that the maximum size of the list is 512 TCEs
+        * so the whole table addressed resides in 4K page
+        */
+       if (npages > 512)
+               return H_PARAMETER;
+       if (tce_list & (SZ_4K - 1))
+               return H_PARAMETER;
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
  
-       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-               if (stt->liobn == liobn) {
-                       unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-                       struct page *page;
-                       u64 *tbl;
+       if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+               return H_TOO_HARD;
  
-                       if (ioba >= stt->window_size)
-                               return H_PARAMETER;
+       rmap = (void *) vmalloc_to_phys(rmap);
  
-                       page = stt->pages[idx / TCES_PER_PAGE];
-                       tbl = (u64 *)page_address(page);
+       /*
+        * Synchronize with the MMU notifier callbacks in
+        * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
+        * While we have the rmap lock, code running on other CPUs
+        * cannot finish unmapping the host real page that backs
+        * this guest real page, so we are OK to access the host
+        * real page.
+        */
+       lock_rmap(rmap);
+       if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+               ret = H_TOO_HARD;
+               goto unlock_exit;
+       }
+       for (i = 0; i < npages; ++i) {
+               unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
+               ret = kvmppc_tce_validate(stt, tce);
+               if (ret != H_SUCCESS)
+                       goto unlock_exit;
  
-                       vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
-                       return H_SUCCESS;
-               }
+               kvmppc_tce_put(stt, entry + i, tce);
        }
  
-       /* Didn't find the liobn, punt it to userspace */
-       return H_TOO_HARD;
+ unlock_exit:
+       unlock_rmap(rmap);
+       return ret;
+ }
+ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_value, unsigned long npages)
+ {
+       struct kvmppc_spapr_tce_table *stt;
+       long i, ret;
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
+       /* Check permission bits only to allow userspace poison TCE for debug */
+       if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
+               return H_PARAMETER;
+       for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
+               kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+       return H_SUCCESS;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
+ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba)
+ {
+       struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+       long ret;
+       unsigned long idx;
+       struct page *page;
+       u64 *tbl;
+       if (!stt)
+               return H_TOO_HARD;
+       ret = kvmppc_ioba_validate(stt, ioba, 1);
+       if (ret != H_SUCCESS)
+               return ret;
+       idx = (ioba >> stt->page_shift) - stt->offset;
+       page = stt->pages[idx / TCES_PER_PAGE];
+       tbl = (u64 *)page_address(page);
+       vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+       return H_SUCCESS;
  }
  EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
+ #endif /* KVM_BOOK3S_HV_POSSIBLE */
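
kvmppc_ioba_validate() and kvmppc_tce_validate() above reduce every H_PUT_TCE-style check to plain mask arithmetic on the table's page_shift, offset and size. A stand-alone sketch of that arithmetic with an invented table; TCE_PCI_READ/TCE_PCI_WRITE are the two low permission bits as defined in asm/tce.h, everything else is illustrative:

#include <stdio.h>

#define TCE_PCI_WRITE	0x2UL
#define TCE_PCI_READ	0x1UL

struct ex_tce_table { unsigned int page_shift; unsigned long offset, size; };

/* Mirrors kvmppc_ioba_validate(): the ioba must be page aligned and the
 * whole [idx, idx + npages) range must stay inside the window. */
static int ioba_ok(const struct ex_tce_table *t, unsigned long ioba,
		   unsigned long npages)
{
	unsigned long mask = (1UL << t->page_shift) - 1;
	unsigned long idx = ioba >> t->page_shift;

	return !((ioba & mask) || idx < t->offset ||
		 idx - t->offset + npages > t->size || idx + npages < idx);
}

/* Mirrors kvmppc_tce_validate(): only a page-aligned address plus the
 * two permission bits may be set. */
static int tce_ok(const struct ex_tce_table *t, unsigned long tce)
{
	unsigned long page_mask = ~((1UL << t->page_shift) - 1);

	return !(tce & ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ));
}

int main(void)
{
	struct ex_tce_table t = { .page_shift = 16, .offset = 0, .size = 512 };

	printf("ioba 0x10000: %d\n", ioba_ok(&t, 0x10000, 1));	/* aligned, in window */
	printf("ioba 0x10001: %d\n", ioba_ok(&t, 0x10001, 1));	/* misaligned */
	printf("tce  0x20003: %d\n", tce_ok(&t, 0x20003));	/* aligned + RW bits */
	printf("tce  0x20004: %d\n", tce_ok(&t, 0x20004));	/* stray low bit */
	return 0;
}
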
index f1187bb6dd4d7f5960e57aea111bd1c12021408d,f47fffefadc1fb8f0a53d13bfdb9d7cbcf55d80d..84fb4fcfaa41b802a614515c67539b0d2d7ee3cf
@@@ -81,6 -81,17 +81,17 @@@ static int target_smt_mode
  module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
  MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
  
+ #ifdef CONFIG_KVM_XICS
+ static struct kernel_param_ops module_param_ops = {
+       .set = param_set_int,
+       .get = param_get_int,
+ };
+ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
+                                                       S_IRUGO | S_IWUSR);
+ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
+ #endif
  static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
  
@@@ -114,11 -125,11 +125,11 @@@ static bool kvmppc_ipi_thread(int cpu
  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
  {
        int cpu;
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
  
        wqp = kvm_arch_vcpu_wq(vcpu);
 -      if (waitqueue_active(wqp)) {
 -              wake_up_interruptible(wqp);
 +      if (swait_active(wqp)) {
 +              swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
  
@@@ -701,8 -712,8 +712,8 @@@ int kvmppc_pseries_do_hcall(struct kvm_
                tvcpu->arch.prodded = 1;
                smp_mb();
                if (vcpu->arch.ceded) {
 -                      if (waitqueue_active(&vcpu->wq)) {
 -                              wake_up_interruptible(&vcpu->wq);
 +                      if (swait_active(&vcpu->wq)) {
 +                              swake_up(&vcpu->wq);
                                vcpu->stat.halt_wakeup++;
                        }
                }
                if (kvmppc_xics_enabled(vcpu)) {
                        ret = kvmppc_xics_hcall(vcpu, req);
                        break;
-               } /* fallthrough */
+               }
+               return RESUME_HOST;
+       case H_PUT_TCE:
+               ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_PUT_TCE_INDIRECT:
+               ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_STUFF_TCE:
+               ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
        default:
                return RESUME_HOST;
        }
@@@ -1459,7 -1494,7 +1494,7 @@@ static struct kvmppc_vcore *kvmppc_vcor
        INIT_LIST_HEAD(&vcore->runnable_threads);
        spin_lock_init(&vcore->lock);
        spin_lock_init(&vcore->stoltb_lock);
 -      init_waitqueue_head(&vcore->wq);
 +      init_swait_queue_head(&vcore->wq);
        vcore->preempt_tb = TB_NIL;
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = core * threads_per_subcore;
@@@ -2278,6 -2313,46 +2313,46 @@@ static void post_guest_process(struct k
        spin_unlock(&vc->lock);
  }
  
+ /*
+  * Clear core from the list of active host cores as we are about to
+  * enter the guest. Only do this if it is the primary thread of the
+  * core (not if a subcore) that is entering the guest.
+  */
+ static inline void kvmppc_clear_host_core(int cpu)
+ {
+       int core;
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+       /*
+        * Memory barrier can be omitted here as we will do a smp_wmb()
+        * later in kvmppc_start_thread and we need ensure that state is
+        * visible to other CPUs only after we enter guest.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
+ }
+ /*
+  * Advertise this core as an active host core since we exited the guest
+  * Only need to do this if it is the primary thread of the core that is
+  * exiting.
+  */
+ static inline void kvmppc_set_host_core(int cpu)
+ {
+       int core;
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+       /*
+        * Memory barrier can be omitted here because we do a spin_unlock
+        * immediately after this which provides the memory barrier.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
+ }
  /*
   * Run a set of guest threads on a physical core.
   * Called with vc->lock held.
@@@ -2390,6 -2465,8 +2465,8 @@@ static noinline void kvmppc_run_core(st
                }
        }
  
+       kvmppc_clear_host_core(pcpu);
        /* Start all the threads */
        active = 0;
        for (sub = 0; sub < core_info.n_subcores; ++sub) {
                        kvmppc_ipi_thread(pcpu + i);
        }
  
+       kvmppc_set_host_core(pcpu);
        spin_unlock(&vc->lock);
  
        /* make sure updates to secondary vcpu structs are visible now */
@@@ -2531,9 -2610,10 +2610,9 @@@ static void kvmppc_vcore_blocked(struc
  {
        struct kvm_vcpu *vcpu;
        int do_sleep = 1;
 +      DECLARE_SWAITQUEUE(wait);
  
 -      DEFINE_WAIT(wait);
 -
 -      prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
 +      prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
  
        /*
         * Check one last time for pending exceptions and ceded state after
        }
  
        if (!do_sleep) {
 -              finish_wait(&vc->wq, &wait);
 +              finish_swait(&vc->wq, &wait);
                return;
        }
  
        trace_kvmppc_vcore_blocked(vc, 0);
        spin_unlock(&vc->lock);
        schedule();
 -      finish_wait(&vc->wq, &wait);
 +      finish_swait(&vc->wq, &wait);
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
        trace_kvmppc_vcore_blocked(vc, 1);
@@@ -2611,7 -2691,7 +2690,7 @@@ static int kvmppc_run_vcpu(struct kvm_r
                        kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
 -                      wake_up(&vc->wq);
 +                      swake_up(&vc->wq);
                }
  
        }
@@@ -2983,6 -3063,114 +3062,114 @@@ static int kvmppc_hv_setup_htab_rma(str
        goto out_srcu;
  }
  
+ #ifdef CONFIG_KVM_XICS
+ static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
+                       void *hcpu)
+ {
+       unsigned long cpu = (long)hcpu;
+       switch (action) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               kvmppc_set_host_core(cpu);
+               break;
+ #ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
+               kvmppc_clear_host_core(cpu);
+               break;
+ #endif
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+ }
+ static struct notifier_block kvmppc_cpu_notifier = {
+           .notifier_call = kvmppc_cpu_notify,
+ };
+ /*
+  * Allocate a per-core structure for managing state about which cores are
+  * running in the host versus the guest and for exchanging data between
+  * real mode KVM and CPU running in the host.
+  * This is only done for the first VM.
+  * The allocated structure stays even if all VMs have stopped.
+  * It is only freed when the kvm-hv module is unloaded.
+  * It's OK for this routine to fail, we just don't support host
+  * core operations like redirecting H_IPI wakeups.
+  */
+ void kvmppc_alloc_host_rm_ops(void)
+ {
+       struct kvmppc_host_rm_ops *ops;
+       unsigned long l_ops;
+       int cpu, core;
+       int size;
+       /* Not the first time here ? */
+       if (kvmppc_host_rm_ops_hv != NULL)
+               return;
+       ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
+       if (!ops)
+               return;
+       size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
+       ops->rm_core = kzalloc(size, GFP_KERNEL);
+       if (!ops->rm_core) {
+               kfree(ops);
+               return;
+       }
+       get_online_cpus();
+       for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
+               if (!cpu_online(cpu))
+                       continue;
+               core = cpu >> threads_shift;
+               ops->rm_core[core].rm_state.in_host = 1;
+       }
+       ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
+       /*
+        * Make the contents of the kvmppc_host_rm_ops structure visible
+        * to other CPUs before we assign it to the global variable.
+        * Do an atomic assignment (no locks used here), but if someone
+        * beats us to it, just free our copy and return.
+        */
+       smp_wmb();
+       l_ops = (unsigned long) ops;
+       if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+               put_online_cpus();
+               kfree(ops->rm_core);
+               kfree(ops);
+               return;
+       }
+       register_cpu_notifier(&kvmppc_cpu_notifier);
+       put_online_cpus();
+ }
+ void kvmppc_free_host_rm_ops(void)
+ {
+       if (kvmppc_host_rm_ops_hv) {
+               unregister_cpu_notifier(&kvmppc_cpu_notifier);
+               kfree(kvmppc_host_rm_ops_hv->rm_core);
+               kfree(kvmppc_host_rm_ops_hv);
+               kvmppc_host_rm_ops_hv = NULL;
+       }
+ }
+ #endif
  static int kvmppc_core_init_vm_hv(struct kvm *kvm)
  {
        unsigned long lpcr, lpid;
                return -ENOMEM;
        kvm->arch.lpid = lpid;
  
+       kvmppc_alloc_host_rm_ops();
        /*
         * Since we don't flush the TLB when tearing down a VM,
         * and this lpid might have previously been used,
@@@ -3228,6 -3418,7 +3417,7 @@@ static int kvmppc_book3s_init_hv(void
  
  static void kvmppc_book3s_exit_hv(void)
  {
+       kvmppc_free_host_rm_ops();
        kvmppc_hv_ops = NULL;
  }
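
kvmppc_alloc_host_rm_ops() above builds the per-core state once and then publishes it with a single cmpxchg64, so two VMs created concurrently cannot both install their copy; the loser simply frees what it built. A minimal portable sketch of that publish-or-free pattern, using C11 atomics in place of the kernel's smp_wmb()/cmpxchg64:

#include <stdatomic.h>
#include <stdlib.h>

struct host_rm_ops { int nr_cores; /* ... per-core state ... */ };

/* Stands in for the global kvmppc_host_rm_ops_hv pointer. */
static _Atomic(struct host_rm_ops *) host_rm_ops;

void alloc_host_rm_ops(void)
{
	struct host_rm_ops *ops, *expected = NULL;

	if (atomic_load(&host_rm_ops))		/* not the first VM */
		return;

	ops = calloc(1, sizeof(*ops));
	if (!ops)
		return;				/* failure is tolerated, as in the patch */
	ops->nr_cores = 8;			/* illustrative */

	/* Release ordering plays the role of smp_wmb(): the contents become
	 * visible before the pointer does.  If another caller won the race,
	 * drop our copy instead of installing it. */
	if (!atomic_compare_exchange_strong_explicit(&host_rm_ops, &expected, ops,
						     memory_order_release,
						     memory_order_relaxed))
		free(ops);
}
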
  
index c613fee0b9f7fa7e5accc99976362ddca1b668d4,ed16182a008b7f10b7aa53c3af2d0f3fd167fe25..b20b2071372b3ae7249c9a00699fdfd3277be69d
@@@ -27,7 -27,7 +27,7 @@@
  #include <asm/asm-offsets.h>
  #include <asm/exception-64s.h>
  #include <asm/kvm_book3s_asm.h>
 -#include <asm/mmu-hash64.h>
 +#include <asm/book3s/64/mmu-hash.h>
  #include <asm/tm.h>
  
  #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
@@@ -2006,8 -2006,8 +2006,8 @@@ hcall_real_table
        .long   0               /* 0x12c */
        .long   0               /* 0x130 */
        .long   DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
-       .long   0               /* 0x138 */
-       .long   0               /* 0x13c */
+       .long   DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
+       .long   DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
        .long   0               /* 0x140 */
        .long   0               /* 0x144 */
        .long   0               /* 0x148 */
index b0c8ad0799c7f0c09607420441ea87735c88c6d9,727e7f7b33fddbb977c91b72e7d922399e7cb1b0..59174b16dd98783b158553af7d90142575689880
@@@ -229,17 -229,11 +229,11 @@@ struct kvm_s390_itdb 
        __u8    data[256];
  } __packed;
  
- struct kvm_s390_vregs {
-       __vector128 vrs[32];
-       __u8    reserved200[512];       /* for future vector expansion */
- } __packed;
  struct sie_page {
        struct kvm_s390_sie_block sie_block;
        __u8 reserved200[1024];         /* 0x0200 */
        struct kvm_s390_itdb itdb;      /* 0x0600 */
-       __u8 reserved700[1280];         /* 0x0700 */
-       struct kvm_s390_vregs vregs;    /* 0x0c00 */
+       __u8 reserved700[2304];         /* 0x0700 */
  } __packed;
  
  struct kvm_vcpu_stat {
@@@ -467,7 -461,7 +461,7 @@@ struct kvm_s390_irq_payload 
  struct kvm_s390_local_interrupt {
        spinlock_t lock;
        struct kvm_s390_float_interrupt *float_int;
 -      wait_queue_head_t *wq;
 +      struct swait_queue_head *wq;
        atomic_t *cpuflags;
        DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
        struct kvm_s390_irq_payload irq;
index 9ffc7322179213f031939fa184bc6c93545af559,87e2d1a89d74eaba5e398392ee2bcd86cdbb0acb..3105390865c87cabd0f0a8b4e03e60534f9bb07b
@@@ -335,23 -335,6 +335,6 @@@ static void set_intercept_indicators(st
        set_intercept_indicators_stop(vcpu);
  }
  
- static u16 get_ilc(struct kvm_vcpu *vcpu)
- {
-       switch (vcpu->arch.sie_block->icptcode) {
-       case ICPT_INST:
-       case ICPT_INSTPROGI:
-       case ICPT_OPEREXC:
-       case ICPT_PARTEXEC:
-       case ICPT_IOINST:
-               /* last instruction only stored for these icptcodes */
-               return insn_length(vcpu->arch.sie_block->ipa >> 8);
-       case ICPT_PROGI:
-               return vcpu->arch.sie_block->pgmilc;
-       default:
-               return 0;
-       }
- }
  static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
  {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@@ -588,7 -571,7 +571,7 @@@ static int __must_check __deliver_prog(
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_pgm_info pgm_info;
        int rc = 0, nullifying = false;
-       u16 ilc = get_ilc(vcpu);
+       u16 ilen;
  
        spin_lock(&li->lock);
        pgm_info = li->irq.pgm;
        memset(&li->irq.pgm, 0, sizeof(pgm_info));
        spin_unlock(&li->lock);
  
-       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilc:%d",
-                  pgm_info.code, ilc);
+       ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
+       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
+                  pgm_info.code, ilen);
        vcpu->stat.deliver_program_int++;
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                         pgm_info.code, 0);
                                   (u8 *) __LC_PER_ACCESS_ID);
        }
  
-       if (nullifying && vcpu->arch.sie_block->icptcode == ICPT_INST)
-               kvm_s390_rewind_psw(vcpu, ilc);
+       if (nullifying && !(pgm_info.flags & KVM_S390_PGM_FLAGS_NO_REWIND))
+               kvm_s390_rewind_psw(vcpu, ilen);
  
-       rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC);
+       /* bit 1+2 of the target are the ilc, so we can directly use ilen */
+       rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
        rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
                                 (u64 *) __LC_LAST_BREAK);
        rc |= put_guest_lc(vcpu, pgm_info.code,
@@@ -966,13 -951,13 +951,13 @@@ no_timer
  
  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
  {
 -      if (waitqueue_active(&vcpu->wq)) {
 +      if (swait_active(&vcpu->wq)) {
                /*
                 * The vcpu gave up the cpu voluntarily, mark it as a good
                 * yield-candidate.
                 */
                vcpu->preempted = true;
 -              wake_up_interruptible(&vcpu->wq);
 +              swake_up(&vcpu->wq);
                vcpu->stat.halt_wakeup++;
        }
  }
@@@ -1059,8 -1044,16 +1044,16 @@@ static int __inject_prog(struct kvm_vcp
        trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                   irq->u.pgm.code, 0);
  
+       if (!(irq->u.pgm.flags & KVM_S390_PGM_FLAGS_ILC_VALID)) {
+               /* auto detection if no valid ILC was given */
+               irq->u.pgm.flags &= ~KVM_S390_PGM_FLAGS_ILC_MASK;
+               irq->u.pgm.flags |= kvm_s390_get_ilen(vcpu);
+               irq->u.pgm.flags |= KVM_S390_PGM_FLAGS_ILC_VALID;
+       }
        if (irq->u.pgm.code == PGM_PER) {
                li->irq.pgm.code |= PGM_PER;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify PER related information */
                li->irq.pgm.per_address = irq->u.pgm.per_address;
                li->irq.pgm.per_code = irq->u.pgm.per_code;
        } else if (!(irq->u.pgm.code & PGM_PER)) {
                li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
                                   irq->u.pgm.code;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify non-PER information */
                li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
                li->irq.pgm.mon_code = irq->u.pgm.mon_code;
diff --combined arch/x86/kvm/lapic.c
index 3a045f39ed8114e24e375521135cb7d2296e9e7e,d9ae1ce2a6a03e0e8ebac52ea88c94dd913fe5f7..443d2a57ad3d9620246097a48ed3cd7de9e02f50
@@@ -281,7 -281,7 +281,7 @@@ void kvm_apic_set_version(struct kvm_vc
        struct kvm_cpuid_entry2 *feat;
        u32 v = APIC_VERSION;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@@ -475,26 -475,20 +475,20 @@@ static inline void apic_clear_isr(int v
  
  int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
  {
-       int highest_irr;
        /* This may race with setting of irr in __apic_accept_irq() and
         * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
         * will cause vmexit immediately and the value will be recalculated
         * on the next vmentry.
         */
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
-       highest_irr = apic_find_highest_irr(vcpu->arch.apic);
-       return highest_irr;
+       return apic_find_highest_irr(vcpu->arch.apic);
  }
  
  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map);
+                            struct dest_map *dest_map);
  
  int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
-               unsigned long *dest_map)
+                    struct dest_map *dest_map)
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
@@@ -675,8 -669,33 +669,33 @@@ bool kvm_apic_match_dest(struct kvm_vcp
        }
  }
  
+ int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+                      const unsigned long *bitmap, u32 bitmap_size)
+ {
+       u32 mod;
+       int i, idx = -1;
+       mod = vector % dest_vcpus;
+       for (i = 0; i <= mod; i++) {
+               idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+               BUG_ON(idx == bitmap_size);
+       }
+       return idx;
+ }
+ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+ {
+       if (!kvm->arch.disabled_lapic_found) {
+               kvm->arch.disabled_lapic_found = true;
+               printk(KERN_INFO
+                      "Disabled LAPIC found during irq injection\n");
+       }
+ }
  bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
  {
        struct kvm_apic_map *map;
        unsigned long bitmap = 1;
  
                dst = map->logical_map[cid];
  
-               if (kvm_lowest_prio_delivery(irq)) {
+               if (!kvm_lowest_prio_delivery(irq))
+                       goto set_irq;
+               if (!kvm_vector_hashing_enabled()) {
                        int l = -1;
                        for_each_set_bit(i, &bitmap, 16) {
                                if (!dst[i])
                                        continue;
                                if (l < 0)
                                        l = i;
-                               else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+                               else if (kvm_apic_compare_prio(dst[i]->vcpu,
+                                                       dst[l]->vcpu) < 0)
                                        l = i;
                        }
                        bitmap = (l >= 0) ? 1 << l : 0;
+               } else {
+                       int idx;
+                       unsigned int dest_vcpus;
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
+                               goto out;
+                       idx = kvm_vector_to_index(irq->vector,
+                               dest_vcpus, &bitmap, 16);
+                       if (!dst[idx]) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
+                       bitmap = (idx >= 0) ? 1 << idx : 0;
                }
        }
  
+ set_irq:
        for_each_set_bit(i, &bitmap, 16) {
                if (!dst[i])
                        continue;
@@@ -754,6 -794,20 +794,20 @@@ out
        return ret;
  }
  
+ /*
+  * This routine tries to handle interrupts in posted mode, here is how
+  * it deals with different cases:
+  * - For single-destination interrupts, handle it in posted mode
+  * - Else if vector hashing is enabled and it is a lowest-priority
+  *   interrupt, handle it in posted mode and use the following mechanism
+  *   to find the destination vCPU.
+  *    1. For lowest-priority interrupts, store all the possible
+  *       destination vCPUs in an array.
+  *    2. Use "guest vector % max number of destination vCPUs" to find
+  *       the right destination vCPU in the array for the lowest-priority
+  *       interrupt.
+  * - Otherwise, use remapped mode to inject the interrupt.
+  */
  bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu)
  {
                if (cid >= ARRAY_SIZE(map->logical_map))
                        goto out;
  
-               for_each_set_bit(i, &bitmap, 16) {
-                       dst = map->logical_map[cid][i];
-                       if (++r == 2)
+               if (kvm_vector_hashing_enabled() &&
+                               kvm_lowest_prio_delivery(irq)) {
+                       int idx;
+                       unsigned int dest_vcpus;
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
                                goto out;
-               }
  
-               if (dst && kvm_apic_present(dst->vcpu))
+                       idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+                                                 &bitmap, 16);
+                       dst = map->logical_map[cid][idx];
+                       if (!dst) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
                        *dest_vcpu = dst->vcpu;
-               else
-                       goto out;
+               } else {
+                       for_each_set_bit(i, &bitmap, 16) {
+                               dst = map->logical_map[cid][i];
+                               if (++r == 2)
+                                       goto out;
+                       }
+                       if (dst && kvm_apic_present(dst->vcpu))
+                               *dest_vcpu = dst->vcpu;
+                       else
+                               goto out;
+               }
        }
  
        ret = true;
@@@ -819,7 -894,7 +894,7 @@@ out
   */
  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map)
+                            struct dest_map *dest_map)
  {
        int result = 0;
        struct kvm_vcpu *vcpu = apic->vcpu;
  
                result = 1;
  
-               if (dest_map)
-                       __set_bit(vcpu->vcpu_id, dest_map);
+               if (dest_map) {
+                       __set_bit(vcpu->vcpu_id, dest_map->map);
+                       dest_map->vectors[vcpu->vcpu_id] = vector;
+               }
  
                if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
                        if (trig_mode)
@@@ -1195,7 -1272,7 +1272,7 @@@ static void apic_update_lvtt(struct kvm
  static void apic_timer_expired(struct kvm_lapic *apic)
  {
        struct kvm_vcpu *vcpu = apic->vcpu;
 -      wait_queue_head_t *q = &vcpu->wq;
 +      struct swait_queue_head *q = &vcpu->wq;
        struct kvm_timer *ktimer = &apic->lapic_timer;
  
        if (atomic_read(&apic->lapic_timer.pending))
        atomic_inc(&apic->lapic_timer.pending);
        kvm_set_pending_timer(vcpu);
  
 -      if (waitqueue_active(q))
 -              wake_up_interruptible(q);
 +      if (swait_active(q))
 +              swake_up(q);
  
        if (apic_lvtt_tscdeadline(apic))
                ktimer->expired_tscdeadline = ktimer->tscdeadline;
@@@ -1239,7 -1316,7 +1316,7 @@@ void wait_lapic_expire(struct kvm_vcpu 
        struct kvm_lapic *apic = vcpu->arch.apic;
        u64 guest_tsc, tsc_deadline;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        if (apic->lapic_timer.expired_tscdeadline == 0)
@@@ -1515,8 -1592,7 +1592,7 @@@ static int apic_mmio_write(struct kvm_v
  
  void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
  {
-       if (kvm_vcpu_has_lapic(vcpu))
-               apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+       apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
  }
  EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
  
@@@ -1566,7 -1642,7 +1642,7 @@@ u64 kvm_get_lapic_tscdeadline_msr(struc
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return 0;
  
@@@ -1577,7 -1653,7 +1653,7 @@@ void kvm_set_lapic_tscdeadline_msr(stru
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return;
  
@@@ -1590,9 -1666,6 +1666,6 @@@ void kvm_lapic_set_tpr(struct kvm_vcpu 
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
        apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
                     | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
  }
@@@ -1601,9 -1674,6 +1674,6 @@@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *
  {
        u64 tpr;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
        tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
  
        return (tpr & 0xf0) >> 4;
@@@ -1728,8 -1798,7 +1798,7 @@@ int apic_has_pending_timer(struct kvm_v
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
-                       apic_lvt_enabled(apic, APIC_LVTT))
+       if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
                return atomic_read(&apic->lapic_timer.pending);
  
        return 0;
@@@ -1826,7 -1895,7 +1895,7 @@@ int kvm_apic_has_interrupt(struct kvm_v
        struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
+       if (!apic_enabled(apic))
                return -1;
  
        apic_update_ppr(apic);
@@@ -1854,9 -1923,6 +1923,6 @@@ void kvm_inject_apic_timer_irqs(struct 
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
        if (atomic_read(&apic->lapic_timer.pending) > 0) {
                kvm_apic_local_deliver(apic, APIC_LVTT);
                if (apic_lvtt_tscdeadline(apic))
@@@ -1932,7 -1998,7 +1998,7 @@@ void __kvm_migrate_apic_timer(struct kv
  {
        struct hrtimer *timer;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        timer = &vcpu->arch.apic->lapic_timer.timer;
@@@ -2105,7 -2171,7 +2171,7 @@@ int kvm_hv_vapic_msr_write(struct kvm_v
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
  
        /* if this is ICR write vector before command */
@@@ -2119,7 -2185,7 +2185,7 @@@ int kvm_hv_vapic_msr_read(struct kvm_vc
        struct kvm_lapic *apic = vcpu->arch.apic;
        u32 low, high = 0;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
  
        if (apic_reg_read(apic, reg, 4, &low))
@@@ -2151,7 -2217,7 +2217,7 @@@ void kvm_apic_accept_events(struct kvm_
        u8 sipi_vector;
        unsigned long pe;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
+       if (!lapic_in_kernel(vcpu) || !apic->pending_events)
                return;
  
        /*
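
The vector-hashing path added to lapic.c above picks a lowest-priority destination with kvm_vector_to_index(): hash the vector with "vector % number of candidate vCPUs" and walk that many set bits into the destination bitmap. A small stand-alone sketch of the selection; the 16-bit bitmap and the sample vector are illustrative only:

#include <stdio.h>
#include <stdint.h>

/* Walk the set bits of @bitmap and return the bit index selected by the
 * hashed vector, mirroring kvm_vector_to_index().  @dest_vcpus must equal
 * the number of set bits. */
static int vector_to_index(uint32_t vector, uint32_t dest_vcpus, uint16_t bitmap)
{
	uint32_t mod = vector % dest_vcpus;
	int i, idx = -1;

	for (i = 0; i <= (int)mod; i++) {
		do {
			idx++;
		} while (!(bitmap & (1u << idx)));	/* advance to the next set bit */
	}
	return idx;
}

int main(void)
{
	uint16_t bitmap = 0x000b;	/* candidate APICs at logical ids 0, 1 and 3 */
	uint32_t vector = 35;

	/* 35 % 3 == 2, so the third set bit (id 3) receives the interrupt. */
	printf("vector %u -> destination id %d\n",
	       vector, vector_to_index(vector, 3, bitmap));
	return 0;
}
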
diff --combined arch/x86/kvm/mmu.c
index e1bb320dd5b256caee31c0342ff8789731da0299,0a4dc9b541810c58eb6c88bfead1ba45323fc894..af631279a2e64f4b5283822f28916178b45a3ad8
@@@ -41,6 -41,7 +41,7 @@@
  #include <asm/cmpxchg.h>
  #include <asm/io.h>
  #include <asm/vmx.h>
+ #include <asm/kvm_page_track.h>
  
  /*
   * When setting this variable to true it enables Two-Dimensional-Paging
@@@ -478,7 -479,7 +479,7 @@@ static bool spte_is_locklessly_modifiab
  static bool spte_has_volatile_bits(u64 spte)
  {
        /*
 -       * Always atomicly update spte if it can be updated
 +       * Always atomically update spte if it can be updated
         * out of mmu-lock, it can ensure dirty bit is not lost,
         * also, it can help us to get a stable is_writable_pte()
         * to ensure tlb flush is not missed.
@@@ -549,7 -550,7 +550,7 @@@ static bool mmu_spte_update(u64 *sptep
  
        /*
         * For the spte updated out of mmu-lock is safe, since
 -       * we always atomicly update it, see the comments in
 +       * we always atomically update it, see the comments in
         * spte_has_volatile_bits().
         */
        if (spte_is_locklessly_modifiable(old_spte) &&
@@@ -776,62 -777,85 +777,85 @@@ static struct kvm_lpage_info *lpage_inf
        return &slot->arch.lpage_info[level - 2][idx];
  }
  
+ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+                                           gfn_t gfn, int count)
+ {
+       struct kvm_lpage_info *linfo;
+       int i;
+       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+               linfo = lpage_info_slot(gfn, slot, i);
+               linfo->disallow_lpage += count;
+               WARN_ON(linfo->disallow_lpage < 0);
+       }
+ }
+ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       update_gfn_disallow_lpage_count(slot, gfn, 1);
+ }
+ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       update_gfn_disallow_lpage_count(slot, gfn, -1);
+ }
  static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
  
+       kvm->arch.indirect_shadow_pages++;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count += 1;
-       }
-       kvm->arch.indirect_shadow_pages++;
+       /* non-leaf shadow pages are kept read-only. */
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_add_page(kvm, slot, gfn,
+                                                   KVM_PAGE_TRACK_WRITE);
+       kvm_mmu_gfn_disallow_lpage(slot, gfn);
  }
  
  static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
  
+       kvm->arch.indirect_shadow_pages--;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count -= 1;
-               WARN_ON(linfo->write_count < 0);
-       }
-       kvm->arch.indirect_shadow_pages--;
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+                                                      KVM_PAGE_TRACK_WRITE);
+       kvm_mmu_gfn_allow_lpage(slot, gfn);
  }
  
- static int __has_wrprotected_page(gfn_t gfn, int level,
-                                 struct kvm_memory_slot *slot)
+ static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+                                         struct kvm_memory_slot *slot)
  {
        struct kvm_lpage_info *linfo;
  
        if (slot) {
                linfo = lpage_info_slot(gfn, slot, level);
-               return linfo->write_count;
+               return !!linfo->disallow_lpage;
        }
  
-       return 1;
+       return true;
  }
  
- static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                       int level)
  {
        struct kvm_memory_slot *slot;
  
        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-       return __has_wrprotected_page(gfn, level, slot);
+       return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
  }
  
  static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
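
The rename from write_count to disallow_lpage above also widens the meaning: the field now counts every reason a large page is forbidden at a given gfn and level (write tracking included), with kvm_mmu_gfn_disallow_lpage()/kvm_mmu_gfn_allow_lpage() as the increment/decrement pair and any non-zero count read back as "disallowed". A stand-alone sketch of that counting convention, using hypothetical names rather than the kernel's structures:

#include <assert.h>
#include <stdbool.h>

struct lpage_info_sketch { int disallow_lpage; };

static void disallow_lpage(struct lpage_info_sketch *li)
{
        li->disallow_lpage++;
}

static void allow_lpage(struct lpage_info_sketch *li)
{
        li->disallow_lpage--;
        assert(li->disallow_lpage >= 0);        /* mirrors the WARN_ON above */
}

static bool lpage_disallowed(const struct lpage_info_sketch *li)
{
        return li->disallow_lpage != 0;         /* the !!linfo->disallow_lpage test */
}

int main(void)
{
        struct lpage_info_sketch li = { 0 };

        disallow_lpage(&li);
        disallow_lpage(&li);
        allow_lpage(&li);
        assert(lpage_disallowed(&li));          /* one reason still outstanding */
        allow_lpage(&li);
        assert(!lpage_disallowed(&li));
        return 0;
}
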
@@@ -897,7 -921,7 +921,7 @@@ static int mapping_level(struct kvm_vcp
        max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
  
        for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-               if (__has_wrprotected_page(large_gfn, level, slot))
+               if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
                        break;
  
        return level - 1;
@@@ -1323,23 -1347,29 +1347,29 @@@ void kvm_arch_mmu_enable_log_dirty_pt_m
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
  }
  
- static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+                                   struct kvm_memory_slot *slot, u64 gfn)
  {
-       struct kvm_memory_slot *slot;
        struct kvm_rmap_head *rmap_head;
        int i;
        bool write_protected = false;
  
-       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                rmap_head = __gfn_to_rmap(gfn, i, slot);
-               write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
+               write_protected |= __rmap_write_protect(kvm, rmap_head, true);
        }
  
        return write_protected;
  }
  
+ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ {
+       struct kvm_memory_slot *slot;
+       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+       return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+ }
  static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
  {
        u64 *sptep;
@@@ -1840,13 -1870,16 +1870,16 @@@ static int __mmu_unsync_walk(struct kvm
        return nr_unsync_leaf;
  }
  
+ #define INVALID_INDEX (-1)
  static int mmu_unsync_walk(struct kvm_mmu_page *sp,
                           struct kvm_mmu_pages *pvec)
  {
+       pvec->nr = 0;
        if (!sp->unsync_children)
                return 0;
  
-       mmu_pages_add(pvec, sp, 0);
+       mmu_pages_add(pvec, sp, INVALID_INDEX);
        return __mmu_unsync_walk(sp, pvec);
  }
  
@@@ -1956,13 -1989,12 +1989,12 @@@ static void kvm_sync_pages(struct kvm_v
  }
  
  struct mmu_page_path {
-       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
-       unsigned int idx[PT64_ROOT_LEVEL-1];
+       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+       unsigned int idx[PT64_ROOT_LEVEL];
  };
  
  #define for_each_sp(pvec, sp, parents, i)                     \
-               for (i = mmu_pages_next(&pvec, &parents, -1),   \
-                       sp = pvec.page[i].sp;                   \
+               for (i = mmu_pages_first(&pvec, &parents);      \
                        i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
                        i = mmu_pages_next(&pvec, &parents, i))
  
@@@ -1974,19 -2006,43 +2006,43 @@@ static int mmu_pages_next(struct kvm_mm
  
        for (n = i+1; n < pvec->nr; n++) {
                struct kvm_mmu_page *sp = pvec->page[n].sp;
+               unsigned idx = pvec->page[n].idx;
+               int level = sp->role.level;
  
-               if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-                       parents->idx[0] = pvec->page[n].idx;
-                       return n;
-               }
+               parents->idx[level-1] = idx;
+               if (level == PT_PAGE_TABLE_LEVEL)
+                       break;
  
-               parents->parent[sp->role.level-2] = sp;
-               parents->idx[sp->role.level-1] = pvec->page[n].idx;
+               parents->parent[level-2] = sp;
        }
  
        return n;
  }
  
+ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+                          struct mmu_page_path *parents)
+ {
+       struct kvm_mmu_page *sp;
+       int level;
+       if (pvec->nr == 0)
+               return 0;
+       WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+       sp = pvec->page[0].sp;
+       level = sp->role.level;
+       WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+       parents->parent[level-2] = sp;
+       /* Also set up a sentinel.  Further entries in pvec are all
+        * children of sp, so this element is never overwritten.
+        */
+       parents->parent[level-1] = NULL;
+       return mmu_pages_next(pvec, parents, 0);
+ }
  static void mmu_pages_clear_parents(struct mmu_page_path *parents)
  {
        struct kvm_mmu_page *sp;
  
        do {
                unsigned int idx = parents->idx[level];
                sp = parents->parent[level];
                if (!sp)
                        return;
  
+               WARN_ON(idx == INVALID_INDEX);
                clear_unsync_child_bit(sp, idx);
                level++;
-       } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
- }
- static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
-                              struct mmu_page_path *parents,
-                              struct kvm_mmu_pages *pvec)
- {
-       parents->parent[parent->role.level-1] = NULL;
-       pvec->nr = 0;
+       } while (!sp->unsync_children);
  }
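
The reworked walk above leans on two sentinels: the root is queued with INVALID_INDEX, since it has no parent slot to clear, and mmu_pages_first() plants a NULL entry one level above the first parent so that mmu_pages_clear_parents() can climb until it either meets that NULL or reaches a parent that still has unsynced children. A stand-alone sketch of that climb with hypothetical types (the plain decrement stands in for clear_unsync_child_bit()):

#include <assert.h>
#include <stddef.h>

struct sp_sketch { int unsync_children; };

/* parent[] is filled bottom-up and terminated by a NULL sentinel. */
static void clear_parents_sketch(struct sp_sketch **parent, int level)
{
        struct sp_sketch *sp;

        do {
                sp = parent[level];
                if (!sp)
                        return;                 /* hit the sentinel */
                sp->unsync_children--;          /* clear_unsync_child_bit() stand-in */
                level++;
        } while (!sp->unsync_children);
}

int main(void)
{
        struct sp_sketch leaf_parent = { 1 }, root = { 2 };
        struct sp_sketch *parents[] = { &leaf_parent, &root, NULL };

        clear_parents_sketch(parents, 0);
        assert(leaf_parent.unsync_children == 0);
        assert(root.unsync_children == 1);      /* root still has another unsynced child */
        return 0;
}
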
  
  static void mmu_sync_children(struct kvm_vcpu *vcpu,
        struct kvm_mmu_pages pages;
        LIST_HEAD(invalid_list);
  
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                bool protected = false;
  
                }
                kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                cond_resched_lock(&vcpu->kvm->mmu_lock);
-               kvm_mmu_pages_init(parent, &parents, &pages);
        }
  }
  
  static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
  {
-       sp->write_flooding_count = 0;
+       atomic_set(&sp->write_flooding_count,  0);
  }
  
  static void clear_sp_write_flooding_count(u64 *spte)
@@@ -2112,12 -2158,18 +2158,18 @@@ static struct kvm_mmu_page *kvm_mmu_get
        hlist_add_head(&sp->hash_link,
                &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
        if (!direct) {
-               if (rmap_write_protect(vcpu, gfn))
+               /*
+                * We should do write protection before syncing pages,
+                * otherwise the content of the synced shadow page may
+                * be inconsistent with the guest page table.
+                */
+               account_shadowed(vcpu->kvm, sp);
+               if (level == PT_PAGE_TABLE_LEVEL &&
+                     rmap_write_protect(vcpu, gfn))
                        kvm_flush_remote_tlbs(vcpu->kvm);
                if (level > PT_PAGE_TABLE_LEVEL && need_sync)
                        kvm_sync_pages(vcpu, gfn);
-               account_shadowed(vcpu->kvm, sp);
        }
        sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
        clear_page(sp->spt);
@@@ -2269,7 -2321,6 +2321,6 @@@ static int mmu_zap_unsync_children(stru
        if (parent->role.level == PT_PAGE_TABLE_LEVEL)
                return 0;
  
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                struct kvm_mmu_page *sp;
  
                        mmu_pages_clear_parents(&parents);
                        zapped++;
                }
-               kvm_mmu_pages_init(parent, &parents, &pages);
        }
  
        return zapped;
@@@ -2354,8 -2404,8 +2404,8 @@@ static bool prepare_zap_oldest_mmu_page
        if (list_empty(&kvm->arch.active_mmu_pages))
                return false;
  
-       sp = list_entry(kvm->arch.active_mmu_pages.prev,
-                       struct kvm_mmu_page, link);
+       sp = list_last_entry(&kvm->arch.active_mmu_pages,
+                            struct kvm_mmu_page, link);
        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
  
        return true;
@@@ -2408,7 -2458,7 +2458,7 @@@ int kvm_mmu_unprotect_page(struct kvm *
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
  
- static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  {
        trace_kvm_mmu_unsync_page(sp);
        ++vcpu->kvm->stat.mmu_unsync;
        kvm_mmu_mark_parents_unsync(sp);
  }
  
- static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                  bool can_unsync)
  {
-       struct kvm_mmu_page *s;
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
-               if (s->unsync)
-                       continue;
-               WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-               __kvm_unsync_page(vcpu, s);
-       }
- }
+       struct kvm_mmu_page *sp;
  
- static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
-                                 bool can_unsync)
- {
-       struct kvm_mmu_page *s;
-       bool need_unsync = false;
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
  
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (!can_unsync)
-                       return 1;
+                       return true;
  
-               if (s->role.level != PT_PAGE_TABLE_LEVEL)
-                       return 1;
+               if (sp->unsync)
+                       continue;
  
-               if (!s->unsync)
-                       need_unsync = true;
+               WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+               kvm_unsync_page(vcpu, sp);
        }
-       if (need_unsync)
-               kvm_unsync_pages(vcpu, gfn);
-       return 0;
+       return false;
  }
  
  static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@@ -2503,7 -2542,7 +2542,7 @@@ static int set_spte(struct kvm_vcpu *vc
                 * be fixed if guest refault.
                 */
                if (level > PT_PAGE_TABLE_LEVEL &&
-                   has_wrprotected_page(vcpu, gfn, level))
+                   mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
                        goto done;
  
                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@@ -2768,7 -2807,7 +2807,7 @@@ static void transparent_hugepage_adjust
        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
            level == PT_PAGE_TABLE_LEVEL &&
            PageTransCompound(pfn_to_page(pfn)) &&
-           !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+           !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
                unsigned long mask;
                /*
                 * mmu_notifier_retry was successful and we hold the
  static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                                kvm_pfn_t pfn, unsigned access, int *ret_val)
  {
-       bool ret = true;
        /* The pfn is invalid, report the error! */
        if (unlikely(is_error_pfn(pfn))) {
                *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
-               goto exit;
+               return true;
        }
  
        if (unlikely(is_noslot_pfn(pfn)))
                vcpu_cache_mmio_info(vcpu, gva, gfn, access);
  
-       ret = false;
- exit:
-       return ret;
+       return false;
  }
  
  static bool page_fault_can_be_fast(u32 error_code)
@@@ -3273,7 -3308,7 +3308,7 @@@ static bool is_shadow_zero_bits_set(str
        return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
  }
  
- static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
  {
        if (direct)
                return vcpu_match_mmio_gpa(vcpu, addr);
@@@ -3332,7 -3367,7 +3367,7 @@@ int handle_mmio_page_fault(struct kvm_v
        u64 spte;
        bool reserved;
  
-       if (quickly_check_mmio_pf(vcpu, addr, direct))
+       if (mmio_info_in_cache(vcpu, addr, direct))
                return RET_MMIO_PF_EMULATE;
  
        reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
  }
  EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
  
+ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+                                        u32 error_code, gfn_t gfn)
+ {
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return false;
+       if (!(error_code & PFERR_PRESENT_MASK) ||
+             !(error_code & PFERR_WRITE_MASK))
+               return false;
+       /*
+        * The guest is writing a page which is write-tracked, which
+        * cannot be fixed by the page fault handler.
+        */
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
+       return false;
+ }
+ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+ {
+       struct kvm_shadow_walk_iterator iterator;
+       u64 spte;
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+       walk_shadow_page_lockless_begin(vcpu);
+       for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+               clear_sp_write_flooding_count(iterator.sptep);
+               if (!is_shadow_present_pte(spte))
+                       break;
+       }
+       walk_shadow_page_lockless_end(vcpu);
+ }
  static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                u32 error_code, bool prefault)
  {
-       gfn_t gfn;
+       gfn_t gfn = gva >> PAGE_SHIFT;
        int r;
  
        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gva, true);
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
  
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
-       gfn = gva >> PAGE_SHIFT;
  
        return nonpaging_map(vcpu, gva & PAGE_MASK,
                             error_code, gfn, prefault);
@@@ -3460,12 -3527,8 +3527,8 @@@ static int tdp_page_fault(struct kvm_vc
  
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gpa, true);
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@@ -4186,7 -4249,8 +4249,8 @@@ static bool detect_write_flooding(struc
        if (sp->role.level == PT_PAGE_TABLE_LEVEL)
                return false;
  
-       return ++sp->write_flooding_count >= 3;
+       atomic_inc(&sp->write_flooding_count);
+       return atomic_read(&sp->write_flooding_count) >= 3;
  }
  
  /*
@@@ -4248,8 -4312,8 +4312,8 @@@ static u64 *get_written_sptes(struct kv
        return spte;
  }
  
- void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes)
+ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                             const u8 *new, int bytes)
  {
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct kvm_mmu_page *sp;
@@@ -4354,32 -4418,34 +4418,34 @@@ static void make_mmu_pages_available(st
        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
  }
  
- static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
- {
-       if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
-               return vcpu_match_mmio_gpa(vcpu, addr);
-       return vcpu_match_mmio_gva(vcpu, addr);
- }
  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
                       void *insn, int insn_len)
  {
        int r, emulation_type = EMULTYPE_RETRY;
        enum emulation_result er;
+       bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+       if (unlikely(error_code & PFERR_RSVD_MASK)) {
+               r = handle_mmio_page_fault(vcpu, cr2, direct);
+               if (r == RET_MMIO_PF_EMULATE) {
+                       emulation_type = 0;
+                       goto emulate;
+               }
+               if (r == RET_MMIO_PF_RETRY)
+                       return 1;
+               if (r < 0)
+                       return r;
+       }
  
        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
        if (r < 0)
-               goto out;
-       if (!r) {
-               r = 1;
-               goto out;
-       }
+               return r;
+       if (!r)
+               return 1;
  
-       if (is_mmio_page_fault(vcpu, cr2))
+       if (mmio_info_in_cache(vcpu, cr2, direct))
                emulation_type = 0;
+ emulate:
        er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
  
        switch (er) {
        default:
                BUG();
        }
- out:
-       return r;
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
  
@@@ -4463,6 -4527,21 +4527,21 @@@ void kvm_mmu_setup(struct kvm_vcpu *vcp
        init_kvm_mmu(vcpu);
  }
  
+ void kvm_mmu_init_vm(struct kvm *kvm)
+ {
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+       node->track_write = kvm_mmu_pte_write;
+       kvm_page_track_register_notifier(kvm, node);
+ }
+ void kvm_mmu_uninit_vm(struct kvm *kvm)
+ {
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+       kvm_page_track_unregister_notifier(kvm, node);
+ }
  /* The return value indicates if tlb flush on all vcpus is needed. */
  typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
  
diff --combined arch/x86/kvm/paging_tmpl.h
index 2ce4f05e81d3804cef4ded542a105a5a917b8bed,4174cf290fa3f71fde863339eaba71efdb4e8269..49f1c0b9082babf4ef0b23887e72ba8ecc185ff3
@@@ -249,7 -249,7 +249,7 @@@ static int FNAME(update_accessed_dirty_
                        return ret;
  
                kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
 -              walker->ptes[level] = pte;
 +              walker->ptes[level - 1] = pte;
        }
        return 0;
  }
@@@ -702,23 -702,16 +702,16 @@@ static int FNAME(page_fault)(struct kvm
  
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-               /*
-                * page fault with PFEC.RSVD  = 1 is caused by shadow
-                * page fault, should not be used to walk guest page
-                * table.
-                */
-               error_code &= ~PFERR_RSVD_MASK;
-       };
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;
  
+       /*
+        * If PFEC.RSVD is set, this is a shadow page fault.
+        * The bit needs to be cleared before walking guest page tables.
+        */
+       error_code &= ~PFERR_RSVD_MASK;
        /*
         * Look up the guest pte for the faulting address.
         */
                return 0;
        }
  
+       if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+               shadow_page_table_clear_flood(vcpu, addr);
+               return 1;
+       }
        vcpu->arch.write_fault_to_shadow_pgtable = false;
  
        is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
diff --combined arch/x86/kvm/vmx.c
index b92094ee135e62a1cefcd35df7a4860dc48d1966,46154dac71e64b14d7563d76abc4d5b7c1f70e3b..e87c494cb4769352c6662ecef01202f4a1b1f04f
@@@ -596,8 -596,6 +596,8 @@@ struct vcpu_vmx 
        /* Support for PML */
  #define PML_ENTITY_NUM                512
        struct page *pml_pg;
 +
 +      u64 current_tsc_ratio;
  };
  
  enum segment_cache_field {
@@@ -963,25 -961,36 +963,36 @@@ static const u32 vmx_msr_index[] = 
        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
  };
  
- static inline bool is_page_fault(u32 intr_info)
+ static inline bool is_exception_n(u32 intr_info, u8 vector)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+ }
+ static inline bool is_debug(u32 intr_info)
+ {
+       return is_exception_n(intr_info, DB_VECTOR);
+ }
+ static inline bool is_breakpoint(u32 intr_info)
+ {
+       return is_exception_n(intr_info, BP_VECTOR);
+ }
+ static inline bool is_page_fault(u32 intr_info)
+ {
+       return is_exception_n(intr_info, PF_VECTOR);
  }
  
  static inline bool is_no_device(u32 intr_info)
  {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, NM_VECTOR);
  }
  
  static inline bool is_invalid_opcode(u32 intr_info)
  {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, UD_VECTOR);
  }
  
  static inline bool is_external_interrupt(u32 intr_info)
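
is_exception_n() above simply compares the VM-exit interruption-information field against "valid hardware exception with vector N", and is_debug()/is_breakpoint()/is_page_fault() and friends become one-liners on top of it. The sketch below reproduces that check in isolation using the architectural layout of the field (vector in bits 7:0, type in bits 10:8, valid in bit 31); the macro names and the stand-alone form are illustrative, the real constants live in the VMX headers.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative constants following the architectural field layout. */
#define VECTOR_MASK      0x000000ffu   /* bits 7:0  */
#define TYPE_MASK        0x00000700u   /* bits 10:8 */
#define TYPE_HARD_EXCEPT 0x00000300u   /* type 3: hardware exception */
#define VALID_MASK       0x80000000u   /* bit 31    */
#define PF_VEC           14            /* #PF */

static bool is_exception_n_sketch(uint32_t intr_info, uint8_t vector)
{
        return (intr_info & (TYPE_MASK | VECTOR_MASK | VALID_MASK)) ==
               (TYPE_HARD_EXCEPT | vector | VALID_MASK);
}

int main(void)
{
        uint32_t pf_exit = VALID_MASK | TYPE_HARD_EXCEPT | PF_VEC;  /* 0x8000030e */

        assert(is_exception_n_sketch(pf_exit, PF_VEC));
        assert(!is_exception_n_sketch(pf_exit, 1 /* #DB */));
        return 0;
}
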
@@@ -2129,16 -2138,14 +2140,16 @@@ static void vmx_vcpu_load(struct kvm_vc
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
  
 -              /* Setup TSC multiplier */
 -              if (cpu_has_vmx_tsc_scaling())
 -                      vmcs_write64(TSC_MULTIPLIER,
 -                                   vcpu->arch.tsc_scaling_ratio);
 -
                vmx->loaded_vmcs->cpu = cpu;
        }
  
 +      /* Setup TSC multiplier */
 +      if (kvm_has_tsc_control &&
 +          vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
 +              vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
 +              vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
 +      }
 +
        vmx_vcpu_pi_load(vcpu, cpu);
  }
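
The vmx_vcpu_load() change above stops writing TSC_MULTIPLIER only on the "first load on this pCPU" path and instead caches the last value written in vmx->current_tsc_ratio, so the comparatively costly VMCS write happens only when the scaling ratio actually changes, including after the vCPU migrates. A minimal sketch of that write-if-changed pattern, with hypothetical names and a printf standing in for the VMCS access:

#include <stdint.h>
#include <stdio.h>

struct vcpu_tsc_sketch {
        uint64_t current_tsc_ratio;     /* last value handed to the hardware */
};

static void hw_write_multiplier(uint64_t ratio)
{
        printf("VMCS write: TSC multiplier = %llu\n", (unsigned long long)ratio);
}

static void load_tsc_ratio(struct vcpu_tsc_sketch *v, uint64_t wanted_ratio)
{
        if (v->current_tsc_ratio == wanted_ratio)
                return;                         /* skip the redundant write */
        v->current_tsc_ratio = wanted_ratio;
        hw_write_multiplier(wanted_ratio);
}

int main(void)
{
        struct vcpu_tsc_sketch v = { 0 };

        load_tsc_ratio(&v, 1ULL << 48);         /* writes */
        load_tsc_ratio(&v, 1ULL << 48);         /* no-op  */
        return 0;
}
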
  
@@@ -5479,7 -5486,7 +5490,7 @@@ static int handle_set_cr4(struct kvm_vc
                return kvm_set_cr4(vcpu, val);
  }
  
 -/* called to set cr0 as approriate for clts instruction exit. */
 +/* called to set cr0 as appropriate for clts instruction exit. */
  static void handle_clts(struct kvm_vcpu *vcpu)
  {
        if (is_guest_mode(vcpu)) {
@@@ -5612,11 -5619,8 +5623,8 @@@ static int handle_dr(struct kvm_vcpu *v
        }
  
        if (vcpu->guest_debug == 0) {
-               u32 cpu_based_vm_exec_control;
-               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                               CPU_BASED_MOV_DR_EXITING);
  
                /*
                 * No more DR vmexits; force a reload of the debug registers
@@@ -5653,8 -5657,6 +5661,6 @@@ static void vmx_set_dr6(struct kvm_vcp
  
  static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
  {
-       u32 cpu_based_vm_exec_control;
        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
  
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
  }
  
  static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@@ -5751,8 -5750,7 +5754,7 @@@ static int handle_halt(struct kvm_vcpu 
  
  static int handle_vmcall(struct kvm_vcpu *vcpu)
  {
-       kvm_emulate_hypercall(vcpu);
-       return 1;
+       return kvm_emulate_hypercall(vcpu);
  }
  
  static int handle_invd(struct kvm_vcpu *vcpu)
@@@ -6439,8 -6437,8 +6441,8 @@@ static struct loaded_vmcs *nested_get_c
  
        if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
                /* Recycle the least recently used VMCS. */
-               item = list_entry(vmx->nested.vmcs02_pool.prev,
-                       struct vmcs02_list, list);
+               item = list_last_entry(&vmx->nested.vmcs02_pool,
+                                      struct vmcs02_list, list);
                item->vmptr = vmx->nested.current_vmptr;
                list_move(&item->list, &vmx->nested.vmcs02_pool);
                return &item->vmcs02;
@@@ -7227,7 -7225,7 +7229,7 @@@ static int handle_vmwrite(struct kvm_vc
        /* The value to write might be 32 or 64 bits, depending on L1's long
         * mode, and eventually we need to write that into a field of several
         * possible lengths. The code below first zero-extends the value to 64
 -       * bit (field_value), and then copies only the approriate number of
 +       * bit (field_value), and then copies only the appropriate number of
         * bits into the vmcs12 field.
         */
        u64 field_value = 0;
@@@ -7756,6 -7754,13 +7758,13 @@@ static bool nested_vmx_exit_handled(str
                else if (is_no_device(intr_info) &&
                         !(vmcs12->guest_cr0 & X86_CR0_TS))
                        return false;
+               else if (is_debug(intr_info) &&
+                        vcpu->guest_debug &
+                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+                       return false;
+               else if (is_breakpoint(intr_info) &&
+                        vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                       return false;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
@@@ -8360,7 -8365,6 +8369,7 @@@ static void vmx_complete_atomic_exit(st
  static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
  {
        u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 +      register void *__sp asm(_ASM_SP);
  
        /*
         * If external interrupt exists, IF bit is set in rflags/eflags on the
                        "call *%[entry]\n\t"
                        :
  #ifdef CONFIG_X86_64
 -                      [sp]"=&r"(tmp)
 +                      [sp]"=&r"(tmp),
  #endif
 +                      "+r"(__sp)
                        :
                        [entry]"r"(entry),
                        [ss]"i"(__KERNEL_DS),
@@@ -10770,13 -10773,26 +10779,26 @@@ static int vmx_update_pi_irte(struct kv
                 */
  
                kvm_set_msi_irq(e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+                       /*
+                        * Make sure the IRTE is in remapped mode if
+                        * we don't handle it in posted mode.
+                        */
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+                       if (ret < 0) {
+                               printk(KERN_INFO
+                                  "failed to fall back to remapped mode, irq: %u\n",

+                                  host_irq);
+                               goto out;
+                       }
                        continue;
+               }
  
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
  
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
  
                if (set)
diff --combined arch/x86/kvm/x86.c
index 4838d35c9641d6cee63da0e930224d7b1d446e8c,60d6c0036a98287eb0b2ac56106a4a027c70363d..82445a8bdf09b067d2d8bdb1717a02f6a3fc1d27
@@@ -123,6 -123,9 +123,9 @@@ module_param(tsc_tolerance_ppm, uint, S
  unsigned int __read_mostly lapic_timer_advance_ns = 0;
  module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
  
+ static bool __read_mostly vector_hashing = true;
+ module_param(vector_hashing, bool, S_IRUGO);
  static bool __read_mostly backwards_tsc_observed = false;
  
  #define KVM_NR_SHARED_MSRS 16
@@@ -1196,17 -1199,11 +1199,11 @@@ static void kvm_write_wall_clock(struc
  
  static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
  {
-       uint32_t quotient, remainder;
-       /* Don't try to replace with do_div(), this one calculates
-        * "(dividend << 32) / divisor" */
-       __asm__ ( "divl %4"
-                 : "=a" (quotient), "=d" (remainder)
-                 : "0" (0), "1" (dividend), "r" (divisor) );
-       return quotient;
+       do_shl32_div32(dividend, divisor);
+       return dividend;
  }
  
- static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
                               s8 *pshift, u32 *pmultiplier)
  {
        uint64_t scaled64;
        uint64_t tps64;
        uint32_t tps32;
  
-       tps64 = base_khz * 1000LL;
-       scaled64 = scaled_khz * 1000LL;
+       tps64 = base_hz;
+       scaled64 = scaled_hz;
        while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
                tps64 >>= 1;
                shift--;
        *pshift = shift;
        *pmultiplier = div_frac(scaled64, tps32);
  
-       pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
-                __func__, base_khz, scaled_khz, shift, *pmultiplier);
+       pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+                __func__, base_hz, scaled_hz, shift, *pmultiplier);
  }
  
  #ifdef CONFIG_X86_64
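
The deleted comment in div_frac() above spells out what the helper computes: "(dividend << 32) / divisor", i.e. the ratio as a 0.32 fixed-point fraction, now obtained through do_shl32_div32() instead of an inline divl. Purely to illustrate the arithmetic (and only for dividend < divisor, where the quotient fits in 32 bits), the same value can be computed with ordinary 64-bit math:

#include <assert.h>
#include <stdint.h>

/* (dividend << 32) / divisor, truncated to 32 bits: the fraction
 * dividend/divisor expressed in 0.32 fixed point. */
static uint32_t div_frac_sketch(uint32_t dividend, uint32_t divisor)
{
        return (uint32_t)(((uint64_t)dividend << 32) / divisor);
}

int main(void)
{
        assert(div_frac_sketch(1, 2) == 0x80000000u);   /* 1/2 */
        assert(div_frac_sketch(3, 4) == 0xC0000000u);   /* 3/4 */
        return 0;
}
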
@@@ -1293,23 -1290,23 +1290,23 @@@ static int set_tsc_khz(struct kvm_vcpu 
        return 0;
  }
  
- static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
  {
        u32 thresh_lo, thresh_hi;
        int use_scaling = 0;
  
        /* tsc_khz can be zero if TSC calibration fails */
-       if (this_tsc_khz == 0) {
+       if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
                vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
                return -1;
        }
  
        /* Compute a scale to convert nanoseconds in TSC cycles */
-       kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+       kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
                           &vcpu->arch.virtual_tsc_shift,
                           &vcpu->arch.virtual_tsc_mult);
-       vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+       vcpu->arch.virtual_tsc_khz = user_tsc_khz;
  
        /*
         * Compute the variation in TSC rate which is acceptable
         */
        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
-       if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
-               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+       if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
                use_scaling = 1;
        }
-       return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+       return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
  }
  
  static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@@ -1562,7 -1559,7 +1559,7 @@@ static cycle_t read_tsc(void
  
        /*
         * GCC likes to generate cmov here, but this branch is extremely
 -       * predictable (it's just a funciton of time and the likely is
 +       * predictable (it's just a function of time and the likely is
         * very likely) and there's a data dependence, so force GCC
         * to generate a branch instead.  I don't barrier() because
         * we don't actually need a barrier, and if this function
@@@ -1716,7 -1713,7 +1713,7 @@@ static void kvm_gen_update_masterclock(
  
  static int kvm_guest_time_update(struct kvm_vcpu *v)
  {
-       unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+       unsigned long flags, tgt_tsc_khz;
        struct kvm_vcpu_arch *vcpu = &v->arch;
        struct kvm_arch *ka = &v->kvm->arch;
        s64 kernel_ns;
  
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-       if (unlikely(this_tsc_khz == 0)) {
+       tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+       if (unlikely(tgt_tsc_khz == 0)) {
                local_irq_restore(flags);
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
                return 1;
        if (!vcpu->pv_time_enabled)
                return 0;
  
-       if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
-               tgt_tsc_khz = kvm_has_tsc_control ?
-                       vcpu->virtual_tsc_khz : this_tsc_khz;
-               kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+       if (kvm_has_tsc_control)
+               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+       if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+               kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
                                   &vcpu->hv_clock.tsc_shift,
                                   &vcpu->hv_clock.tsc_to_system_mul);
-               vcpu->hw_tsc_khz = this_tsc_khz;
+               vcpu->hw_tsc_khz = tgt_tsc_khz;
        }
  
        /* With all the info we got, fill in the values */
@@@ -2752,6 -2750,7 +2750,7 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
        }
  
        kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+       vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
  }
  
  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@@ -2987,7 -2986,7 +2986,7 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
  
        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
-           kvm_vcpu_has_lapic(vcpu))
+           lapic_in_kernel(vcpu))
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
  
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
                        vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
                else
                        vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
-               if (kvm_vcpu_has_lapic(vcpu)) {
+               if (lapic_in_kernel(vcpu)) {
                        if (events->smi.latched_init)
                                set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
                        else
@@@ -3240,7 -3239,7 +3239,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
        switch (ioctl) {
        case KVM_GET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
  
        }
        case KVM_SET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = memdup_user(argp, sizeof(*u.lapic));
                if (IS_ERR(u.lapic))
@@@ -3605,20 -3604,26 +3604,26 @@@ static int kvm_vm_ioctl_set_irqchip(str
  
  static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  {
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+       BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+       mutex_lock(&kps->lock);
+       memcpy(ps, &kps->channels, sizeof(*ps));
+       mutex_unlock(&kps->lock);
        return 0;
  }
  
  static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  {
        int i;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+       struct kvm_pit *pit = kvm->arch.vpit;
+       mutex_lock(&pit->pit_state.lock);
+       memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+               kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
@@@ -3638,29 -3643,39 +3643,39 @@@ static int kvm_vm_ioctl_set_pit2(struc
        int start = 0;
        int i;
        u32 prev_legacy, cur_legacy;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       struct kvm_pit *pit = kvm->arch.vpit;
+       mutex_lock(&pit->pit_state.lock);
+       prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
        cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
        if (!prev_legacy && cur_legacy)
                start = 1;
-       memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
-              sizeof(kvm->arch.vpit->pit_state.channels));
-       kvm->arch.vpit->pit_state.flags = ps->flags;
+       memcpy(&pit->pit_state.channels, &ps->channels,
+              sizeof(pit->pit_state.channels));
+       pit->pit_state.flags = ps->flags;
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+               kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
                                   start && i == 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
  static int kvm_vm_ioctl_reinject(struct kvm *kvm,
                                 struct kvm_reinject_control *control)
  {
-       if (!kvm->arch.vpit)
+       struct kvm_pit *pit = kvm->arch.vpit;
+       if (!pit)
                return -ENXIO;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       /* pit->pit_state.lock was overloaded to prevent userspace from getting
+        * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+        * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
+        */
+       mutex_lock(&pit->pit_state.lock);
+       kvm_pit_set_reinject(pit, control->pit_reinject);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
@@@ -4093,7 -4108,7 +4108,7 @@@ static int vcpu_mmio_write(struct kvm_v
  
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
                    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
@@@ -4113,7 -4128,7 +4128,7 @@@ static int vcpu_mmio_read(struct kvm_vc
  
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
                                         addr, n, v))
                    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@@ -4346,7 -4361,7 +4361,7 @@@ int emulator_write_phys(struct kvm_vcp
        ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
        if (ret < 0)
                return 0;
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+       kvm_page_track_write(vcpu, gpa, val, bytes);
        return 1;
  }
  
@@@ -4604,7 -4619,7 +4619,7 @@@ static int emulator_cmpxchg_emulated(st
                return X86EMUL_CMPXCHG_FAILED;
  
        kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
-       kvm_mmu_pte_write(vcpu, gpa, new, bytes);
+       kvm_page_track_write(vcpu, gpa, new, bytes);
  
        return X86EMUL_CONTINUE;
  
@@@ -6010,7 -6025,7 +6025,7 @@@ static void update_cr8_intercept(struc
        if (!kvm_x86_ops->update_cr8_intercept)
                return;
  
-       if (!vcpu->arch.apic)
+       if (!lapic_in_kernel(vcpu))
                return;
  
        if (vcpu->arch.apicv_active)
@@@ -6618,12 -6633,12 +6633,12 @@@ static int vcpu_enter_guest(struct kvm_
         * KVM_DEBUGREG_WONT_EXIT again.
         */
        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
 -              int i;
 -
                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
                kvm_x86_ops->sync_dirty_debug_regs(vcpu);
 -              for (i = 0; i < KVM_NR_DB_REGS; i++)
 -                      vcpu->arch.eff_db[i] = vcpu->arch.db[i];
 +              kvm_update_dr0123(vcpu);
 +              kvm_update_dr6(vcpu);
 +              kvm_update_dr7(vcpu);
 +              vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
  
        /*
@@@ -7038,7 -7053,7 +7053,7 @@@ int kvm_arch_vcpu_ioctl_get_mpstate(str
  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
  {
-       if (!kvm_vcpu_has_lapic(vcpu) &&
+       if (!lapic_in_kernel(vcpu) &&
            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                return -EINVAL;
  
@@@ -7593,6 -7608,7 +7608,7 @@@ bool kvm_vcpu_compatible(struct kvm_vcp
  }
  
  struct static_key kvm_no_apic_vcpu __read_mostly;
+ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
  
  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
  {
@@@ -7724,6 -7740,9 +7740,9 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
  
+       kvm_page_track_init(kvm);
+       kvm_mmu_init_vm(kvm);
        return 0;
  }
  
@@@ -7850,6 -7869,7 +7869,7 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
        kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kvm_mmu_uninit_vm(kvm);
  }
  
  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                        free->arch.lpage_info[i - 1] = NULL;
                }
        }
+       kvm_page_track_free_memslot(free, dont);
  }
  
  int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
        int i;
  
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                int lpages;
                int level = i + 1;
                if (i == 0)
                        continue;
  
-               slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
-                                       sizeof(*slot->arch.lpage_info[i - 1]));
-               if (!slot->arch.lpage_info[i - 1])
+               linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+               if (!linfo)
                        goto out_free;
  
+               slot->arch.lpage_info[i - 1] = linfo;
                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][0].write_count = 1;
+                       linfo[0].disallow_lpage = 1;
                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+                       linfo[lpages - 1].disallow_lpage = 1;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
                        unsigned long j;
  
                        for (j = 0; j < lpages; ++j)
-                               slot->arch.lpage_info[i - 1][j].write_count = 1;
+                               linfo[j].disallow_lpage = 1;
                }
        }
  
+       if (kvm_page_track_create_memslot(slot, npages))
+               goto out_free;
        return 0;
  
  out_free:
@@@ -8370,6 -8397,12 +8397,12 @@@ int kvm_arch_update_irqfd_routing(struc
        return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
  }
  
+ bool kvm_vector_hashing_enabled(void)
+ {
+       return vector_hashing;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --combined virt/kvm/async_pf.c
index bd3e7d8496e841b54cc0fe79c92a7dbf7948788c,c7e447c4296e3d5d33ee0bf00ade7a8739e7919d..db9668869f6ff6866a72f278def5770d71190994
@@@ -79,13 -79,7 +79,13 @@@ static void async_pf_execute(struct wor
  
        might_sleep();
  
 -      get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL);
 +      /*
 +       * This work is run asynchronously to the task which owns
 +       * mm and might be done in another context, so we must
 +       * use FOLL_REMOTE.
 +       */
 +      __get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL, FOLL_REMOTE);
 +
        kvm_async_page_present_sync(vcpu, apf);
  
        spin_lock(&vcpu->async_pf.lock);
         * This memory barrier pairs with prepare_to_wait's set_current_state()
         */
        smp_mb();
 -      if (waitqueue_active(&vcpu->wq))
 -              wake_up_interruptible(&vcpu->wq);
 +      if (swait_active(&vcpu->wq))
 +              swake_up(&vcpu->wq);
  
        mmput(mm);
        kvm_put_kvm(vcpu->kvm);
@@@ -115,8 -109,8 +115,8 @@@ void kvm_clear_async_pf_completion_queu
        /* cancel outstanding work queue item */
        while (!list_empty(&vcpu->async_pf.queue)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.queue.next,
-                                  typeof(*work), queue);
+                       list_first_entry(&vcpu->async_pf.queue,
+                                        typeof(*work), queue);
                list_del(&work->queue);
  
  #ifdef CONFIG_KVM_ASYNC_PF_SYNC
        spin_lock(&vcpu->async_pf.lock);
        while (!list_empty(&vcpu->async_pf.done)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.done.next,
-                                  typeof(*work), link);
+                       list_first_entry(&vcpu->async_pf.done,
+                                        typeof(*work), link);
                list_del(&work->link);
                kmem_cache_free(async_pf_cache, work);
        }
@@@ -178,7 -172,7 +178,7 @@@ int kvm_setup_async_pf(struct kvm_vcpu 
         * do alloc nowait since if we are going to sleep anyway we
         * may as well sleep faulting in page
         */
 -      work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
 +      work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN);
        if (!work)
                return 0;
  
diff --combined virt/kvm/kvm_main.c
index 1ca025816a8b1a23b54d476418b65c4d41717386,1eae05236347f1d1c4c6cc9912a524a845c53218..a6b987886b6c1001e68fbe09dc770778abfe5244
@@@ -72,11 -72,11 +72,11 @@@ module_param(halt_poll_ns, uint, S_IRUG
  
  /* Default doubles per-vcpu halt_poll_ns. */
  static unsigned int halt_poll_ns_grow = 2;
- module_param(halt_poll_ns_grow, int, S_IRUGO);
+ module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR);
  
  /* Default resets per-vcpu halt_poll_ns . */
  static unsigned int halt_poll_ns_shrink;
- module_param(halt_poll_ns_shrink, int, S_IRUGO);
+ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
  
  /*
   * Ordering of locks:
@@@ -216,7 -216,8 +216,7 @@@ int kvm_vcpu_init(struct kvm_vcpu *vcpu
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
 -      vcpu->halt_poll_ns = 0;
 -      init_waitqueue_head(&vcpu->wq);
 +      init_swait_queue_head(&vcpu->wq);
        kvm_async_pf_vcpu_init(vcpu);
  
        vcpu->pre_pcpu = -1;
@@@ -619,13 -620,10 +619,10 @@@ void *kvm_kvzalloc(unsigned long size
  
  static void kvm_destroy_devices(struct kvm *kvm)
  {
-       struct list_head *node, *tmp;
+       struct kvm_device *dev, *tmp;
  
-       list_for_each_safe(node, tmp, &kvm->devices) {
-               struct kvm_device *dev =
-                       list_entry(node, struct kvm_device, vm_node);
-               list_del(node);
+       list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
+               list_del(&dev->vm_node);
                dev->ops->destroy(dev);
        }
  }
@@@ -1263,16 -1261,15 +1260,16 @@@ unsigned long kvm_vcpu_gfn_to_hva_prot(
        return gfn_to_hva_memslot_prot(slot, gfn, writable);
  }
  
 -static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 -      unsigned long start, int write, struct page **page)
 +static int get_user_page_nowait(unsigned long start, int write,
 +              struct page **page)
  {
        int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
  
        if (write)
                flags |= FOLL_WRITE;
  
 -      return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 +      return __get_user_pages(current, current->mm, start, 1, flags, page,
 +                      NULL, NULL);
  }
  
  static inline int check_user_page_hwpoison(unsigned long addr)
@@@ -1334,7 -1331,8 +1331,7 @@@ static int hva_to_pfn_slow(unsigned lon
  
        if (async) {
                down_read(&current->mm->mmap_sem);
 -              npages = get_user_page_nowait(current, current->mm,
 -                                            addr, write_fault, page);
 +              npages = get_user_page_nowait(addr, write_fault, page);
                up_read(&current->mm->mmap_sem);
        } else
                npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
@@@ -1436,11 -1434,17 +1433,17 @@@ kvm_pfn_t __gfn_to_pfn_memslot(struct k
  {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
  
-       if (addr == KVM_HVA_ERR_RO_BAD)
+       if (addr == KVM_HVA_ERR_RO_BAD) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_ERR_RO_FAULT;
+       }
  
-       if (kvm_is_error_hva(addr))
+       if (kvm_is_error_hva(addr)) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_NOSLOT;
+       }
  
        /* Do not map writable pfn in the readonly memslot. */
        if (writable && memslot_is_readonly(slot)) {
@@@ -1942,14 -1946,15 +1945,15 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_di
  
  static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
  {
-       int old, val;
+       unsigned int old, val, grow;
  
        old = val = vcpu->halt_poll_ns;
+       grow = READ_ONCE(halt_poll_ns_grow);
        /* 10us base */
-       if (val == 0 && halt_poll_ns_grow)
+       if (val == 0 && grow)
                val = 10000;
        else
-               val *= halt_poll_ns_grow;
+               val *= grow;
  
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
  
  static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
  {
-       int old, val;
+       unsigned int old, val, shrink;
  
        old = val = vcpu->halt_poll_ns;
-       if (halt_poll_ns_shrink == 0)
+       shrink = READ_ONCE(halt_poll_ns_shrink);
+       if (shrink == 0)
                val = 0;
        else
-               val /= halt_poll_ns_shrink;
+               val /= shrink;
  
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
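
Because the module_param() hunk near the top of this file makes halt_poll_ns_grow and halt_poll_ns_shrink writable at runtime (S_IWUSR), grow_halt_poll_ns() and shrink_halt_poll_ns() above now take a single READ_ONCE() snapshot of the tunable so a concurrent sysfs write cannot change the value mid-calculation. A stand-alone sketch of the grow policy with that one-shot snapshot, where a relaxed atomic load stands in for READ_ONCE():

#include <assert.h>

/* Runtime-writable tunable; another thread may store to it at any time. */
static unsigned int halt_poll_ns_grow_sketch = 2;

static unsigned int grow_halt_poll_sketch(unsigned int val)
{
        unsigned int grow = __atomic_load_n(&halt_poll_ns_grow_sketch,
                                            __ATOMIC_RELAXED);

        if (val == 0 && grow)
                return 10000;           /* 10us base, as in the hunk above */
        return val * grow;
}

int main(void)
{
        assert(grow_halt_poll_sketch(0) == 10000);
        assert(grow_halt_poll_sketch(10000) == 20000);
        return 0;
}
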
@@@ -1989,7 -1995,7 +1994,7 @@@ static int kvm_vcpu_check_block(struct 
  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
  {
        ktime_t start, cur;
 -      DEFINE_WAIT(wait);
 +      DECLARE_SWAITQUEUE(wait);
        bool waited = false;
        u64 block_ns;
  
        kvm_arch_vcpu_blocking(vcpu);
  
        for (;;) {
 -              prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 +              prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
  
                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;
                schedule();
        }
  
 -      finish_wait(&vcpu->wq, &wait);
 +      finish_swait(&vcpu->wq, &wait);
        cur = ktime_get();
  
        kvm_arch_vcpu_unblocking(vcpu);
@@@ -2055,11 -2061,11 +2060,11 @@@ void kvm_vcpu_kick(struct kvm_vcpu *vcp
  {
        int me;
        int cpu = vcpu->cpu;
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
  
        wqp = kvm_arch_vcpu_wq(vcpu);
 -      if (waitqueue_active(wqp)) {
 -              wake_up_interruptible(wqp);
 +      if (swait_active(wqp)) {
 +              swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
  
@@@ -2160,7 -2166,7 +2165,7 @@@ void kvm_vcpu_on_spin(struct kvm_vcpu *
                                continue;
                        if (vcpu == me)
                                continue;
 -                      if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
 +                      if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;