git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge remote-tracking branch 'kvm/linux-next'
author Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 7 Mar 2016 04:54:35 +0000 (15:54 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 7 Mar 2016 04:54:35 +0000 (15:54 +1100)
17 files changed:
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/smp.h
arch/powerpc/include/asm/xics.h
arch/powerpc/kernel/smp.c
arch/powerpc/kvm/book3s_64_vio.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/interrupt.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
virt/kvm/async_pf.c
virt/kvm/kvm_main.c

index c98afa538b3aeca91901e858c02884e820ca3dfa,2e7c79101652ef863ee049da91b6047a193a440b..d7b343170453df82b4f31429020c8680e1f949bf
@@@ -182,7 -182,10 +182,10 @@@ struct kvmppc_spapr_tce_table 
        struct list_head list;
        struct kvm *kvm;
        u64 liobn;
-       u32 window_size;
+       struct rcu_head rcu;
+       u32 page_shift;
+       u64 offset;             /* in pages */
+       u64 size;               /* window size in pages */
        struct page *pages[0];
  };
  
@@@ -289,7 -292,7 +292,7 @@@ struct kvmppc_vcore 
        struct list_head runnable_threads;
        struct list_head preempt_list;
        spinlock_t lock;
 -      wait_queue_head_t wq;
 +      struct swait_queue_head wq;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
        u64 stolen_tb;
        u64 preempt_tb;
@@@ -629,7 -632,7 +632,7 @@@ struct kvm_vcpu_arch 
        u8 prodded;
        u32 last_inst;
  
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
        struct kvmppc_vcore *vcore;
        int ret;
        int trap;
index 174271ef2767c0c748daae4db37eb464dec874e2,78083ed20792f63844ad82941168c1f9d005f1b5..e1afd4c4f695f37037dfe3eac8c3730a87dfbc9d
@@@ -67,9 -67,6 +67,9 @@@ void generic_cpu_die(unsigned int cpu)
  void generic_set_cpu_dead(unsigned int cpu);
  void generic_set_cpu_up(unsigned int cpu);
  int generic_check_cpu_restart(unsigned int cpu);
 +int is_cpu_dead(unsigned int cpu);
 +#else
 +#define generic_set_cpu_up(i) do { } while (0)
  #endif
  
  #ifdef CONFIG_PPC64
@@@ -117,6 -114,9 +117,9 @@@ extern int cpu_to_core_id(int cpu)
  #define PPC_MSG_TICK_BROADCAST        2
  #define PPC_MSG_DEBUGGER_BREAK  3
  
+ /* This is only used by the powernv kernel */
+ #define PPC_MSG_RM_HOST_ACTION        4
  /* for irq controllers that have dedicated ipis per message (4) */
  extern int smp_request_message_ipi(int virq, int message);
  extern const char *smp_ipi_name[];
  /* for irq controllers with only a single ipi */
  extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
  extern void smp_muxed_ipi_message_pass(int cpu, int msg);
+ extern void smp_muxed_ipi_set_message(int cpu, int msg);
  extern irqreturn_t smp_ipi_demux(void);
  
  void smp_init_pSeries(void);
@@@ -200,7 -201,6 +204,7 @@@ extern void generic_secondary_thread_in
  extern unsigned long __secondary_hold_spinloop;
  extern unsigned long __secondary_hold_acknowledge;
  extern char __secondary_hold;
 +extern unsigned int booting_thread_hwid;
  
  extern void __early_start(void);
  #endif /* __ASSEMBLY__ */
index 5d61bbced6a11d67e387fc30b76e47d8f0bc5dfd,254604856e69e1cb1ced217b4191b21b664834ba..04ef3ae511da85104ba01570deb9050c5f2f4dd0
@@@ -1,5 -1,5 +1,5 @@@
  /*
 - * Common definitions accross all variants of ICP and ICS interrupt
 + * Common definitions across all variants of ICP and ICS interrupt
   * controllers.
   */
  
@@@ -30,6 -30,7 +30,7 @@@
  #ifdef CONFIG_PPC_ICP_NATIVE
  extern int icp_native_init(void);
  extern void icp_native_flush_interrupt(void);
+ extern void icp_native_cause_ipi_rm(int cpu);
  #else
  static inline int icp_native_init(void) { return -ENODEV; }
  #endif
index a3cc75baddccb0ecf70c496b6231459ca0e5980b,cb8be5dc118a72876dc0e93c5bb510bf4e307a49..8cac1eb414661ad6e3340a8469361fe9a885d117
@@@ -206,7 -206,7 +206,7 @@@ int smp_request_message_ipi(int virq, i
  
  #ifdef CONFIG_PPC_SMP_MUXED_IPI
  struct cpu_messages {
-       int messages;                   /* current messages */
+       long messages;                  /* current messages */
        unsigned long data;             /* data for cause ipi */
  };
  static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
@@@ -218,7 -218,7 +218,7 @@@ void smp_muxed_ipi_set_data(int cpu, un
        info->data = data;
  }
  
- void smp_muxed_ipi_message_pass(int cpu, int msg)
+ void smp_muxed_ipi_set_message(int cpu, int msg)
  {
        struct cpu_messages *info = &per_cpu(ipi_message, cpu);
        char *message = (char *)&info->messages;
         */
        smp_mb();
        message[msg] = 1;
+ }
+ void smp_muxed_ipi_message_pass(int cpu, int msg)
+ {
+       struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+       smp_muxed_ipi_set_message(cpu, msg);
        /*
         * cause_ipi functions are required to include a full barrier
         * before doing whatever causes the IPI.
  }
  
  #ifdef __BIG_ENDIAN__
- #define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
+ #define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
  #else
- #define IPI_MESSAGE(A) (1 << (8 * (A)))
+ #define IPI_MESSAGE(A) (1uL << (8 * (A)))
  #endif
  
  irqreturn_t smp_ipi_demux(void)
  {
        struct cpu_messages *info = this_cpu_ptr(&ipi_message);
-       unsigned int all;
+       unsigned long all;
  
        mb();   /* order any irq clear */
  
        do {
                all = xchg(&info->messages, 0);
+ #if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+               /*
+                * Must check for PPC_MSG_RM_HOST_ACTION messages
+                * before PPC_MSG_CALL_FUNCTION messages because when
+                * a VM is destroyed, we call kick_all_cpus_sync()
+                * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+                * messages have completed before we free any VCPUs.
+                */
+               if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+                       kvmppc_xics_ipi_action();
+ #endif
                if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
                        generic_smp_call_function_interrupt();
                if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
@@@ -427,7 -445,7 +445,7 @@@ void generic_cpu_die(unsigned int cpu
  
        for (i = 0; i < 100; i++) {
                smp_rmb();
 -              if (per_cpu(cpu_state, cpu) == CPU_DEAD)
 +              if (is_cpu_dead(cpu))
                        return;
                msleep(100);
        }
@@@ -454,11 -472,6 +472,11 @@@ int generic_check_cpu_restart(unsigned 
        return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE;
  }
  
 +int is_cpu_dead(unsigned int cpu)
 +{
 +      return per_cpu(cpu_state, cpu) == CPU_DEAD;
 +}
 +
  static bool secondaries_inhibited(void)
  {
        return kvm_hv_mode_active();
@@@ -732,7 -745,7 +750,7 @@@ void start_secondary(void *unused
  
        local_irq_enable();
  
 -      cpu_startup_entry(CPUHP_ONLINE);
 +      cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
  
        BUG();
  }
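
The smp.c hunks above widen the per-CPU message word from int to long and rebuild IPI_MESSAGE() so that PPC_MSG_RM_HOST_ACTION (message number 4) gets its own byte alongside the original four messages. A minimal stand-alone sketch of that byte-per-message layout and of the demux order (host-action checked before call-function, as the new comment in smp_ipi_demux() requires); the message numbers come from the asm/smp.h hunk above, the rest is illustrative:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Message numbers as in arch/powerpc/include/asm/smp.h. */
enum { PPC_MSG_CALL_FUNCTION, PPC_MSG_RESCHEDULE, PPC_MSG_TICK_BROADCAST,
       PPC_MSG_DEBUGGER_BREAK, PPC_MSG_RM_HOST_ACTION };

/* Each message owns one byte of the long word that smp_muxed_ipi_set_message()
 * pokes with "message[msg] = 1"; the shift selects that byte. */
#ifdef __BIG_ENDIAN__
#define IPI_MESSAGE(A) (1UL << ((BITS_PER_LONG - 8) - 8 * (A)))
#else
#define IPI_MESSAGE(A) (1UL << (8 * (A)))
#endif

int main(void)
{
	unsigned long pending = IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION) |
				IPI_MESSAGE(PPC_MSG_CALL_FUNCTION);

	/* Demux in the same order as the patched smp_ipi_demux(). */
	if (pending & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
		printf("real-mode host action first\n");
	if (pending & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
		printf("then the call-function IPI\n");
	return 0;
}
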
index 9c3b76bb69d93ad647c693fa94703237603446ef,2c2d1030843acf5736e4ac0dd08c87b3522f2b17..82970042295eb6ca30b78c57f2b1ba650cebb6f1
@@@ -14,6 -14,7 +14,7 @@@
   *
   * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+  * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
   */
  
  #include <linux/types.h>
  #include <asm/tlbflush.h>
  #include <asm/kvm_ppc.h>
  #include <asm/kvm_book3s.h>
 -#include <asm/mmu-hash64.h>
 +#include <asm/book3s/64/mmu-hash.h>
  #include <asm/hvcall.h>
  #include <asm/synch.h>
  #include <asm/ppc-opcode.h>
  #include <asm/kvm_host.h>
  #include <asm/udbg.h>
+ #include <asm/iommu.h>
+ #include <asm/tce.h>
  
- #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
+ static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
+ {
+       return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+ }
  
- static long kvmppc_stt_npages(unsigned long window_size)
+ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
  {
-       return ALIGN((window_size >> SPAPR_TCE_SHIFT)
-                    * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+       unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) +
+                       (tce_pages * sizeof(struct page *));
+       return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
  }
  
- static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
  {
-       struct kvm *kvm = stt->kvm;
-       int i;
+       long ret = 0;
  
-       mutex_lock(&kvm->lock);
-       list_del(&stt->list);
-       for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+       if (!current || !current->mm)
+               return ret; /* process exited */
+       down_write(&current->mm->mmap_sem);
+       if (inc) {
+               unsigned long locked, lock_limit;
+               locked = current->mm->locked_vm + stt_pages;
+               lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+               if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+                       ret = -ENOMEM;
+               else
+                       current->mm->locked_vm += stt_pages;
+       } else {
+               if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
+                       stt_pages = current->mm->locked_vm;
+               current->mm->locked_vm -= stt_pages;
+       }
+       pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
+                       inc ? '+' : '-',
+                       stt_pages << PAGE_SHIFT,
+                       current->mm->locked_vm << PAGE_SHIFT,
+                       rlimit(RLIMIT_MEMLOCK),
+                       ret ? " - exceeded" : "");
+       up_write(&current->mm->mmap_sem);
+       return ret;
+ }
+ static void release_spapr_tce_table(struct rcu_head *head)
+ {
+       struct kvmppc_spapr_tce_table *stt = container_of(head,
+                       struct kvmppc_spapr_tce_table, rcu);
+       unsigned long i, npages = kvmppc_tce_pages(stt->size);
+       for (i = 0; i < npages; i++)
                __free_page(stt->pages[i]);
-       kfree(stt);
-       mutex_unlock(&kvm->lock);
  
-       kvm_put_kvm(kvm);
+       kfree(stt);
  }
  
  static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
        struct page *page;
  
-       if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+       if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
                return VM_FAULT_SIGBUS;
  
        page = stt->pages[vmf->pgoff];
@@@ -88,7 -130,14 +130,14 @@@ static int kvm_spapr_tce_release(struc
  {
        struct kvmppc_spapr_tce_table *stt = filp->private_data;
  
-       release_spapr_tce_table(stt);
+       list_del_rcu(&stt->list);
+       kvm_put_kvm(stt->kvm);
+       kvmppc_account_memlimit(
+               kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
+       call_rcu(&stt->rcu, release_spapr_tce_table);
        return 0;
  }
  
@@@ -98,20 -147,29 +147,29 @@@ static const struct file_operations kvm
  };
  
  long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-                                  struct kvm_create_spapr_tce *args)
+                                  struct kvm_create_spapr_tce_64 *args)
  {
        struct kvmppc_spapr_tce_table *stt = NULL;
-       long npages;
+       unsigned long npages, size;
        int ret = -ENOMEM;
        int i;
  
+       if (!args->size)
+               return -EINVAL;
        /* Check this LIOBN hasn't been previously allocated */
        list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
                if (stt->liobn == args->liobn)
                        return -EBUSY;
        }
  
-       npages = kvmppc_stt_npages(args->window_size);
+       size = args->size;
+       npages = kvmppc_tce_pages(size);
+       ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+       if (ret) {
+               stt = NULL;
+               goto fail;
+       }
  
        stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
                      GFP_KERNEL);
                goto fail;
  
        stt->liobn = args->liobn;
-       stt->window_size = args->window_size;
+       stt->page_shift = args->page_shift;
+       stt->offset = args->offset;
+       stt->size = size;
        stt->kvm = kvm;
  
        for (i = 0; i < npages; i++) {
        kvm_get_kvm(kvm);
  
        mutex_lock(&kvm->lock);
-       list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+       list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
  
        mutex_unlock(&kvm->lock);
  
@@@ -148,3 -208,59 +208,59 @@@ fail
        }
        return ret;
  }
+ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_list, unsigned long npages)
+ {
+       struct kvmppc_spapr_tce_table *stt;
+       long i, ret = H_SUCCESS, idx;
+       unsigned long entry, ua = 0;
+       u64 __user *tces, tce;
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+       entry = ioba >> stt->page_shift;
+       /*
+        * SPAPR spec says that the maximum size of the list is 512 TCEs
+        * so the whole table fits in 4K page
+        */
+       if (npages > 512)
+               return H_PARAMETER;
+       if (tce_list & (SZ_4K - 1))
+               return H_PARAMETER;
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+       if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+               ret = H_TOO_HARD;
+               goto unlock_exit;
+       }
+       tces = (u64 __user *) ua;
+       for (i = 0; i < npages; ++i) {
+               if (get_user(tce, tces + i)) {
+                       ret = H_TOO_HARD;
+                       goto unlock_exit;
+               }
+               tce = be64_to_cpu(tce);
+               ret = kvmppc_tce_validate(stt, tce);
+               if (ret != H_SUCCESS)
+                       goto unlock_exit;
+               kvmppc_tce_put(stt, entry + i, tce);
+       }
+ unlock_exit:
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);
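
The accounting introduced above charges a guest TCE table against the owner's RLIMIT_MEMLOCK: kvmppc_tce_pages() counts the pages needed for the u64 entries and kvmppc_stt_pages() adds the descriptor with its page-pointer array. A small stand-alone check of that arithmetic; the 64 KiB page size, the 128-byte descriptor header and the example window are assumptions for illustration, not values from the patch:

#include <stdio.h>

#define EX_PAGE_SIZE	65536UL				/* assumed 64 KiB host pages */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* Pages holding the TCE entries themselves (one u64 per IOMMU page). */
static unsigned long tce_pages(unsigned long iommu_pages)
{
	return ALIGN_UP(iommu_pages * sizeof(unsigned long long), EX_PAGE_SIZE)
			/ EX_PAGE_SIZE;
}

/* Pages charged to locked_vm: the TCE pages plus the table descriptor and
 * its struct page pointer array (the header size is a stand-in for
 * sizeof(struct kvmppc_spapr_tce_table)). */
static unsigned long stt_pages(unsigned long tces)
{
	unsigned long desc_bytes = 128 + tces * sizeof(void *);

	return tces + ALIGN_UP(desc_bytes, EX_PAGE_SIZE) / EX_PAGE_SIZE;
}

int main(void)
{
	unsigned long window = 1UL << 21;	/* example: 2M IOMMU pages, 16 MiB of TCEs */
	unsigned long tces = tce_pages(window);

	printf("TCE pages: %lu, pages charged to RLIMIT_MEMLOCK: %lu\n",
	       tces, stt_pages(tces));
	return 0;
}
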
index 039028d3ccb5ab38a39ea52dfa1bad00a6052d74,44be73e6aa26b6563d1cc4cc91e1a332a84271c3..f88b859af53b5c85d71a35e3d11b4dace211674e
@@@ -14,6 -14,7 +14,7 @@@
   *
   * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+  * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
   */
  
  #include <linux/types.h>
  #include <asm/tlbflush.h>
  #include <asm/kvm_ppc.h>
  #include <asm/kvm_book3s.h>
 -#include <asm/mmu-hash64.h>
 +#include <asm/book3s/64/mmu-hash.h>
+ #include <asm/mmu_context.h>
  #include <asm/hvcall.h>
  #include <asm/synch.h>
  #include <asm/ppc-opcode.h>
  #include <asm/kvm_host.h>
  #include <asm/udbg.h>
+ #include <asm/iommu.h>
+ #include <asm/tce.h>
+ #include <asm/iommu.h>
  
  #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
  
- /* WARNING: This will be called in real-mode on HV KVM and virtual
+ /*
+  * Finds a TCE table descriptor by LIOBN.
+  *
+  * WARNING: This will be called in real or virtual mode on HV KVM and virtual
   *          mode on PR KVM
   */
- long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-                     unsigned long ioba, unsigned long tce)
+ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu,
+               unsigned long liobn)
  {
        struct kvm *kvm = vcpu->kvm;
        struct kvmppc_spapr_tce_table *stt;
  
+       list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list)
+               if (stt->liobn == liobn)
+                       return stt;
+       return NULL;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_find_table);
+ /*
+  * Validates IO address.
+  *
+  * WARNING: This will be called in real-mode on HV KVM and virtual
+  *          mode on PR KVM
+  */
+ long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+               unsigned long ioba, unsigned long npages)
+ {
+       unsigned long mask = (1ULL << stt->page_shift) - 1;
+       unsigned long idx = ioba >> stt->page_shift;
+       if ((ioba & mask) || (idx < stt->offset) ||
+                       (idx - stt->offset + npages > stt->size) ||
+                       (idx + npages < idx))
+               return H_PARAMETER;
+       return H_SUCCESS;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);
+ /*
+  * Validates TCE address.
+  * At the moment flags and page mask are validated.
+  * As the host kernel does not access those addresses (just puts them
+  * to the table and user space is supposed to process them), we can skip
+  * checking other things (such as TCE is a guest RAM address or the page
+  * was actually allocated).
+  *
+  * WARNING: This will be called in real-mode on HV KVM and virtual
+  *          mode on PR KVM
+  */
+ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+ {
+       unsigned long page_mask = ~((1ULL << stt->page_shift) - 1);
+       unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ);
+       if (tce & mask)
+               return H_PARAMETER;
+       return H_SUCCESS;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+ /* Note on the use of page_address() in real mode,
+  *
+  * It is safe to use page_address() in real mode on ppc64 because
+  * page_address() is always defined as lowmem_page_address()
+  * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic
+  * operation and does not access page struct.
+  *
+  * Theoretically page_address() could be defined different
+  * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
+  * would have to be enabled.
+  * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
+  * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
+  * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
+  * is not expected to be enabled on ppc32, page_address()
+  * is safe for ppc32 as well.
+  *
+  * WARNING: This will be called in real-mode on HV KVM and virtual
+  *          mode on PR KVM
+  */
+ static u64 *kvmppc_page_address(struct page *page)
+ {
+ #if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
+ #error TODO: fix to avoid page_address() here
+ #endif
+       return (u64 *) page_address(page);
+ }
+ /*
+  * Handles TCE requests for emulated devices.
+  * Puts guest TCE values to the table and expects user space to convert them.
+  * Called in both real and virtual modes.
+  * Cannot fail so kvmppc_tce_validate must be called before it.
+  *
+  * WARNING: This will be called in real-mode on HV KVM and virtual
+  *          mode on PR KVM
+  */
+ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+               unsigned long idx, unsigned long tce)
+ {
+       struct page *page;
+       u64 *tbl;
+       idx -= stt->offset;
+       page = stt->pages[idx / TCES_PER_PAGE];
+       tbl = kvmppc_page_address(page);
+       tbl[idx % TCES_PER_PAGE] = tce;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_tce_put);
+ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+               unsigned long *ua, unsigned long **prmap)
+ {
+       unsigned long gfn = gpa >> PAGE_SHIFT;
+       struct kvm_memory_slot *memslot;
+       memslot = search_memslots(kvm_memslots(kvm), gfn);
+       if (!memslot)
+               return -EINVAL;
+       *ua = __gfn_to_hva_memslot(memslot, gfn) |
+               (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+       if (prmap)
+               *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+ #endif
+       return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
+ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba, unsigned long tce)
+ {
+       struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+       long ret;
        /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
        /*          liobn, ioba, tce); */
  
-       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-               if (stt->liobn == liobn) {
-                       unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-                       struct page *page;
-                       u64 *tbl;
-                       /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
-                       /*          liobn, stt, stt->window_size); */
-                       if (ioba >= stt->window_size)
-                               return H_PARAMETER;
-                       page = stt->pages[idx / TCES_PER_PAGE];
-                       tbl = (u64 *)page_address(page);
-                       /* FIXME: Need to validate the TCE itself */
-                       /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
-                       tbl[idx % TCES_PER_PAGE] = tce;
-                       return H_SUCCESS;
-               }
-       }
+       if (!stt)
+               return H_TOO_HARD;
+       ret = kvmppc_ioba_validate(stt, ioba, 1);
+       if (ret != H_SUCCESS)
+               return ret;
  
-       /* Didn't find the liobn, punt it to userspace */
-       return H_TOO_HARD;
+       ret = kvmppc_tce_validate(stt, tce);
+       if (ret != H_SUCCESS)
+               return ret;
+       kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+       return H_SUCCESS;
  }
  EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
  
- long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-                     unsigned long ioba)
+ static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
+               unsigned long ua, unsigned long *phpa)
+ {
+       pte_t *ptep, pte;
+       unsigned shift = 0;
+       ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift);
+       if (!ptep || !pte_present(*ptep))
+               return -ENXIO;
+       pte = *ptep;
+       if (!shift)
+               shift = PAGE_SHIFT;
+       /* Avoid handling anything potentially complicated in realmode */
+       if (shift > PAGE_SHIFT)
+               return -EAGAIN;
+       if (!pte_young(pte))
+               return -EAGAIN;
+       *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) |
+                       (ua & ~PAGE_MASK);
+       return 0;
+ }
+ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_list, unsigned long npages)
  {
-       struct kvm *kvm = vcpu->kvm;
        struct kvmppc_spapr_tce_table *stt;
+       long i, ret = H_SUCCESS;
+       unsigned long tces, entry, ua = 0;
+       unsigned long *rmap = NULL;
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+       entry = ioba >> stt->page_shift;
+       /*
+        * The spec says that the maximum size of the list is 512 TCEs
+        * so the whole table addressed resides in 4K page
+        */
+       if (npages > 512)
+               return H_PARAMETER;
+       if (tce_list & (SZ_4K - 1))
+               return H_PARAMETER;
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
  
-       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-               if (stt->liobn == liobn) {
-                       unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-                       struct page *page;
-                       u64 *tbl;
+       if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+               return H_TOO_HARD;
  
-                       if (ioba >= stt->window_size)
-                               return H_PARAMETER;
+       rmap = (void *) vmalloc_to_phys(rmap);
  
-                       page = stt->pages[idx / TCES_PER_PAGE];
-                       tbl = (u64 *)page_address(page);
+       /*
+        * Synchronize with the MMU notifier callbacks in
+        * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
+        * While we have the rmap lock, code running on other CPUs
+        * cannot finish unmapping the host real page that backs
+        * this guest real page, so we are OK to access the host
+        * real page.
+        */
+       lock_rmap(rmap);
+       if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+               ret = H_TOO_HARD;
+               goto unlock_exit;
+       }
+       for (i = 0; i < npages; ++i) {
+               unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
+               ret = kvmppc_tce_validate(stt, tce);
+               if (ret != H_SUCCESS)
+                       goto unlock_exit;
  
-                       vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
-                       return H_SUCCESS;
-               }
+               kvmppc_tce_put(stt, entry + i, tce);
        }
  
-       /* Didn't find the liobn, punt it to userspace */
-       return H_TOO_HARD;
+ unlock_exit:
+       unlock_rmap(rmap);
+       return ret;
+ }
+ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_value, unsigned long npages)
+ {
+       struct kvmppc_spapr_tce_table *stt;
+       long i, ret;
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
+       /* Check permission bits only to allow userspace poison TCE for debug */
+       if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
+               return H_PARAMETER;
+       for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
+               kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+       return H_SUCCESS;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
+ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba)
+ {
+       struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+       long ret;
+       unsigned long idx;
+       struct page *page;
+       u64 *tbl;
+       if (!stt)
+               return H_TOO_HARD;
+       ret = kvmppc_ioba_validate(stt, ioba, 1);
+       if (ret != H_SUCCESS)
+               return ret;
+       idx = (ioba >> stt->page_shift) - stt->offset;
+       page = stt->pages[idx / TCES_PER_PAGE];
+       tbl = (u64 *)page_address(page);
+       vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+       return H_SUCCESS;
  }
  EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
+ #endif /* KVM_BOOK3S_HV_POSSIBLE */
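
kvmppc_ioba_validate() and kvmppc_tce_validate() above reduce every H_PUT_TCE-style check to plain mask arithmetic on the table's page_shift, offset and size. A stand-alone sketch of that arithmetic with an invented table; TCE_PCI_READ/TCE_PCI_WRITE are the two low permission bits as defined in asm/tce.h, everything else is illustrative:

#include <stdio.h>

#define TCE_PCI_WRITE	0x2UL
#define TCE_PCI_READ	0x1UL

struct ex_tce_table { unsigned int page_shift; unsigned long offset, size; };

/* Mirrors kvmppc_ioba_validate(): the ioba must be page aligned and the
 * whole [idx, idx + npages) range must stay inside the window. */
static int ioba_ok(const struct ex_tce_table *t, unsigned long ioba,
		   unsigned long npages)
{
	unsigned long mask = (1UL << t->page_shift) - 1;
	unsigned long idx = ioba >> t->page_shift;

	return !((ioba & mask) || idx < t->offset ||
		 idx - t->offset + npages > t->size || idx + npages < idx);
}

/* Mirrors kvmppc_tce_validate(): only a page-aligned address plus the
 * two permission bits may be set. */
static int tce_ok(const struct ex_tce_table *t, unsigned long tce)
{
	unsigned long page_mask = ~((1UL << t->page_shift) - 1);

	return !(tce & ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ));
}

int main(void)
{
	struct ex_tce_table t = { .page_shift = 16, .offset = 0, .size = 512 };

	printf("ioba 0x10000: %d\n", ioba_ok(&t, 0x10000, 1));	/* aligned, in window */
	printf("ioba 0x10001: %d\n", ioba_ok(&t, 0x10001, 1));	/* misaligned */
	printf("tce  0x20003: %d\n", tce_ok(&t, 0x20003));	/* aligned + RW bits */
	printf("tce  0x20004: %d\n", tce_ok(&t, 0x20004));	/* stray low bit */
	return 0;
}
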
index f1187bb6dd4d7f5960e57aea111bd1c12021408d,f47fffefadc1fb8f0a53d13bfdb9d7cbcf55d80d..84fb4fcfaa41b802a614515c67539b0d2d7ee3cf
@@@ -81,6 -81,17 +81,17 @@@ static int target_smt_mode
  module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
  MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
  
+ #ifdef CONFIG_KVM_XICS
+ static struct kernel_param_ops module_param_ops = {
+       .set = param_set_int,
+       .get = param_get_int,
+ };
+ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
+                                                       S_IRUGO | S_IWUSR);
+ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
+ #endif
  static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
  
@@@ -114,11 -125,11 +125,11 @@@ static bool kvmppc_ipi_thread(int cpu
  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
  {
        int cpu;
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
  
        wqp = kvm_arch_vcpu_wq(vcpu);
 -      if (waitqueue_active(wqp)) {
 -              wake_up_interruptible(wqp);
 +      if (swait_active(wqp)) {
 +              swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
  
@@@ -701,8 -712,8 +712,8 @@@ int kvmppc_pseries_do_hcall(struct kvm_
                tvcpu->arch.prodded = 1;
                smp_mb();
                if (vcpu->arch.ceded) {
 -                      if (waitqueue_active(&vcpu->wq)) {
 -                              wake_up_interruptible(&vcpu->wq);
 +                      if (swait_active(&vcpu->wq)) {
 +                              swake_up(&vcpu->wq);
                                vcpu->stat.halt_wakeup++;
                        }
                }
                if (kvmppc_xics_enabled(vcpu)) {
                        ret = kvmppc_xics_hcall(vcpu, req);
                        break;
-               } /* fallthrough */
+               }
+               return RESUME_HOST;
+       case H_PUT_TCE:
+               ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_PUT_TCE_INDIRECT:
+               ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_STUFF_TCE:
+               ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
        default:
                return RESUME_HOST;
        }
@@@ -1459,7 -1494,7 +1494,7 @@@ static struct kvmppc_vcore *kvmppc_vcor
        INIT_LIST_HEAD(&vcore->runnable_threads);
        spin_lock_init(&vcore->lock);
        spin_lock_init(&vcore->stoltb_lock);
 -      init_waitqueue_head(&vcore->wq);
 +      init_swait_queue_head(&vcore->wq);
        vcore->preempt_tb = TB_NIL;
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = core * threads_per_subcore;
@@@ -2278,6 -2313,46 +2313,46 @@@ static void post_guest_process(struct k
        spin_unlock(&vc->lock);
  }
  
+ /*
+  * Clear core from the list of active host cores as we are about to
+  * enter the guest. Only do this if it is the primary thread of the
+  * core (not if a subcore) that is entering the guest.
+  */
+ static inline void kvmppc_clear_host_core(int cpu)
+ {
+       int core;
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+       /*
+        * Memory barrier can be omitted here as we will do a smp_wmb()
+        * later in kvmppc_start_thread and we need ensure that state is
+        * visible to other CPUs only after we enter guest.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
+ }
+ /*
+  * Advertise this core as an active host core since we exited the guest
+  * Only need to do this if it is the primary thread of the core that is
+  * exiting.
+  */
+ static inline void kvmppc_set_host_core(int cpu)
+ {
+       int core;
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+       /*
+        * Memory barrier can be omitted here because we do a spin_unlock
+        * immediately after this which provides the memory barrier.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
+ }
  /*
   * Run a set of guest threads on a physical core.
   * Called with vc->lock held.
@@@ -2390,6 -2465,8 +2465,8 @@@ static noinline void kvmppc_run_core(st
                }
        }
  
+       kvmppc_clear_host_core(pcpu);
        /* Start all the threads */
        active = 0;
        for (sub = 0; sub < core_info.n_subcores; ++sub) {
                        kvmppc_ipi_thread(pcpu + i);
        }
  
+       kvmppc_set_host_core(pcpu);
        spin_unlock(&vc->lock);
  
        /* make sure updates to secondary vcpu structs are visible now */
@@@ -2531,9 -2610,10 +2610,9 @@@ static void kvmppc_vcore_blocked(struc
  {
        struct kvm_vcpu *vcpu;
        int do_sleep = 1;
 +      DECLARE_SWAITQUEUE(wait);
  
 -      DEFINE_WAIT(wait);
 -
 -      prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
 +      prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
  
        /*
         * Check one last time for pending exceptions and ceded state after
        }
  
        if (!do_sleep) {
 -              finish_wait(&vc->wq, &wait);
 +              finish_swait(&vc->wq, &wait);
                return;
        }
  
        trace_kvmppc_vcore_blocked(vc, 0);
        spin_unlock(&vc->lock);
        schedule();
 -      finish_wait(&vc->wq, &wait);
 +      finish_swait(&vc->wq, &wait);
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
        trace_kvmppc_vcore_blocked(vc, 1);
@@@ -2611,7 -2691,7 +2690,7 @@@ static int kvmppc_run_vcpu(struct kvm_r
                        kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
 -                      wake_up(&vc->wq);
 +                      swake_up(&vc->wq);
                }
  
        }
@@@ -2983,6 -3063,114 +3062,114 @@@ static int kvmppc_hv_setup_htab_rma(str
        goto out_srcu;
  }
  
+ #ifdef CONFIG_KVM_XICS
+ static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
+                       void *hcpu)
+ {
+       unsigned long cpu = (long)hcpu;
+       switch (action) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               kvmppc_set_host_core(cpu);
+               break;
+ #ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
+               kvmppc_clear_host_core(cpu);
+               break;
+ #endif
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+ }
+ static struct notifier_block kvmppc_cpu_notifier = {
+           .notifier_call = kvmppc_cpu_notify,
+ };
+ /*
+  * Allocate a per-core structure for managing state about which cores are
+  * running in the host versus the guest and for exchanging data between
+  * real mode KVM and CPU running in the host.
+  * This is only done for the first VM.
+  * The allocated structure stays even if all VMs have stopped.
+  * It is only freed when the kvm-hv module is unloaded.
+  * It's OK for this routine to fail, we just don't support host
+  * core operations like redirecting H_IPI wakeups.
+  */
+ void kvmppc_alloc_host_rm_ops(void)
+ {
+       struct kvmppc_host_rm_ops *ops;
+       unsigned long l_ops;
+       int cpu, core;
+       int size;
+       /* Not the first time here ? */
+       if (kvmppc_host_rm_ops_hv != NULL)
+               return;
+       ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
+       if (!ops)
+               return;
+       size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
+       ops->rm_core = kzalloc(size, GFP_KERNEL);
+       if (!ops->rm_core) {
+               kfree(ops);
+               return;
+       }
+       get_online_cpus();
+       for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
+               if (!cpu_online(cpu))
+                       continue;
+               core = cpu >> threads_shift;
+               ops->rm_core[core].rm_state.in_host = 1;
+       }
+       ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
+       /*
+        * Make the contents of the kvmppc_host_rm_ops structure visible
+        * to other CPUs before we assign it to the global variable.
+        * Do an atomic assignment (no locks used here), but if someone
+        * beats us to it, just free our copy and return.
+        */
+       smp_wmb();
+       l_ops = (unsigned long) ops;
+       if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+               put_online_cpus();
+               kfree(ops->rm_core);
+               kfree(ops);
+               return;
+       }
+       register_cpu_notifier(&kvmppc_cpu_notifier);
+       put_online_cpus();
+ }
+ void kvmppc_free_host_rm_ops(void)
+ {
+       if (kvmppc_host_rm_ops_hv) {
+               unregister_cpu_notifier(&kvmppc_cpu_notifier);
+               kfree(kvmppc_host_rm_ops_hv->rm_core);
+               kfree(kvmppc_host_rm_ops_hv);
+               kvmppc_host_rm_ops_hv = NULL;
+       }
+ }
+ #endif
  static int kvmppc_core_init_vm_hv(struct kvm *kvm)
  {
        unsigned long lpcr, lpid;
                return -ENOMEM;
        kvm->arch.lpid = lpid;
  
+       kvmppc_alloc_host_rm_ops();
        /*
         * Since we don't flush the TLB when tearing down a VM,
         * and this lpid might have previously been used,
@@@ -3228,6 -3418,7 +3417,7 @@@ static int kvmppc_book3s_init_hv(void
  
  static void kvmppc_book3s_exit_hv(void)
  {
+       kvmppc_free_host_rm_ops();
        kvmppc_hv_ops = NULL;
  }
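
kvmppc_alloc_host_rm_ops() above builds the per-core state once and then publishes it with a single cmpxchg64, so two VMs created concurrently cannot both install their copy; the loser simply frees what it built. A minimal portable sketch of that publish-or-free pattern, using C11 atomics in place of the kernel's smp_wmb()/cmpxchg64:

#include <stdatomic.h>
#include <stdlib.h>

struct host_rm_ops { int nr_cores; /* ... per-core state ... */ };

/* Stands in for the global kvmppc_host_rm_ops_hv pointer. */
static _Atomic(struct host_rm_ops *) host_rm_ops;

void alloc_host_rm_ops(void)
{
	struct host_rm_ops *ops, *expected = NULL;

	if (atomic_load(&host_rm_ops))		/* not the first VM */
		return;

	ops = calloc(1, sizeof(*ops));
	if (!ops)
		return;				/* failure is tolerated, as in the patch */
	ops->nr_cores = 8;			/* illustrative */

	/* Release ordering plays the role of smp_wmb(): the contents become
	 * visible before the pointer does.  If another caller won the race,
	 * drop our copy instead of installing it. */
	if (!atomic_compare_exchange_strong_explicit(&host_rm_ops, &expected, ops,
						     memory_order_release,
						     memory_order_relaxed))
		free(ops);
}
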
  
index c613fee0b9f7fa7e5accc99976362ddca1b668d4,ed16182a008b7f10b7aa53c3af2d0f3fd167fe25..b20b2071372b3ae7249c9a00699fdfd3277be69d
@@@ -27,7 -27,7 +27,7 @@@
  #include <asm/asm-offsets.h>
  #include <asm/exception-64s.h>
  #include <asm/kvm_book3s_asm.h>
 -#include <asm/mmu-hash64.h>
 +#include <asm/book3s/64/mmu-hash.h>
  #include <asm/tm.h>
  
  #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
@@@ -2006,8 -2006,8 +2006,8 @@@ hcall_real_table
        .long   0               /* 0x12c */
        .long   0               /* 0x130 */
        .long   DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
-       .long   0               /* 0x138 */
-       .long   0               /* 0x13c */
+       .long   DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
+       .long   DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
        .long   0               /* 0x140 */
        .long   0               /* 0x144 */
        .long   0               /* 0x148 */
index b0c8ad0799c7f0c09607420441ea87735c88c6d9,727e7f7b33fddbb977c91b72e7d922399e7cb1b0..59174b16dd98783b158553af7d90142575689880
@@@ -229,17 -229,11 +229,11 @@@ struct kvm_s390_itdb 
        __u8    data[256];
  } __packed;
  
- struct kvm_s390_vregs {
-       __vector128 vrs[32];
-       __u8    reserved200[512];       /* for future vector expansion */
- } __packed;
  struct sie_page {
        struct kvm_s390_sie_block sie_block;
        __u8 reserved200[1024];         /* 0x0200 */
        struct kvm_s390_itdb itdb;      /* 0x0600 */
-       __u8 reserved700[1280];         /* 0x0700 */
-       struct kvm_s390_vregs vregs;    /* 0x0c00 */
+       __u8 reserved700[2304];         /* 0x0700 */
  } __packed;
  
  struct kvm_vcpu_stat {
@@@ -467,7 -461,7 +461,7 @@@ struct kvm_s390_irq_payload 
  struct kvm_s390_local_interrupt {
        spinlock_t lock;
        struct kvm_s390_float_interrupt *float_int;
 -      wait_queue_head_t *wq;
 +      struct swait_queue_head *wq;
        atomic_t *cpuflags;
        DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
        struct kvm_s390_irq_payload irq;
index 9ffc7322179213f031939fa184bc6c93545af559,87e2d1a89d74eaba5e398392ee2bcd86cdbb0acb..3105390865c87cabd0f0a8b4e03e60534f9bb07b
@@@ -335,23 -335,6 +335,6 @@@ static void set_intercept_indicators(st
        set_intercept_indicators_stop(vcpu);
  }
  
- static u16 get_ilc(struct kvm_vcpu *vcpu)
- {
-       switch (vcpu->arch.sie_block->icptcode) {
-       case ICPT_INST:
-       case ICPT_INSTPROGI:
-       case ICPT_OPEREXC:
-       case ICPT_PARTEXEC:
-       case ICPT_IOINST:
-               /* last instruction only stored for these icptcodes */
-               return insn_length(vcpu->arch.sie_block->ipa >> 8);
-       case ICPT_PROGI:
-               return vcpu->arch.sie_block->pgmilc;
-       default:
-               return 0;
-       }
- }
  static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
  {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@@ -588,7 -571,7 +571,7 @@@ static int __must_check __deliver_prog(
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_pgm_info pgm_info;
        int rc = 0, nullifying = false;
-       u16 ilc = get_ilc(vcpu);
+       u16 ilen;
  
        spin_lock(&li->lock);
        pgm_info = li->irq.pgm;
        memset(&li->irq.pgm, 0, sizeof(pgm_info));
        spin_unlock(&li->lock);
  
-       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilc:%d",
-                  pgm_info.code, ilc);
+       ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
+       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
+                  pgm_info.code, ilen);
        vcpu->stat.deliver_program_int++;
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                         pgm_info.code, 0);
                                   (u8 *) __LC_PER_ACCESS_ID);
        }
  
-       if (nullifying && vcpu->arch.sie_block->icptcode == ICPT_INST)
-               kvm_s390_rewind_psw(vcpu, ilc);
+       if (nullifying && !(pgm_info.flags & KVM_S390_PGM_FLAGS_NO_REWIND))
+               kvm_s390_rewind_psw(vcpu, ilen);
  
-       rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC);
+       /* bit 1+2 of the target are the ilc, so we can directly use ilen */
+       rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
        rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
                                 (u64 *) __LC_LAST_BREAK);
        rc |= put_guest_lc(vcpu, pgm_info.code,
@@@ -966,13 -951,13 +951,13 @@@ no_timer
  
  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
  {
 -      if (waitqueue_active(&vcpu->wq)) {
 +      if (swait_active(&vcpu->wq)) {
                /*
                 * The vcpu gave up the cpu voluntarily, mark it as a good
                 * yield-candidate.
                 */
                vcpu->preempted = true;
 -              wake_up_interruptible(&vcpu->wq);
 +              swake_up(&vcpu->wq);
                vcpu->stat.halt_wakeup++;
        }
  }
@@@ -1059,8 -1044,16 +1044,16 @@@ static int __inject_prog(struct kvm_vcp
        trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                   irq->u.pgm.code, 0);
  
+       if (!(irq->u.pgm.flags & KVM_S390_PGM_FLAGS_ILC_VALID)) {
+               /* auto detection if no valid ILC was given */
+               irq->u.pgm.flags &= ~KVM_S390_PGM_FLAGS_ILC_MASK;
+               irq->u.pgm.flags |= kvm_s390_get_ilen(vcpu);
+               irq->u.pgm.flags |= KVM_S390_PGM_FLAGS_ILC_VALID;
+       }
        if (irq->u.pgm.code == PGM_PER) {
                li->irq.pgm.code |= PGM_PER;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify PER related information */
                li->irq.pgm.per_address = irq->u.pgm.per_address;
                li->irq.pgm.per_code = irq->u.pgm.per_code;
        } else if (!(irq->u.pgm.code & PGM_PER)) {
                li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
                                   irq->u.pgm.code;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify non-PER information */
                li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
                li->irq.pgm.mon_code = irq->u.pgm.mon_code;
diff --combined arch/x86/kvm/lapic.c
index 3a045f39ed8114e24e375521135cb7d2296e9e7e,d9ae1ce2a6a03e0e8ebac52ea88c94dd913fe5f7..443d2a57ad3d9620246097a48ed3cd7de9e02f50
@@@ -281,7 -281,7 +281,7 @@@ void kvm_apic_set_version(struct kvm_vc
        struct kvm_cpuid_entry2 *feat;
        u32 v = APIC_VERSION;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@@ -475,26 -475,20 +475,20 @@@ static inline void apic_clear_isr(int v
  
  int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
  {
-       int highest_irr;
        /* This may race with setting of irr in __apic_accept_irq() and
         * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
         * will cause vmexit immediately and the value will be recalculated
         * on the next vmentry.
         */
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
-       highest_irr = apic_find_highest_irr(vcpu->arch.apic);
-       return highest_irr;
+       return apic_find_highest_irr(vcpu->arch.apic);
  }
  
  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map);
+                            struct dest_map *dest_map);
  
  int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
-               unsigned long *dest_map)
+                    struct dest_map *dest_map)
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
@@@ -675,8 -669,33 +669,33 @@@ bool kvm_apic_match_dest(struct kvm_vcp
        }
  }
  
+ int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+                      const unsigned long *bitmap, u32 bitmap_size)
+ {
+       u32 mod;
+       int i, idx = -1;
+       mod = vector % dest_vcpus;
+       for (i = 0; i <= mod; i++) {
+               idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+               BUG_ON(idx == bitmap_size);
+       }
+       return idx;
+ }
+ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+ {
+       if (!kvm->arch.disabled_lapic_found) {
+               kvm->arch.disabled_lapic_found = true;
+               printk(KERN_INFO
+                      "Disabled LAPIC found during irq injection\n");
+       }
+ }
  bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
  {
        struct kvm_apic_map *map;
        unsigned long bitmap = 1;
  
                dst = map->logical_map[cid];
  
-               if (kvm_lowest_prio_delivery(irq)) {
+               if (!kvm_lowest_prio_delivery(irq))
+                       goto set_irq;
+               if (!kvm_vector_hashing_enabled()) {
                        int l = -1;
                        for_each_set_bit(i, &bitmap, 16) {
                                if (!dst[i])
                                        continue;
                                if (l < 0)
                                        l = i;
-                               else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+                               else if (kvm_apic_compare_prio(dst[i]->vcpu,
+                                                       dst[l]->vcpu) < 0)
                                        l = i;
                        }
                        bitmap = (l >= 0) ? 1 << l : 0;
+               } else {
+                       int idx;
+                       unsigned int dest_vcpus;
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
+                               goto out;
+                       idx = kvm_vector_to_index(irq->vector,
+                               dest_vcpus, &bitmap, 16);
+                       if (!dst[idx]) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
+                       bitmap = (idx >= 0) ? 1 << idx : 0;
                }
        }
  
+ set_irq:
        for_each_set_bit(i, &bitmap, 16) {
                if (!dst[i])
                        continue;
@@@ -754,6 -794,20 +794,20 @@@ out
        return ret;
  }
  
+ /*
+  * This routine tries to handle interrupts in posted mode, here is how
+  * it deals with different cases:
+  * - For single-destination interrupts, handle it in posted mode
+  * - Else if vector hashing is enabled and it is a lowest-priority
+  *   interrupt, handle it in posted mode and use the following mechanism
+  *   to find the destination vCPU.
+  *    1. For lowest-priority interrupts, store all the possible
+  *       destination vCPUs in an array.
+  *    2. Use "guest vector % max number of destination vCPUs" to find
+  *       the right destination vCPU in the array for the lowest-priority
+  *       interrupt.
+  * - Otherwise, use remapped mode to inject the interrupt.
+  */
  bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu)
  {
                if (cid >= ARRAY_SIZE(map->logical_map))
                        goto out;
  
-               for_each_set_bit(i, &bitmap, 16) {
-                       dst = map->logical_map[cid][i];
-                       if (++r == 2)
+               if (kvm_vector_hashing_enabled() &&
+                               kvm_lowest_prio_delivery(irq)) {
+                       int idx;
+                       unsigned int dest_vcpus;
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
                                goto out;
-               }
  
-               if (dst && kvm_apic_present(dst->vcpu))
+                       idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+                                                 &bitmap, 16);
+                       dst = map->logical_map[cid][idx];
+                       if (!dst) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
                        *dest_vcpu = dst->vcpu;
-               else
-                       goto out;
+               } else {
+                       for_each_set_bit(i, &bitmap, 16) {
+                               dst = map->logical_map[cid][i];
+                               if (++r == 2)
+                                       goto out;
+                       }
+                       if (dst && kvm_apic_present(dst->vcpu))
+                               *dest_vcpu = dst->vcpu;
+                       else
+                               goto out;
+               }
        }
  
        ret = true;
@@@ -819,7 -894,7 +894,7 @@@ out
   */
  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map)
+                            struct dest_map *dest_map)
  {
        int result = 0;
        struct kvm_vcpu *vcpu = apic->vcpu;
  
                result = 1;
  
-               if (dest_map)
-                       __set_bit(vcpu->vcpu_id, dest_map);
+               if (dest_map) {
+                       __set_bit(vcpu->vcpu_id, dest_map->map);
+                       dest_map->vectors[vcpu->vcpu_id] = vector;
+               }
  
                if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
                        if (trig_mode)
@@@ -1195,7 -1272,7 +1272,7 @@@ static void apic_update_lvtt(struct kvm
  static void apic_timer_expired(struct kvm_lapic *apic)
  {
        struct kvm_vcpu *vcpu = apic->vcpu;
 -      wait_queue_head_t *q = &vcpu->wq;
 +      struct swait_queue_head *q = &vcpu->wq;
        struct kvm_timer *ktimer = &apic->lapic_timer;
  
        if (atomic_read(&apic->lapic_timer.pending))
        atomic_inc(&apic->lapic_timer.pending);
        kvm_set_pending_timer(vcpu);
  
 -      if (waitqueue_active(q))
 -              wake_up_interruptible(q);
 +      if (swait_active(q))
 +              swake_up(q);
  
        if (apic_lvtt_tscdeadline(apic))
                ktimer->expired_tscdeadline = ktimer->tscdeadline;
@@@ -1239,7 -1316,7 +1316,7 @@@ void wait_lapic_expire(struct kvm_vcpu 
        struct kvm_lapic *apic = vcpu->arch.apic;
        u64 guest_tsc, tsc_deadline;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        if (apic->lapic_timer.expired_tscdeadline == 0)
@@@ -1515,8 -1592,7 +1592,7 @@@ static int apic_mmio_write(struct kvm_v
  
  void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
  {
-       if (kvm_vcpu_has_lapic(vcpu))
-               apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+       apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
  }
  EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
  
@@@ -1566,7 -1642,7 +1642,7 @@@ u64 kvm_get_lapic_tscdeadline_msr(struc
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return 0;
  
@@@ -1577,7 -1653,7 +1653,7 @@@ void kvm_set_lapic_tscdeadline_msr(stru
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return;
  
@@@ -1590,9 -1666,6 +1666,6 @@@ void kvm_lapic_set_tpr(struct kvm_vcpu 
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
        apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
                     | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
  }
@@@ -1601,9 -1674,6 +1674,6 @@@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *
  {
        u64 tpr;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
        tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
  
        return (tpr & 0xf0) >> 4;
@@@ -1728,8 -1798,7 +1798,7 @@@ int apic_has_pending_timer(struct kvm_v
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
-                       apic_lvt_enabled(apic, APIC_LVTT))
+       if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
                return atomic_read(&apic->lapic_timer.pending);
  
        return 0;
@@@ -1826,7 -1895,7 +1895,7 @@@ int kvm_apic_has_interrupt(struct kvm_v
        struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
+       if (!apic_enabled(apic))
                return -1;
  
        apic_update_ppr(apic);
@@@ -1854,9 -1923,6 +1923,6 @@@ void kvm_inject_apic_timer_irqs(struct 
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
        if (atomic_read(&apic->lapic_timer.pending) > 0) {
                kvm_apic_local_deliver(apic, APIC_LVTT);
                if (apic_lvtt_tscdeadline(apic))
@@@ -1932,7 -1998,7 +1998,7 @@@ void __kvm_migrate_apic_timer(struct kv
  {
        struct hrtimer *timer;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        timer = &vcpu->arch.apic->lapic_timer.timer;
@@@ -2105,7 -2171,7 +2171,7 @@@ int kvm_hv_vapic_msr_write(struct kvm_v
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
  
        /* if this is ICR write vector before command */
@@@ -2119,7 -2185,7 +2185,7 @@@ int kvm_hv_vapic_msr_read(struct kvm_vc
        struct kvm_lapic *apic = vcpu->arch.apic;
        u32 low, high = 0;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
  
        if (apic_reg_read(apic, reg, 4, &low))
@@@ -2151,7 -2217,7 +2217,7 @@@ void kvm_apic_accept_events(struct kvm_
        u8 sipi_vector;
        unsigned long pe;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
+       if (!lapic_in_kernel(vcpu) || !apic->pending_events)
                return;
  
        /*
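
The vector-hashing path added to lapic.c above picks a lowest-priority destination with kvm_vector_to_index(): hash the vector with "vector % number of candidate vCPUs" and walk that many set bits into the destination bitmap. A small stand-alone sketch of the selection; the 16-bit bitmap and the sample vector are illustrative only:

#include <stdio.h>
#include <stdint.h>

/* Walk the set bits of @bitmap and return the bit index selected by the
 * hashed vector, mirroring kvm_vector_to_index().  @dest_vcpus must equal
 * the number of set bits. */
static int vector_to_index(uint32_t vector, uint32_t dest_vcpus, uint16_t bitmap)
{
	uint32_t mod = vector % dest_vcpus;
	int i, idx = -1;

	for (i = 0; i <= (int)mod; i++) {
		do {
			idx++;
		} while (!(bitmap & (1u << idx)));	/* advance to the next set bit */
	}
	return idx;
}

int main(void)
{
	uint16_t bitmap = 0x000b;	/* candidate APICs at logical ids 0, 1 and 3 */
	uint32_t vector = 35;

	/* 35 % 3 == 2, so the third set bit (id 3) receives the interrupt. */
	printf("vector %u -> destination id %d\n",
	       vector, vector_to_index(vector, 3, bitmap));
	return 0;
}
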
diff --combined arch/x86/kvm/mmu.c
index e1bb320dd5b256caee31c0342ff8789731da0299,0a4dc9b541810c58eb6c88bfead1ba45323fc894..af631279a2e64f4b5283822f28916178b45a3ad8
@@@ -41,6 -41,7 +41,7 @@@
  #include <asm/cmpxchg.h>
  #include <asm/io.h>
  #include <asm/vmx.h>
+ #include <asm/kvm_page_track.h>
  
  /*
   * When setting this variable to true it enables Two-Dimensional-Paging
@@@ -478,7 -479,7 +479,7 @@@ static bool spte_is_locklessly_modifiab
  static bool spte_has_volatile_bits(u64 spte)
  {
        /*
 -       * Always atomicly update spte if it can be updated
 +       * Always atomically update spte if it can be updated
         * out of mmu-lock, it can ensure dirty bit is not lost,
         * also, it can help us to get a stable is_writable_pte()
         * to ensure tlb flush is not missed.
@@@ -549,7 -550,7 +550,7 @@@ static bool mmu_spte_update(u64 *sptep
  
        /*
         * For the spte updated out of mmu-lock is safe, since
 -       * we always atomicly update it, see the comments in
 +       * we always atomically update it, see the comments in
         * spte_has_volatile_bits().
         */
        if (spte_is_locklessly_modifiable(old_spte) &&
@@@ -776,62 -777,85 +777,85 @@@ static struct kvm_lpage_info *lpage_inf
        return &slot->arch.lpage_info[level - 2][idx];
  }
  
+ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+                                           gfn_t gfn, int count)
+ {
+       struct kvm_lpage_info *linfo;
+       int i;
+       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+               linfo = lpage_info_slot(gfn, slot, i);
+               linfo->disallow_lpage += count;
+               WARN_ON(linfo->disallow_lpage < 0);
+       }
+ }
+ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       update_gfn_disallow_lpage_count(slot, gfn, 1);
+ }
+ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       update_gfn_disallow_lpage_count(slot, gfn, -1);
+ }
  static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
  
+       kvm->arch.indirect_shadow_pages++;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count += 1;
-       }
-       kvm->arch.indirect_shadow_pages++;
+       /* non-leaf shadow pages are kept read-only. */
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_add_page(kvm, slot, gfn,
+                                                   KVM_PAGE_TRACK_WRITE);
+       kvm_mmu_gfn_disallow_lpage(slot, gfn);
  }
  
  static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
  
+       kvm->arch.indirect_shadow_pages--;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count -= 1;
-               WARN_ON(linfo->write_count < 0);
-       }
-       kvm->arch.indirect_shadow_pages--;
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+                                                      KVM_PAGE_TRACK_WRITE);
+       kvm_mmu_gfn_allow_lpage(slot, gfn);
  }
  
- static int __has_wrprotected_page(gfn_t gfn, int level,
-                                 struct kvm_memory_slot *slot)
+ static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+                                         struct kvm_memory_slot *slot)
  {
        struct kvm_lpage_info *linfo;
  
        if (slot) {
                linfo = lpage_info_slot(gfn, slot, level);
-               return linfo->write_count;
+               return !!linfo->disallow_lpage;
        }
  
-       return 1;
+       return true;
  }
  
- static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                       int level)
  {
        struct kvm_memory_slot *slot;
  
        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-       return __has_wrprotected_page(gfn, level, slot);
+       return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
  }
  
  static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
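
The rename from write_count to disallow_lpage above also widens the meaning: the field now counts every reason a large page is forbidden at a given gfn and level (write tracking included), with kvm_mmu_gfn_disallow_lpage()/kvm_mmu_gfn_allow_lpage() as the increment/decrement pair and any non-zero count read back as "disallowed". A stand-alone sketch of that counting convention, using hypothetical names rather than the kernel's structures:

#include <assert.h>
#include <stdbool.h>

struct lpage_info_sketch { int disallow_lpage; };

static void disallow_lpage(struct lpage_info_sketch *li)
{
        li->disallow_lpage++;
}

static void allow_lpage(struct lpage_info_sketch *li)
{
        li->disallow_lpage--;
        assert(li->disallow_lpage >= 0);        /* mirrors the WARN_ON above */
}

static bool lpage_disallowed(const struct lpage_info_sketch *li)
{
        return li->disallow_lpage != 0;         /* the !!linfo->disallow_lpage test */
}

int main(void)
{
        struct lpage_info_sketch li = { 0 };

        disallow_lpage(&li);
        disallow_lpage(&li);
        allow_lpage(&li);
        assert(lpage_disallowed(&li));          /* one reason still outstanding */
        allow_lpage(&li);
        assert(!lpage_disallowed(&li));
        return 0;
}
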
@@@ -897,7 -921,7 +921,7 @@@ static int mapping_level(struct kvm_vcp
        max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
  
        for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-               if (__has_wrprotected_page(large_gfn, level, slot))
+               if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
                        break;
  
        return level - 1;
@@@ -1323,23 -1347,29 +1347,29 @@@ void kvm_arch_mmu_enable_log_dirty_pt_m
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
  }
  
- static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+                                   struct kvm_memory_slot *slot, u64 gfn)
  {
-       struct kvm_memory_slot *slot;
        struct kvm_rmap_head *rmap_head;
        int i;
        bool write_protected = false;
  
-       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                rmap_head = __gfn_to_rmap(gfn, i, slot);
-               write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
+               write_protected |= __rmap_write_protect(kvm, rmap_head, true);
        }
  
        return write_protected;
  }
  
+ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ {
+       struct kvm_memory_slot *slot;
+       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+       return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+ }
  static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
  {
        u64 *sptep;
@@@ -1840,13 -1870,16 +1870,16 @@@ static int __mmu_unsync_walk(struct kvm
        return nr_unsync_leaf;
  }
  
+ #define INVALID_INDEX (-1)
  static int mmu_unsync_walk(struct kvm_mmu_page *sp,
                           struct kvm_mmu_pages *pvec)
  {
+       pvec->nr = 0;
        if (!sp->unsync_children)
                return 0;
  
-       mmu_pages_add(pvec, sp, 0);
+       mmu_pages_add(pvec, sp, INVALID_INDEX);
        return __mmu_unsync_walk(sp, pvec);
  }
  
@@@ -1956,13 -1989,12 +1989,12 @@@ static void kvm_sync_pages(struct kvm_v
  }
  
  struct mmu_page_path {
-       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
-       unsigned int idx[PT64_ROOT_LEVEL-1];
+       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+       unsigned int idx[PT64_ROOT_LEVEL];
  };
  
  #define for_each_sp(pvec, sp, parents, i)                     \
-               for (i = mmu_pages_next(&pvec, &parents, -1),   \
-                       sp = pvec.page[i].sp;                   \
+               for (i = mmu_pages_first(&pvec, &parents);      \
                        i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
                        i = mmu_pages_next(&pvec, &parents, i))
  
@@@ -1974,19 -2006,43 +2006,43 @@@ static int mmu_pages_next(struct kvm_mm
  
        for (n = i+1; n < pvec->nr; n++) {
                struct kvm_mmu_page *sp = pvec->page[n].sp;
+               unsigned idx = pvec->page[n].idx;
+               int level = sp->role.level;
  
-               if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-                       parents->idx[0] = pvec->page[n].idx;
-                       return n;
-               }
+               parents->idx[level-1] = idx;
+               if (level == PT_PAGE_TABLE_LEVEL)
+                       break;
  
-               parents->parent[sp->role.level-2] = sp;
-               parents->idx[sp->role.level-1] = pvec->page[n].idx;
+               parents->parent[level-2] = sp;
        }
  
        return n;
  }
  
+ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+                          struct mmu_page_path *parents)
+ {
+       struct kvm_mmu_page *sp;
+       int level;
+       if (pvec->nr == 0)
+               return 0;
+       WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+       sp = pvec->page[0].sp;
+       level = sp->role.level;
+       WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+       parents->parent[level-2] = sp;
+       /* Also set up a sentinel.  Further entries in pvec are all
+        * children of sp, so this element is never overwritten.
+        */
+       parents->parent[level-1] = NULL;
+       return mmu_pages_next(pvec, parents, 0);
+ }
  static void mmu_pages_clear_parents(struct mmu_page_path *parents)
  {
        struct kvm_mmu_page *sp;
  
        do {
                unsigned int idx = parents->idx[level];
                sp = parents->parent[level];
                if (!sp)
                        return;
  
+               WARN_ON(idx == INVALID_INDEX);
                clear_unsync_child_bit(sp, idx);
                level++;
-       } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
- }
- static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
-                              struct mmu_page_path *parents,
-                              struct kvm_mmu_pages *pvec)
- {
-       parents->parent[parent->role.level-1] = NULL;
-       pvec->nr = 0;
+       } while (!sp->unsync_children);
  }
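
The reworked walk above leans on two sentinels: the root is queued with INVALID_INDEX, since it has no parent slot to clear, and mmu_pages_first() plants a NULL entry one level above the first parent so that mmu_pages_clear_parents() can climb until it either meets that NULL or reaches a parent that still has unsynced children. A stand-alone sketch of that climb with hypothetical types (the plain decrement stands in for clear_unsync_child_bit()):

#include <assert.h>
#include <stddef.h>

struct sp_sketch { int unsync_children; };

/* parent[] is filled bottom-up and terminated by a NULL sentinel. */
static void clear_parents_sketch(struct sp_sketch **parent, int level)
{
        struct sp_sketch *sp;

        do {
                sp = parent[level];
                if (!sp)
                        return;                 /* hit the sentinel */
                sp->unsync_children--;          /* clear_unsync_child_bit() stand-in */
                level++;
        } while (!sp->unsync_children);
}

int main(void)
{
        struct sp_sketch leaf_parent = { 1 }, root = { 2 };
        struct sp_sketch *parents[] = { &leaf_parent, &root, NULL };

        clear_parents_sketch(parents, 0);
        assert(leaf_parent.unsync_children == 0);
        assert(root.unsync_children == 1);      /* root still has another unsynced child */
        return 0;
}
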
  
  static void mmu_sync_children(struct kvm_vcpu *vcpu,
        struct kvm_mmu_pages pages;
        LIST_HEAD(invalid_list);
  
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                bool protected = false;
  
                }
                kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                cond_resched_lock(&vcpu->kvm->mmu_lock);
-               kvm_mmu_pages_init(parent, &parents, &pages);
        }
  }
  
  static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
  {
-       sp->write_flooding_count = 0;
+       atomic_set(&sp->write_flooding_count,  0);
  }
  
  static void clear_sp_write_flooding_count(u64 *spte)
@@@ -2112,12 -2158,18 +2158,18 @@@ static struct kvm_mmu_page *kvm_mmu_get
        hlist_add_head(&sp->hash_link,
                &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
        if (!direct) {
-               if (rmap_write_protect(vcpu, gfn))
+               /*
+                * We should do write protection before syncing pages,
+                * otherwise the content of the synced shadow page may
+                * be inconsistent with the guest page table.
+                */
+               account_shadowed(vcpu->kvm, sp);
+               if (level == PT_PAGE_TABLE_LEVEL &&
+                     rmap_write_protect(vcpu, gfn))
                        kvm_flush_remote_tlbs(vcpu->kvm);
                if (level > PT_PAGE_TABLE_LEVEL && need_sync)
                        kvm_sync_pages(vcpu, gfn);
-               account_shadowed(vcpu->kvm, sp);
        }
        sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
        clear_page(sp->spt);
@@@ -2269,7 -2321,6 +2321,6 @@@ static int mmu_zap_unsync_children(stru
        if (parent->role.level == PT_PAGE_TABLE_LEVEL)
                return 0;
  
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                struct kvm_mmu_page *sp;
  
                        mmu_pages_clear_parents(&parents);
                        zapped++;
                }
-               kvm_mmu_pages_init(parent, &parents, &pages);
        }
  
        return zapped;
@@@ -2354,8 -2404,8 +2404,8 @@@ static bool prepare_zap_oldest_mmu_page
        if (list_empty(&kvm->arch.active_mmu_pages))
                return false;
  
-       sp = list_entry(kvm->arch.active_mmu_pages.prev,
-                       struct kvm_mmu_page, link);
+       sp = list_last_entry(&kvm->arch.active_mmu_pages,
+                            struct kvm_mmu_page, link);
        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
  
        return true;
@@@ -2408,7 -2458,7 +2458,7 @@@ int kvm_mmu_unprotect_page(struct kvm *
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
  
- static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  {
        trace_kvm_mmu_unsync_page(sp);
        ++vcpu->kvm->stat.mmu_unsync;
        kvm_mmu_mark_parents_unsync(sp);
  }
  
- static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                  bool can_unsync)
  {
-       struct kvm_mmu_page *s;
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
-               if (s->unsync)
-                       continue;
-               WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-               __kvm_unsync_page(vcpu, s);
-       }
- }
+       struct kvm_mmu_page *sp;
  
- static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
-                                 bool can_unsync)
- {
-       struct kvm_mmu_page *s;
-       bool need_unsync = false;
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
  
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (!can_unsync)
-                       return 1;
+                       return true;
  
-               if (s->role.level != PT_PAGE_TABLE_LEVEL)
-                       return 1;
+               if (sp->unsync)
+                       continue;
  
-               if (!s->unsync)
-                       need_unsync = true;
+               WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+               kvm_unsync_page(vcpu, sp);
        }
-       if (need_unsync)
-               kvm_unsync_pages(vcpu, gfn);
-       return 0;
+       return false;
  }
  
  static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@@ -2503,7 -2542,7 +2542,7 @@@ static int set_spte(struct kvm_vcpu *vc
                 * be fixed if guest refault.
                 */
                if (level > PT_PAGE_TABLE_LEVEL &&
-                   has_wrprotected_page(vcpu, gfn, level))
+                   mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
                        goto done;
  
                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@@ -2768,7 -2807,7 +2807,7 @@@ static void transparent_hugepage_adjust
        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
            level == PT_PAGE_TABLE_LEVEL &&
            PageTransCompound(pfn_to_page(pfn)) &&
-           !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+           !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
                unsigned long mask;
                /*
                 * mmu_notifier_retry was successful and we hold the
  static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                                kvm_pfn_t pfn, unsigned access, int *ret_val)
  {
-       bool ret = true;
        /* The pfn is invalid, report the error! */
        if (unlikely(is_error_pfn(pfn))) {
                *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
-               goto exit;
+               return true;
        }
  
        if (unlikely(is_noslot_pfn(pfn)))
                vcpu_cache_mmio_info(vcpu, gva, gfn, access);
  
-       ret = false;
- exit:
-       return ret;
+       return false;
  }
  
  static bool page_fault_can_be_fast(u32 error_code)
@@@ -3273,7 -3308,7 +3308,7 @@@ static bool is_shadow_zero_bits_set(str
        return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
  }
  
- static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
  {
        if (direct)
                return vcpu_match_mmio_gpa(vcpu, addr);
@@@ -3332,7 -3367,7 +3367,7 @@@ int handle_mmio_page_fault(struct kvm_v
        u64 spte;
        bool reserved;
  
-       if (quickly_check_mmio_pf(vcpu, addr, direct))
+       if (mmio_info_in_cache(vcpu, addr, direct))
                return RET_MMIO_PF_EMULATE;
  
        reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
  }
  EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
  
+ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+                                        u32 error_code, gfn_t gfn)
+ {
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return false;
+       if (!(error_code & PFERR_PRESENT_MASK) ||
+             !(error_code & PFERR_WRITE_MASK))
+               return false;
+       /*
+        * The guest is writing a page which is write-tracked, which
+        * cannot be fixed by the page fault handler.
+        */
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
+       return false;
+ }
+ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+ {
+       struct kvm_shadow_walk_iterator iterator;
+       u64 spte;
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+       walk_shadow_page_lockless_begin(vcpu);
+       for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+               clear_sp_write_flooding_count(iterator.sptep);
+               if (!is_shadow_present_pte(spte))
+                       break;
+       }
+       walk_shadow_page_lockless_end(vcpu);
+ }
  static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                u32 error_code, bool prefault)
  {
-       gfn_t gfn;
+       gfn_t gfn = gva >> PAGE_SHIFT;
        int r;
  
        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gva, true);
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
  
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
-       gfn = gva >> PAGE_SHIFT;
  
        return nonpaging_map(vcpu, gva & PAGE_MASK,
                             error_code, gfn, prefault);
@@@ -3460,12 -3527,8 +3527,8 @@@ static int tdp_page_fault(struct kvm_vc
  
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gpa, true);
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@@ -4186,7 -4249,8 +4249,8 @@@ static bool detect_write_flooding(struc
        if (sp->role.level == PT_PAGE_TABLE_LEVEL)
                return false;
  
-       return ++sp->write_flooding_count >= 3;
+       atomic_inc(&sp->write_flooding_count);
+       return atomic_read(&sp->write_flooding_count) >= 3;
  }
  
  /*
@@@ -4248,8 -4312,8 +4312,8 @@@ static u64 *get_written_sptes(struct kv
        return spte;
  }
  
- void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes)
+ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                             const u8 *new, int bytes)
  {
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct kvm_mmu_page *sp;
@@@ -4354,32 -4418,34 +4418,34 @@@ static void make_mmu_pages_available(st
        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
  }
  
- static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
- {
-       if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
-               return vcpu_match_mmio_gpa(vcpu, addr);
-       return vcpu_match_mmio_gva(vcpu, addr);
- }
  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
                       void *insn, int insn_len)
  {
        int r, emulation_type = EMULTYPE_RETRY;
        enum emulation_result er;
+       bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+       if (unlikely(error_code & PFERR_RSVD_MASK)) {
+               r = handle_mmio_page_fault(vcpu, cr2, direct);
+               if (r == RET_MMIO_PF_EMULATE) {
+                       emulation_type = 0;
+                       goto emulate;
+               }
+               if (r == RET_MMIO_PF_RETRY)
+                       return 1;
+               if (r < 0)
+                       return r;
+       }
  
        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
        if (r < 0)
-               goto out;
-       if (!r) {
-               r = 1;
-               goto out;
-       }
+               return r;
+       if (!r)
+               return 1;
  
-       if (is_mmio_page_fault(vcpu, cr2))
+       if (mmio_info_in_cache(vcpu, cr2, direct))
                emulation_type = 0;
+ emulate:
        er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
  
        switch (er) {
        default:
                BUG();
        }
- out:
-       return r;
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
  
@@@ -4463,6 -4527,21 +4527,21 @@@ void kvm_mmu_setup(struct kvm_vcpu *vcp
        init_kvm_mmu(vcpu);
  }
  
+ void kvm_mmu_init_vm(struct kvm *kvm)
+ {
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+       node->track_write = kvm_mmu_pte_write;
+       kvm_page_track_register_notifier(kvm, node);
+ }
+ void kvm_mmu_uninit_vm(struct kvm *kvm)
+ {
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+       kvm_page_track_unregister_notifier(kvm, node);
+ }
  /* The return value indicates if tlb flush on all vcpus is needed. */
  typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
  
diff --combined arch/x86/kvm/paging_tmpl.h
index 2ce4f05e81d3804cef4ded542a105a5a917b8bed,4174cf290fa3f71fde863339eaba71efdb4e8269..49f1c0b9082babf4ef0b23887e72ba8ecc185ff3
@@@ -249,7 -249,7 +249,7 @@@ static int FNAME(update_accessed_dirty_
                        return ret;
  
                kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
 -              walker->ptes[level] = pte;
 +              walker->ptes[level - 1] = pte;
        }
        return 0;
  }
@@@ -702,23 -702,16 +702,16 @@@ static int FNAME(page_fault)(struct kvm
  
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-               /*
-                * page fault with PFEC.RSVD  = 1 is caused by shadow
-                * page fault, should not be used to walk guest page
-                * table.
-                */
-               error_code &= ~PFERR_RSVD_MASK;
-       };
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;
  
+       /*
+        * If PFEC.RSVD is set, this is a shadow page fault.
+        * The bit needs to be cleared before walking guest page tables.
+        */
+       error_code &= ~PFERR_RSVD_MASK;
        /*
         * Look up the guest pte for the faulting address.
         */
                return 0;
        }
  
+       if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+               shadow_page_table_clear_flood(vcpu, addr);
+               return 1;
+       }
        vcpu->arch.write_fault_to_shadow_pgtable = false;
  
        is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
diff --combined arch/x86/kvm/vmx.c
index b92094ee135e62a1cefcd35df7a4860dc48d1966,46154dac71e64b14d7563d76abc4d5b7c1f70e3b..e87c494cb4769352c6662ecef01202f4a1b1f04f
@@@ -596,8 -596,6 +596,8 @@@ struct vcpu_vmx 
        /* Support for PML */
  #define PML_ENTITY_NUM                512
        struct page *pml_pg;
 +
 +      u64 current_tsc_ratio;
  };
  
  enum segment_cache_field {
@@@ -963,25 -961,36 +963,36 @@@ static const u32 vmx_msr_index[] = 
        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
  };
  
- static inline bool is_page_fault(u32 intr_info)
+ static inline bool is_exception_n(u32 intr_info, u8 vector)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+ }
+ static inline bool is_debug(u32 intr_info)
+ {
+       return is_exception_n(intr_info, DB_VECTOR);
+ }
+ static inline bool is_breakpoint(u32 intr_info)
+ {
+       return is_exception_n(intr_info, BP_VECTOR);
+ }
+ static inline bool is_page_fault(u32 intr_info)
+ {
+       return is_exception_n(intr_info, PF_VECTOR);
  }
  
  static inline bool is_no_device(u32 intr_info)
  {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, NM_VECTOR);
  }
  
  static inline bool is_invalid_opcode(u32 intr_info)
  {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, UD_VECTOR);
  }
  
  static inline bool is_external_interrupt(u32 intr_info)
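
is_exception_n() above simply compares the VM-exit interruption-information field against "valid hardware exception with vector N", and is_debug()/is_breakpoint()/is_page_fault() and friends become one-liners on top of it. The sketch below reproduces that check in isolation using the architectural layout of the field (vector in bits 7:0, type in bits 10:8, valid in bit 31); the macro names and the stand-alone form are illustrative, the real constants live in the VMX headers.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative constants following the architectural field layout. */
#define VECTOR_MASK      0x000000ffu   /* bits 7:0  */
#define TYPE_MASK        0x00000700u   /* bits 10:8 */
#define TYPE_HARD_EXCEPT 0x00000300u   /* type 3: hardware exception */
#define VALID_MASK       0x80000000u   /* bit 31    */
#define PF_VEC           14            /* #PF */

static bool is_exception_n_sketch(uint32_t intr_info, uint8_t vector)
{
        return (intr_info & (TYPE_MASK | VECTOR_MASK | VALID_MASK)) ==
               (TYPE_HARD_EXCEPT | vector | VALID_MASK);
}

int main(void)
{
        uint32_t pf_exit = VALID_MASK | TYPE_HARD_EXCEPT | PF_VEC;  /* 0x8000030e */

        assert(is_exception_n_sketch(pf_exit, PF_VEC));
        assert(!is_exception_n_sketch(pf_exit, 1 /* #DB */));
        return 0;
}
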
@@@ -2129,16 -2138,14 +2140,16 @@@ static void vmx_vcpu_load(struct kvm_vc
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
  
 -              /* Setup TSC multiplier */
 -              if (cpu_has_vmx_tsc_scaling())
 -                      vmcs_write64(TSC_MULTIPLIER,
 -                                   vcpu->arch.tsc_scaling_ratio);
 -
                vmx->loaded_vmcs->cpu = cpu;
        }
  
 +      /* Setup TSC multiplier */
 +      if (kvm_has_tsc_control &&
 +          vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
 +              vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
 +              vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
 +      }
 +
        vmx_vcpu_pi_load(vcpu, cpu);
  }
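
The vmx_vcpu_load() change above stops writing TSC_MULTIPLIER only on the "first load on this pCPU" path and instead caches the last value written in vmx->current_tsc_ratio, so the comparatively costly VMCS write happens only when the scaling ratio actually changes, including after the vCPU migrates. A minimal sketch of that write-if-changed pattern, with hypothetical names and a printf standing in for the VMCS access:

#include <stdint.h>
#include <stdio.h>

struct vcpu_tsc_sketch {
        uint64_t current_tsc_ratio;     /* last value handed to the hardware */
};

static void hw_write_multiplier(uint64_t ratio)
{
        printf("VMCS write: TSC multiplier = %llu\n", (unsigned long long)ratio);
}

static void load_tsc_ratio(struct vcpu_tsc_sketch *v, uint64_t wanted_ratio)
{
        if (v->current_tsc_ratio == wanted_ratio)
                return;                         /* skip the redundant write */
        v->current_tsc_ratio = wanted_ratio;
        hw_write_multiplier(wanted_ratio);
}

int main(void)
{
        struct vcpu_tsc_sketch v = { 0 };

        load_tsc_ratio(&v, 1ULL << 48);         /* writes */
        load_tsc_ratio(&v, 1ULL << 48);         /* no-op  */
        return 0;
}
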
  
@@@ -5479,7 -5486,7 +5490,7 @@@ static int handle_set_cr4(struct kvm_vc
                return kvm_set_cr4(vcpu, val);
  }
  
 -/* called to set cr0 as approriate for clts instruction exit. */
 +/* called to set cr0 as appropriate for clts instruction exit. */
  static void handle_clts(struct kvm_vcpu *vcpu)
  {
        if (is_guest_mode(vcpu)) {
@@@ -5612,11 -5619,8 +5623,8 @@@ static int handle_dr(struct kvm_vcpu *v
        }
  
        if (vcpu->guest_debug == 0) {
-               u32 cpu_based_vm_exec_control;
-               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                               CPU_BASED_MOV_DR_EXITING);
  
                /*
                 * No more DR vmexits; force a reload of the debug registers
@@@ -5653,8 -5657,6 +5661,6 @@@ static void vmx_set_dr6(struct kvm_vcp
  
  static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
  {
-       u32 cpu_based_vm_exec_control;
        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
  
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
  }
  
  static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@@ -5751,8 -5750,7 +5754,7 @@@ static int handle_halt(struct kvm_vcpu 
  
  static int handle_vmcall(struct kvm_vcpu *vcpu)
  {
-       kvm_emulate_hypercall(vcpu);
-       return 1;
+       return kvm_emulate_hypercall(vcpu);
  }
  
  static int handle_invd(struct kvm_vcpu *vcpu)
@@@ -6439,8 -6437,8 +6441,8 @@@ static struct loaded_vmcs *nested_get_c
  
        if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
                /* Recycle the least recently used VMCS. */
-               item = list_entry(vmx->nested.vmcs02_pool.prev,
-                       struct vmcs02_list, list);
+               item = list_last_entry(&vmx->nested.vmcs02_pool,
+                                      struct vmcs02_list, list);
                item->vmptr = vmx->nested.current_vmptr;
                list_move(&item->list, &vmx->nested.vmcs02_pool);
                return &item->vmcs02;
@@@ -7227,7 -7225,7 +7229,7 @@@ static int handle_vmwrite(struct kvm_vc
        /* The value to write might be 32 or 64 bits, depending on L1's long
         * mode, and eventually we need to write that into a field of several
         * possible lengths. The code below first zero-extends the value to 64
 -       * bit (field_value), and then copies only the approriate number of
 +       * bit (field_value), and then copies only the appropriate number of
         * bits into the vmcs12 field.
         */
        u64 field_value = 0;
@@@ -7756,6 -7754,13 +7758,13 @@@ static bool nested_vmx_exit_handled(str
                else if (is_no_device(intr_info) &&
                         !(vmcs12->guest_cr0 & X86_CR0_TS))
                        return false;
+               else if (is_debug(intr_info) &&
+                        vcpu->guest_debug &
+                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+                       return false;
+               else if (is_breakpoint(intr_info) &&
+                        vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                       return false;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
@@@ -8360,7 -8365,6 +8369,7 @@@ static void vmx_complete_atomic_exit(st
  static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
  {
        u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 +      register void *__sp asm(_ASM_SP);
  
        /*
         * If external interrupt exists, IF bit is set in rflags/eflags on the
                        "call *%[entry]\n\t"
                        :
  #ifdef CONFIG_X86_64
 -                      [sp]"=&r"(tmp)
 +                      [sp]"=&r"(tmp),
  #endif
 +                      "+r"(__sp)
                        :
                        [entry]"r"(entry),
                        [ss]"i"(__KERNEL_DS),
@@@ -10770,13 -10773,26 +10779,26 @@@ static int vmx_update_pi_irte(struct kv
                 */
  
                kvm_set_msi_irq(e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+                       /*
+                        * Make sure the IRTE is in remapped mode if
+                        * we don't handle it in posted mode.
+                        */
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+                       if (ret < 0) {
+                               printk(KERN_INFO
+                                  "failed to fall back to remapped mode, irq: %u\n",

+                                  host_irq);
+                               goto out;
+                       }
                        continue;
+               }
  
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
  
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
  
                if (set)
diff --combined arch/x86/kvm/x86.c
index 4838d35c9641d6cee63da0e930224d7b1d446e8c,60d6c0036a98287eb0b2ac56106a4a027c70363d..82445a8bdf09b067d2d8bdb1717a02f6a3fc1d27
@@@ -123,6 -123,9 +123,9 @@@ module_param(tsc_tolerance_ppm, uint, S
  unsigned int __read_mostly lapic_timer_advance_ns = 0;
  module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
  
+ static bool __read_mostly vector_hashing = true;
+ module_param(vector_hashing, bool, S_IRUGO);
  static bool __read_mostly backwards_tsc_observed = false;
  
  #define KVM_NR_SHARED_MSRS 16
@@@ -1196,17 -1199,11 +1199,11 @@@ static void kvm_write_wall_clock(struc
  
  static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
  {
-       uint32_t quotient, remainder;
-       /* Don't try to replace with do_div(), this one calculates
-        * "(dividend << 32) / divisor" */
-       __asm__ ( "divl %4"
-                 : "=a" (quotient), "=d" (remainder)
-                 : "0" (0), "1" (dividend), "r" (divisor) );
-       return quotient;
+       do_shl32_div32(dividend, divisor);
+       return dividend;
  }
  
- static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
                               s8 *pshift, u32 *pmultiplier)
  {
        uint64_t scaled64;
        uint64_t tps64;
        uint32_t tps32;
  
-       tps64 = base_khz * 1000LL;
-       scaled64 = scaled_khz * 1000LL;
+       tps64 = base_hz;
+       scaled64 = scaled_hz;
        while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
                tps64 >>= 1;
                shift--;
        *pshift = shift;
        *pmultiplier = div_frac(scaled64, tps32);
  
-       pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
-                __func__, base_khz, scaled_khz, shift, *pmultiplier);
+       pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+                __func__, base_hz, scaled_hz, shift, *pmultiplier);
  }
  
  #ifdef CONFIG_X86_64
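
The deleted comment in div_frac() above spells out what the helper computes: "(dividend << 32) / divisor", i.e. the ratio as a 0.32 fixed-point fraction, now obtained through do_shl32_div32() instead of an inline divl. Purely to illustrate the arithmetic (and only for dividend < divisor, where the quotient fits in 32 bits), the same value can be computed with ordinary 64-bit math:

#include <assert.h>
#include <stdint.h>

/* (dividend << 32) / divisor, truncated to 32 bits: the fraction
 * dividend/divisor expressed in 0.32 fixed point. */
static uint32_t div_frac_sketch(uint32_t dividend, uint32_t divisor)
{
        return (uint32_t)(((uint64_t)dividend << 32) / divisor);
}

int main(void)
{
        assert(div_frac_sketch(1, 2) == 0x80000000u);   /* 1/2 */
        assert(div_frac_sketch(3, 4) == 0xC0000000u);   /* 3/4 */
        return 0;
}
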
@@@ -1293,23 -1290,23 +1290,23 @@@ static int set_tsc_khz(struct kvm_vcpu 
        return 0;
  }
  
- static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
  {
        u32 thresh_lo, thresh_hi;
        int use_scaling = 0;
  
        /* tsc_khz can be zero if TSC calibration fails */
-       if (this_tsc_khz == 0) {
+       if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
                vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
                return -1;
        }
  
        /* Compute a scale to convert nanoseconds in TSC cycles */
-       kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+       kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
                           &vcpu->arch.virtual_tsc_shift,
                           &vcpu->arch.virtual_tsc_mult);
-       vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+       vcpu->arch.virtual_tsc_khz = user_tsc_khz;
  
        /*
         * Compute the variation in TSC rate which is acceptable
         */
        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
-       if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
-               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+       if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
                use_scaling = 1;
        }
-       return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+       return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
  }
  
  static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@@ -1562,7 -1559,7 +1559,7 @@@ static cycle_t read_tsc(void
  
        /*
         * GCC likes to generate cmov here, but this branch is extremely
 -       * predictable (it's just a funciton of time and the likely is
 +       * predictable (it's just a function of time and the likely is
         * very likely) and there's a data dependence, so force GCC
         * to generate a branch instead.  I don't barrier() because
         * we don't actually need a barrier, and if this function
@@@ -1716,7 -1713,7 +1713,7 @@@ static void kvm_gen_update_masterclock(
  
  static int kvm_guest_time_update(struct kvm_vcpu *v)
  {
-       unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+       unsigned long flags, tgt_tsc_khz;
        struct kvm_vcpu_arch *vcpu = &v->arch;
        struct kvm_arch *ka = &v->kvm->arch;
        s64 kernel_ns;
  
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-       if (unlikely(this_tsc_khz == 0)) {
+       tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+       if (unlikely(tgt_tsc_khz == 0)) {
                local_irq_restore(flags);
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
                return 1;
        if (!vcpu->pv_time_enabled)
                return 0;
  
-       if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
-               tgt_tsc_khz = kvm_has_tsc_control ?
-                       vcpu->virtual_tsc_khz : this_tsc_khz;
-               kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+       if (kvm_has_tsc_control)
+               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+       if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+               kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
                                   &vcpu->hv_clock.tsc_shift,
                                   &vcpu->hv_clock.tsc_to_system_mul);
-               vcpu->hw_tsc_khz = this_tsc_khz;
+               vcpu->hw_tsc_khz = tgt_tsc_khz;
        }
  
        /* With all the info we got, fill in the values */
@@@ -2752,6 -2750,7 +2750,7 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
        }
  
        kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+       vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
  }
  
  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@@ -2987,7 -2986,7 +2986,7 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
  
        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
-           kvm_vcpu_has_lapic(vcpu))
+           lapic_in_kernel(vcpu))
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
  
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
                        vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
                else
                        vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
-               if (kvm_vcpu_has_lapic(vcpu)) {
+               if (lapic_in_kernel(vcpu)) {
                        if (events->smi.latched_init)
                                set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
                        else
@@@ -3240,7 -3239,7 +3239,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
        switch (ioctl) {
        case KVM_GET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
  
        }
        case KVM_SET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = memdup_user(argp, sizeof(*u.lapic));
                if (IS_ERR(u.lapic))
@@@ -3605,20 -3604,26 +3604,26 @@@ static int kvm_vm_ioctl_set_irqchip(str
  
  static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  {
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+       BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+       mutex_lock(&kps->lock);
+       memcpy(ps, &kps->channels, sizeof(*ps));
+       mutex_unlock(&kps->lock);
        return 0;
  }
  
  static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  {
        int i;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+       struct kvm_pit *pit = kvm->arch.vpit;
+       mutex_lock(&pit->pit_state.lock);
+       memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+               kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
@@@ -3638,29 -3643,39 +3643,39 @@@ static int kvm_vm_ioctl_set_pit2(struc
        int start = 0;
        int i;
        u32 prev_legacy, cur_legacy;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       struct kvm_pit *pit = kvm->arch.vpit;
+       mutex_lock(&pit->pit_state.lock);
+       prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
        cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
        if (!prev_legacy && cur_legacy)
                start = 1;
-       memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
-              sizeof(kvm->arch.vpit->pit_state.channels));
-       kvm->arch.vpit->pit_state.flags = ps->flags;
+       memcpy(&pit->pit_state.channels, &ps->channels,
+              sizeof(pit->pit_state.channels));
+       pit->pit_state.flags = ps->flags;
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+               kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
                                   start && i == 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
  static int kvm_vm_ioctl_reinject(struct kvm *kvm,
                                 struct kvm_reinject_control *control)
  {
-       if (!kvm->arch.vpit)
+       struct kvm_pit *pit = kvm->arch.vpit;
+       if (!pit)
                return -ENXIO;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       /* pit->pit_state.lock was overloaded to prevent userspace from getting
+        * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+        * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
+        */
+       mutex_lock(&pit->pit_state.lock);
+       kvm_pit_set_reinject(pit, control->pit_reinject);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
@@@ -4093,7 -4108,7 +4108,7 @@@ static int vcpu_mmio_write(struct kvm_v
  
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
                    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
@@@ -4113,7 -4128,7 +4128,7 @@@ static int vcpu_mmio_read(struct kvm_vc
  
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
                                         addr, n, v))
                    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@@ -4346,7 -4361,7 +4361,7 @@@ int emulator_write_phys(struct kvm_vcp
        ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
        if (ret < 0)
                return 0;
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+       kvm_page_track_write(vcpu, gpa, val, bytes);
        return 1;
  }
  
@@@ -4604,7 -4619,7 +4619,7 @@@ static int emulator_cmpxchg_emulated(st
                return X86EMUL_CMPXCHG_FAILED;
  
        kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
-       kvm_mmu_pte_write(vcpu, gpa, new, bytes);
+       kvm_page_track_write(vcpu, gpa, new, bytes);
  
        return X86EMUL_CONTINUE;
  
@@@ -6010,7 -6025,7 +6025,7 @@@ static void update_cr8_intercept(struc
        if (!kvm_x86_ops->update_cr8_intercept)
                return;
  
-       if (!vcpu->arch.apic)
+       if (!lapic_in_kernel(vcpu))
                return;
  
        if (vcpu->arch.apicv_active)
@@@ -6618,12 -6633,12 +6633,12 @@@ static int vcpu_enter_guest(struct kvm_
         * KVM_DEBUGREG_WONT_EXIT again.
         */
        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
 -              int i;
 -
                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
                kvm_x86_ops->sync_dirty_debug_regs(vcpu);
 -              for (i = 0; i < KVM_NR_DB_REGS; i++)
 -                      vcpu->arch.eff_db[i] = vcpu->arch.db[i];
 +              kvm_update_dr0123(vcpu);
 +              kvm_update_dr6(vcpu);
 +              kvm_update_dr7(vcpu);
 +              vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
  
        /*
@@@ -7038,7 -7053,7 +7053,7 @@@ int kvm_arch_vcpu_ioctl_get_mpstate(str
  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
  {
-       if (!kvm_vcpu_has_lapic(vcpu) &&
+       if (!lapic_in_kernel(vcpu) &&
            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                return -EINVAL;
  
@@@ -7593,6 -7608,7 +7608,7 @@@ bool kvm_vcpu_compatible(struct kvm_vcp
  }
  
  struct static_key kvm_no_apic_vcpu __read_mostly;
+ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
  
  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
  {
@@@ -7724,6 -7740,9 +7740,9 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
  
+       kvm_page_track_init(kvm);
+       kvm_mmu_init_vm(kvm);
        return 0;
  }
  
@@@ -7850,6 -7869,7 +7869,7 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
        kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kvm_mmu_uninit_vm(kvm);
  }
  
  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                        free->arch.lpage_info[i - 1] = NULL;
                }
        }
+       kvm_page_track_free_memslot(free, dont);
  }
  
  int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
        int i;
  
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                int lpages;
                int level = i + 1;
                if (i == 0)
                        continue;
  
-               slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
-                                       sizeof(*slot->arch.lpage_info[i - 1]));
-               if (!slot->arch.lpage_info[i - 1])
+               linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+               if (!linfo)
                        goto out_free;
  
+               slot->arch.lpage_info[i - 1] = linfo;
                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][0].write_count = 1;
+                       linfo[0].disallow_lpage = 1;
                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+                       linfo[lpages - 1].disallow_lpage = 1;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
                        unsigned long j;
  
                        for (j = 0; j < lpages; ++j)
-                               slot->arch.lpage_info[i - 1][j].write_count = 1;
+                               linfo[j].disallow_lpage = 1;
                }
        }
  
+       if (kvm_page_track_create_memslot(slot, npages))
+               goto out_free;
        return 0;
  
  out_free:
@@@ -8370,6 -8397,12 +8397,12 @@@ int kvm_arch_update_irqfd_routing(struc
        return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
  }
  
+ bool kvm_vector_hashing_enabled(void)
+ {
+       return vector_hashing;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --combined virt/kvm/async_pf.c
index bd3e7d8496e841b54cc0fe79c92a7dbf7948788c,c7e447c4296e3d5d33ee0bf00ade7a8739e7919d..db9668869f6ff6866a72f278def5770d71190994
@@@ -79,13 -79,7 +79,13 @@@ static void async_pf_execute(struct wor
  
        might_sleep();
  
 -      get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL);
 +      /*
 +       * This work is run asynchronously to the task which owns
 +       * mm and might be done in another context, so we must
 +       * use FOLL_REMOTE.
 +       */
 +      __get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL, FOLL_REMOTE);
 +
        kvm_async_page_present_sync(vcpu, apf);
  
        spin_lock(&vcpu->async_pf.lock);
         * This memory barrier pairs with prepare_to_wait's set_current_state()
         */
        smp_mb();
 -      if (waitqueue_active(&vcpu->wq))
 -              wake_up_interruptible(&vcpu->wq);
 +      if (swait_active(&vcpu->wq))
 +              swake_up(&vcpu->wq);
  
        mmput(mm);
        kvm_put_kvm(vcpu->kvm);
@@@ -115,8 -109,8 +115,8 @@@ void kvm_clear_async_pf_completion_queu
        /* cancel outstanding work queue item */
        while (!list_empty(&vcpu->async_pf.queue)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.queue.next,
-                                  typeof(*work), queue);
+                       list_first_entry(&vcpu->async_pf.queue,
+                                        typeof(*work), queue);
                list_del(&work->queue);
  
  #ifdef CONFIG_KVM_ASYNC_PF_SYNC
        spin_lock(&vcpu->async_pf.lock);
        while (!list_empty(&vcpu->async_pf.done)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.done.next,
-                                  typeof(*work), link);
+                       list_first_entry(&vcpu->async_pf.done,
+                                        typeof(*work), link);
                list_del(&work->link);
                kmem_cache_free(async_pf_cache, work);
        }
@@@ -178,7 -172,7 +178,7 @@@ int kvm_setup_async_pf(struct kvm_vcpu 
         * do alloc nowait since if we are going to sleep anyway we
         * may as well sleep faulting in page
         */
 -      work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
 +      work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN);
        if (!work)
                return 0;
  
diff --combined virt/kvm/kvm_main.c
index 1ca025816a8b1a23b54d476418b65c4d41717386,1eae05236347f1d1c4c6cc9912a524a845c53218..a6b987886b6c1001e68fbe09dc770778abfe5244
@@@ -72,11 -72,11 +72,11 @@@ module_param(halt_poll_ns, uint, S_IRUG
  
  /* Default doubles per-vcpu halt_poll_ns. */
  static unsigned int halt_poll_ns_grow = 2;
- module_param(halt_poll_ns_grow, int, S_IRUGO);
+ module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR);
  
  /* Default resets per-vcpu halt_poll_ns . */
  static unsigned int halt_poll_ns_shrink;
- module_param(halt_poll_ns_shrink, int, S_IRUGO);
+ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
  
  /*
   * Ordering of locks:
@@@ -216,7 -216,8 +216,7 @@@ int kvm_vcpu_init(struct kvm_vcpu *vcpu
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
 -      vcpu->halt_poll_ns = 0;
 -      init_waitqueue_head(&vcpu->wq);
 +      init_swait_queue_head(&vcpu->wq);
        kvm_async_pf_vcpu_init(vcpu);
  
        vcpu->pre_pcpu = -1;
@@@ -619,13 -620,10 +619,10 @@@ void *kvm_kvzalloc(unsigned long size
  
  static void kvm_destroy_devices(struct kvm *kvm)
  {
-       struct list_head *node, *tmp;
+       struct kvm_device *dev, *tmp;
  
-       list_for_each_safe(node, tmp, &kvm->devices) {
-               struct kvm_device *dev =
-                       list_entry(node, struct kvm_device, vm_node);
-               list_del(node);
+       list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
+               list_del(&dev->vm_node);
                dev->ops->destroy(dev);
        }
  }
@@@ -1263,16 -1261,15 +1260,16 @@@ unsigned long kvm_vcpu_gfn_to_hva_prot(
        return gfn_to_hva_memslot_prot(slot, gfn, writable);
  }
  
 -static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 -      unsigned long start, int write, struct page **page)
 +static int get_user_page_nowait(unsigned long start, int write,
 +              struct page **page)
  {
        int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
  
        if (write)
                flags |= FOLL_WRITE;
  
 -      return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 +      return __get_user_pages(current, current->mm, start, 1, flags, page,
 +                      NULL, NULL);
  }
  
  static inline int check_user_page_hwpoison(unsigned long addr)
@@@ -1334,7 -1331,8 +1331,7 @@@ static int hva_to_pfn_slow(unsigned lon
  
        if (async) {
                down_read(&current->mm->mmap_sem);
 -              npages = get_user_page_nowait(current, current->mm,
 -                                            addr, write_fault, page);
 +              npages = get_user_page_nowait(addr, write_fault, page);
                up_read(&current->mm->mmap_sem);
        } else
                npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
@@@ -1436,11 -1434,17 +1433,17 @@@ kvm_pfn_t __gfn_to_pfn_memslot(struct k
  {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
  
-       if (addr == KVM_HVA_ERR_RO_BAD)
+       if (addr == KVM_HVA_ERR_RO_BAD) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_ERR_RO_FAULT;
+       }
  
-       if (kvm_is_error_hva(addr))
+       if (kvm_is_error_hva(addr)) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_NOSLOT;
+       }
  
        /* Do not map writable pfn in the readonly memslot. */
        if (writable && memslot_is_readonly(slot)) {
@@@ -1942,14 -1946,15 +1945,15 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_di
  
  static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
  {
-       int old, val;
+       unsigned int old, val, grow;
  
        old = val = vcpu->halt_poll_ns;
+       grow = READ_ONCE(halt_poll_ns_grow);
        /* 10us base */
-       if (val == 0 && halt_poll_ns_grow)
+       if (val == 0 && grow)
                val = 10000;
        else
-               val *= halt_poll_ns_grow;
+               val *= grow;
  
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
  
  static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
  {
-       int old, val;
+       unsigned int old, val, shrink;
  
        old = val = vcpu->halt_poll_ns;
-       if (halt_poll_ns_shrink == 0)
+       shrink = READ_ONCE(halt_poll_ns_shrink);
+       if (shrink == 0)
                val = 0;
        else
-               val /= halt_poll_ns_shrink;
+               val /= shrink;
  
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
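
Because the module_param() hunk near the top of this file makes halt_poll_ns_grow and halt_poll_ns_shrink writable at runtime (S_IWUSR), grow_halt_poll_ns() and shrink_halt_poll_ns() above now take a single READ_ONCE() snapshot of the tunable so a concurrent sysfs write cannot change the value mid-calculation. A stand-alone sketch of the grow policy with that one-shot snapshot, where a relaxed atomic load stands in for READ_ONCE():

#include <assert.h>

/* Runtime-writable tunable; another thread may store to it at any time. */
static unsigned int halt_poll_ns_grow_sketch = 2;

static unsigned int grow_halt_poll_sketch(unsigned int val)
{
        unsigned int grow = __atomic_load_n(&halt_poll_ns_grow_sketch,
                                            __ATOMIC_RELAXED);

        if (val == 0 && grow)
                return 10000;           /* 10us base, as in the hunk above */
        return val * grow;
}

int main(void)
{
        assert(grow_halt_poll_sketch(0) == 10000);
        assert(grow_halt_poll_sketch(10000) == 20000);
        return 0;
}
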
@@@ -1989,7 -1995,7 +1994,7 @@@ static int kvm_vcpu_check_block(struct 
  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
  {
        ktime_t start, cur;
 -      DEFINE_WAIT(wait);
 +      DECLARE_SWAITQUEUE(wait);
        bool waited = false;
        u64 block_ns;
  
        kvm_arch_vcpu_blocking(vcpu);
  
        for (;;) {
 -              prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 +              prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
  
                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;
                schedule();
        }
  
 -      finish_wait(&vcpu->wq, &wait);
 +      finish_swait(&vcpu->wq, &wait);
        cur = ktime_get();
  
        kvm_arch_vcpu_unblocking(vcpu);
@@@ -2055,11 -2061,11 +2060,11 @@@ void kvm_vcpu_kick(struct kvm_vcpu *vcp
  {
        int me;
        int cpu = vcpu->cpu;
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
  
        wqp = kvm_arch_vcpu_wq(vcpu);
 -      if (waitqueue_active(wqp)) {
 -              wake_up_interruptible(wqp);
 +      if (swait_active(wqp)) {
 +              swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
  
@@@ -2160,7 -2166,7 +2165,7 @@@ void kvm_vcpu_on_spin(struct kvm_vcpu *
                                continue;
                        if (vcpu == me)
                                continue;
 -                      if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
 +                      if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;