]> git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm into next
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 4 Jun 2014 15:47:12 +0000 (08:47 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 4 Jun 2014 15:47:12 +0000 (08:47 -0700)
Pull KVM updates from Paolo Bonzini:
 "At over 200 commits, covering almost all supported architectures, this
  was a pretty active cycle for KVM.  Changes include:

   - a lot of s390 changes: optimizations, support for migration, GDB
     support and more

   - ARM changes are pretty small: support for the PSCI 0.2 hypercall
     interface on both the guest and the host (the latter acked by
     Catalin)

   - initial POWER8 and little-endian host support

   - support for running u-boot on embedded POWER targets

   - pretty large changes to MIPS too, completing the userspace
     interface and improving the handling of virtualized timer hardware

   - for x86, a larger set of changes is scheduled for 3.17.  Still, we
     have a few emulator bugfixes and support for running nested
     fully-virtualized Xen guests (para-virtualized Xen guests have
     always worked).  And some optimizations too.

  The only missing architecture here is ia64.  It's not a coincidence
  that support for KVM on ia64 is scheduled for removal in 3.17"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (203 commits)
  KVM: add missing cleanup_srcu_struct
  KVM: PPC: Book3S PR: Rework SLB switching code
  KVM: PPC: Book3S PR: Use SLB entry 0
  KVM: PPC: Book3S HV: Fix machine check delivery to guest
  KVM: PPC: Book3S HV: Work around POWER8 performance monitor bugs
  KVM: PPC: Book3S HV: Make sure we don't miss dirty pages
  KVM: PPC: Book3S HV: Fix dirty map for hugepages
  KVM: PPC: Book3S HV: Put huge-page HPTEs in rmap chain for base address
  KVM: PPC: Book3S HV: Fix check for running inside guest in global_invalidates()
  KVM: PPC: Book3S: Move KVM_REG_PPC_WORT to an unused register number
  KVM: PPC: Book3S: Add ONE_REG register names that were missed
  KVM: PPC: Add CAP to indicate hcall fixes
  KVM: PPC: MPIC: Reset IRQ source private members
  KVM: PPC: Graciously fail broken LE hypercalls
  PPC: ePAPR: Fix hypercall on LE guest
  KVM: PPC: BOOK3S: Remove open coded make_dsisr in alignment handler
  KVM: PPC: BOOK3S: Always use the saved DAR value
  PPC: KVM: Make NX bit available with magic page
  KVM: PPC: Disable NX for old magic page using guests
  KVM: PPC: BOOK3S: HV: Add mixed page-size support for guest
  ...

133 files changed:
Documentation/devicetree/bindings/arm/psci.txt
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/devices/vm.txt [new file with mode: 0644]
Documentation/virtual/kvm/ppc-pv.txt
Documentation/virtual/kvm/s390-diag.txt
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_psci.h
arch/arm/include/asm/psci.h
arch/arm/include/uapi/asm/kvm.h
arch/arm/kernel/psci.c
arch/arm/kernel/psci_smp.c
arch/arm/kvm/arm.c
arch/arm/kvm/handle_exit.c
arch/arm/kvm/psci.c
arch/arm64/include/asm/cpu_ops.h
arch/arm64/include/asm/cputype.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_psci.h
arch/arm64/include/asm/psci.h
arch/arm64/include/uapi/asm/kvm.h
arch/arm64/kernel/psci.c
arch/arm64/kernel/smp.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/sys_regs_generic_v8.c
arch/mips/Kconfig
arch/mips/include/asm/kvm_host.h
arch/mips/include/uapi/asm/kvm.h
arch/mips/kvm/kvm_locore.S
arch/mips/kvm/kvm_mips.c
arch/mips/kvm/kvm_mips_dyntrans.c
arch/mips/kvm/kvm_mips_emul.c
arch/mips/kvm/kvm_tlb.c
arch/mips/kvm/kvm_trap_emul.c
arch/mips/mm/cache.c
arch/mips/mti-malta/malta-time.c
arch/powerpc/include/asm/disassemble.h
arch/powerpc/include/asm/kvm_asm.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/reg_booke.h
arch/powerpc/include/uapi/asm/kvm.h
arch/powerpc/include/uapi/asm/kvm_para.h
arch/powerpc/kernel/align.c
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/epapr_paravirt.c
arch/powerpc/kernel/kvm.c
arch/powerpc/kernel/paca.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_32_mmu.c
arch/powerpc/kvm/book3s_32_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu.c
arch/powerpc/kvm/book3s_64_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_slb.S
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_exports.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_interrupts.S
arch/powerpc/kvm/book3s_paired_singles.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/book3s_pr_papr.c
arch/powerpc/kvm/book3s_rtas.c
arch/powerpc/kvm/book3s_segment.S
arch/powerpc/kvm/e500_emulate.c
arch/powerpc/kvm/emulate.c
arch/powerpc/kvm/mpic.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/trace_pr.h
arch/powerpc/mm/slb.c
arch/s390/include/asm/ctl_reg.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/lowcore.h
arch/s390/include/asm/mmu.h
arch/s390/include/asm/mmu_context.h
arch/s390/include/asm/pgalloc.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/ptrace.h
arch/s390/include/asm/sclp.h
arch/s390/include/uapi/asm/kvm.h
arch/s390/include/uapi/asm/sie.h [new file with mode: 0644]
arch/s390/kernel/asm-offsets.c
arch/s390/kernel/entry.S
arch/s390/kernel/entry64.S
arch/s390/kvm/Makefile
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.c [new file with mode: 0644]
arch/s390/kvm/gaccess.h
arch/s390/kvm/guestdbg.c [new file with mode: 0644]
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/kvm/sigp.c
arch/s390/kvm/trace-s390.h
arch/s390/kvm/trace.h
arch/s390/mm/pgtable.c
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/traps.h
arch/x86/kernel/kvm.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/emulate.c
arch/x86/kvm/irq.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/pmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/s390/char/sclp_early.c
include/linux/kvm_host.h
include/uapi/linux/Kbuild
include/uapi/linux/kvm.h
include/uapi/linux/psci.h [new file with mode: 0644]
virt/kvm/async_pf.c
virt/kvm/eventfd.c
virt/kvm/irq_comm.c
virt/kvm/irqchip.c
virt/kvm/kvm_main.c

index 433afe9cb59097fd1c445f4e3a79684d1664c634..b4a58f39223cbde877347a65475e63cf1784112a 100644 (file)
@@ -21,7 +21,15 @@ to #0.
 
 Main node required properties:
 
- - compatible    : Must be "arm,psci"
+ - compatible    : should contain at least one of:
+
+                                * "arm,psci" : for implementations complying to PSCI versions prior to
+                                       0.2. For these cases function IDs must be provided.
+
+                                * "arm,psci-0.2" : for implementations complying to PSCI 0.2. Function
+                                       IDs are not required and should be ignored by an OS with PSCI 0.2
+                                       support, but are permitted to be present for compatibility with
+                                       existing software when "arm,psci" is later in the compatible list.
 
  - method        : The method of calling the PSCI firmware. Permitted
                    values are:
@@ -45,6 +53,8 @@ Main node optional properties:
 
 Example:
 
+Case 1: PSCI v0.1 only.
+
        psci {
                compatible      = "arm,psci";
                method          = "smc";
@@ -53,3 +63,28 @@ Example:
                cpu_on          = <0x95c10002>;
                migrate         = <0x95c10003>;
        };
+
+
+Case 2: PSCI v0.2 only
+
+       psci {
+               compatible      = "arm,psci-0.2";
+               method          = "smc";
+       };
+
+Case 3: PSCI v0.2 and PSCI v0.1.
+
+       A DTB may provide IDs for use by kernels without PSCI 0.2 support,
+       enabling firmware and hypervisors to support existing and new kernels.
+       These IDs will be ignored by kernels with PSCI 0.2 support, which will
+       use the standard PSCI 0.2 IDs exclusively.
+
+       psci {
+               compatible = "arm,psci-0.2", "arm,psci";
+               method = "hvc";
+
+               cpu_on = < arbitrary value >;
+               cpu_off = < arbitrary value >;
+
+               ...
+       };
index b4f53653c106d40d648930776e3258278c476932..75f20c6038a9290f801703981d600c9fd5f34796 100644 (file)
@@ -1794,6 +1794,11 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_MMCR0     | 64
   PPC   | KVM_REG_PPC_MMCR1     | 64
   PPC   | KVM_REG_PPC_MMCRA     | 64
+  PPC   | KVM_REG_PPC_MMCR2     | 64
+  PPC   | KVM_REG_PPC_MMCRS     | 64
+  PPC   | KVM_REG_PPC_SIAR      | 64
+  PPC   | KVM_REG_PPC_SDAR      | 64
+  PPC   | KVM_REG_PPC_SIER      | 64
   PPC   | KVM_REG_PPC_PMC1      | 32
   PPC   | KVM_REG_PPC_PMC2      | 32
   PPC   | KVM_REG_PPC_PMC3      | 32
@@ -1868,6 +1873,7 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_PPR      | 64
   PPC   | KVM_REG_PPC_ARCH_COMPAT 32
   PPC   | KVM_REG_PPC_DABRX     | 32
+  PPC   | KVM_REG_PPC_WORT      | 64
   PPC   | KVM_REG_PPC_TM_GPR0  | 64
           ...
   PPC   | KVM_REG_PPC_TM_GPR31 | 64
@@ -2211,6 +2217,8 @@ KVM_S390_SIGP_STOP (vcpu) - sigp restart
 KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm
 KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm
 KVM_S390_RESTART (vcpu) - restart
+KVM_S390_INT_CLOCK_COMP (vcpu) - clock comparator interrupt
+KVM_S390_INT_CPU_TIMER (vcpu) - CPU timer interrupt
 KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt
                           parameters in parm and parm64
 KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm
@@ -2314,8 +2322,8 @@ struct kvm_create_device {
 
 4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
 
-Capability: KVM_CAP_DEVICE_CTRL
-Type: device ioctl
+Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device
+Type: device ioctl, vm ioctl
 Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
@@ -2340,8 +2348,8 @@ struct kvm_device_attr {
 
 4.81 KVM_HAS_DEVICE_ATTR
 
-Capability: KVM_CAP_DEVICE_CTRL
-Type: device ioctl
+Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device
+Type: device ioctl, vm ioctl
 Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
@@ -2376,6 +2384,8 @@ Possible features:
          Depends on KVM_CAP_ARM_PSCI.
        - KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode.
          Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only).
+       - KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 for the CPU.
+         Depends on KVM_CAP_ARM_PSCI_0_2.
 
 
 4.83 KVM_ARM_PREFERRED_TARGET
@@ -2738,6 +2748,21 @@ It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an
 external interrupt has just been delivered into the guest. User space
 should put the acknowledged interrupt vector into the 'epr' field.
 
+               /* KVM_EXIT_SYSTEM_EVENT */
+               struct {
+#define KVM_SYSTEM_EVENT_SHUTDOWN       1
+#define KVM_SYSTEM_EVENT_RESET          2
+                       __u32 type;
+                       __u64 flags;
+               } system_event;
+
+If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
+a system-level event using some architecture specific mechanism (hypercall
+or some special instruction). In case of ARM/ARM64, this is triggered using
+HVC instruction based PSCI call from the vcpu. The 'type' field describes
+the system-level event type. The 'flags' field describes architecture
+specific flags for the system-level event.
+
                /* Fix the size of the union. */
                char padding[256];
        };
diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
new file mode 100644 (file)
index 0000000..0d16f96
--- /dev/null
@@ -0,0 +1,26 @@
+Generic vm interface
+====================================
+
+The virtual machine "device" also accepts the ioctls KVM_SET_DEVICE_ATTR,
+KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same
+struct kvm_device_attr as other devices, but targets VM-wide settings
+and controls.
+
+The groups and attributes per virtual machine, if any, are architecture
+specific.
+
+1. GROUP: KVM_S390_VM_MEM_CTRL
+Architectures: s390
+
+1.1. ATTRIBUTE: KVM_S390_VM_MEM_CTRL
+Parameters: none
+Returns: -EBUSY if a vcpu is already defined, otherwise 0
+
+Enables CMMA for the virtual machine
+
+1.2. ATTRIBUTE: KVM_S390_VM_CLR_CMMA
+Parameters: none
+Returns: 0
+
+Clear the CMMA status for all guest pages, so any pages the guest marked
+as unused are again used and may not be reclaimed by the host.
index 4643cde517c4a870b2ec4f97b517d6f566a8e303..319560646f3293020469c4cd2eddc5588cb2fac9 100644 (file)
@@ -94,10 +94,24 @@ a bitmap of available features inside the magic page.
 The following enhancements to the magic page are currently available:
 
   KVM_MAGIC_FEAT_SR            Maps SR registers r/w in the magic page
+  KVM_MAGIC_FEAT_MAS0_TO_SPRG7 Maps MASn, ESR, PIR and high SPRGs
 
 For enhanced features in the magic page, please check for the existence of the
 feature before using them!
 
+Magic page flags
+================
+
+In addition to features that indicate whether a host is capable of a particular
+feature we also have a channel for a guest to tell the host whether it's capable
+of something. This is what we call "flags".
+
+Flags are passed to the host in the low 12 bits of the Effective Address.
+
+The following flags are currently available for a guest to expose:
+
+  MAGIC_PAGE_FLAG_NOT_MAPPED_NX Guest handles NX bits correctly wrt magic page
+
 MSR bits
 ========
 
index f1de4fbade155b3ccccdf7d0328cdf0adacf2a7f..48c4921794edf0b98c9dfb59e0361b235d3124e2 100644 (file)
@@ -78,3 +78,5 @@ DIAGNOSE function code 'X'501 - KVM breakpoint
 
 If the function code specifies 0x501, breakpoint functions may be performed.
 This function code is handled by userspace.
+
+This diagnose function code has no subfunctions and uses no parameters.
index 09af14999c9b2458adee96fc9a9c8ff621a720ca..193ceaf01bfd00078fd150e4b80e1edef2324f91 100644 (file)
@@ -36,7 +36,7 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HAVE_ONE_REG
 
-#define KVM_VCPU_MAX_FEATURES 1
+#define KVM_VCPU_MAX_FEATURES 2
 
 #include <kvm/arm_vgic.h>
 
index 9a83d98bf170c2c158c050ff275337c6a59537cc..6bda945d31fa8effe1b9d51c589733c5f9f66183 100644 (file)
 #ifndef __ARM_KVM_PSCI_H__
 #define __ARM_KVM_PSCI_H__
 
-bool kvm_psci_call(struct kvm_vcpu *vcpu);
+#define KVM_ARM_PSCI_0_1       1
+#define KVM_ARM_PSCI_0_2       2
+
+int kvm_psci_version(struct kvm_vcpu *vcpu);
+int kvm_psci_call(struct kvm_vcpu *vcpu);
 
 #endif /* __ARM_KVM_PSCI_H__ */
index c4ae171850f8326bef2c3b991693771ae2f13d69..c25ef3ec6d1f85dc1e259dd1f777ce546d33d97e 100644 (file)
@@ -29,16 +29,19 @@ struct psci_operations {
        int (*cpu_off)(struct psci_power_state state);
        int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
        int (*migrate)(unsigned long cpuid);
+       int (*affinity_info)(unsigned long target_affinity,
+                       unsigned long lowest_affinity_level);
+       int (*migrate_info_type)(void);
 };
 
 extern struct psci_operations psci_ops;
 extern struct smp_operations psci_smp_ops;
 
 #ifdef CONFIG_ARM_PSCI
-void psci_init(void);
+int psci_init(void);
 bool psci_smp_available(void);
 #else
-static inline void psci_init(void) { }
+static inline int psci_init(void) { return 0; }
 static inline bool psci_smp_available(void) { return false; }
 #endif
 
index ef0c8785ba165f62be49e1498653f403a6a3c757..e6ebdd3471e566f7e3d39e8c387f583342c2b9c7 100644 (file)
@@ -20,6 +20,7 @@
 #define __ARM_KVM_H__
 
 #include <linux/types.h>
+#include <linux/psci.h>
 #include <asm/ptrace.h>
 
 #define __KVM_HAVE_GUEST_DEBUG
@@ -83,6 +84,7 @@ struct kvm_regs {
 #define KVM_VGIC_V2_CPU_SIZE           0x2000
 
 #define KVM_ARM_VCPU_POWER_OFF         0 /* CPU is started in OFF state */
+#define KVM_ARM_VCPU_PSCI_0_2          1 /* CPU uses PSCI v0.2 */
 
 struct kvm_vcpu_init {
        __u32 target;
@@ -201,9 +203,9 @@ struct kvm_arch_memory_slot {
 #define KVM_PSCI_FN_CPU_ON             KVM_PSCI_FN(2)
 #define KVM_PSCI_FN_MIGRATE            KVM_PSCI_FN(3)
 
-#define KVM_PSCI_RET_SUCCESS           0
-#define KVM_PSCI_RET_NI                        ((unsigned long)-1)
-#define KVM_PSCI_RET_INVAL             ((unsigned long)-2)
-#define KVM_PSCI_RET_DENIED            ((unsigned long)-3)
+#define KVM_PSCI_RET_SUCCESS           PSCI_RET_SUCCESS
+#define KVM_PSCI_RET_NI                        PSCI_RET_NOT_SUPPORTED
+#define KVM_PSCI_RET_INVAL             PSCI_RET_INVALID_PARAMS
+#define KVM_PSCI_RET_DENIED            PSCI_RET_DENIED
 
 #endif /* __ARM_KVM_H__ */
index 46931880093dc1bb2382b893743a5da4168f87a0..f73891b6b7300dc67e353a8694a035e9a212c2d7 100644 (file)
 
 #include <linux/init.h>
 #include <linux/of.h>
+#include <linux/reboot.h>
+#include <linux/pm.h>
+#include <uapi/linux/psci.h>
 
 #include <asm/compiler.h>
 #include <asm/errno.h>
 #include <asm/opcodes-sec.h>
 #include <asm/opcodes-virt.h>
 #include <asm/psci.h>
+#include <asm/system_misc.h>
 
 struct psci_operations psci_ops;
 
 static int (*invoke_psci_fn)(u32, u32, u32, u32);
+typedef int (*psci_initcall_t)(const struct device_node *);
 
 enum psci_function {
        PSCI_FN_CPU_SUSPEND,
        PSCI_FN_CPU_ON,
        PSCI_FN_CPU_OFF,
        PSCI_FN_MIGRATE,
+       PSCI_FN_AFFINITY_INFO,
+       PSCI_FN_MIGRATE_INFO_TYPE,
        PSCI_FN_MAX,
 };
 
 static u32 psci_function_id[PSCI_FN_MAX];
 
-#define PSCI_RET_SUCCESS               0
-#define PSCI_RET_EOPNOTSUPP            -1
-#define PSCI_RET_EINVAL                        -2
-#define PSCI_RET_EPERM                 -3
-
 static int psci_to_linux_errno(int errno)
 {
        switch (errno) {
        case PSCI_RET_SUCCESS:
                return 0;
-       case PSCI_RET_EOPNOTSUPP:
+       case PSCI_RET_NOT_SUPPORTED:
                return -EOPNOTSUPP;
-       case PSCI_RET_EINVAL:
+       case PSCI_RET_INVALID_PARAMS:
                return -EINVAL;
-       case PSCI_RET_EPERM:
+       case PSCI_RET_DENIED:
                return -EPERM;
        };
 
        return -EINVAL;
 }
 
-#define PSCI_POWER_STATE_ID_MASK       0xffff
-#define PSCI_POWER_STATE_ID_SHIFT      0
-#define PSCI_POWER_STATE_TYPE_MASK     0x1
-#define PSCI_POWER_STATE_TYPE_SHIFT    16
-#define PSCI_POWER_STATE_AFFL_MASK     0x3
-#define PSCI_POWER_STATE_AFFL_SHIFT    24
-
 static u32 psci_power_state_pack(struct psci_power_state state)
 {
-       return  ((state.id & PSCI_POWER_STATE_ID_MASK)
-                       << PSCI_POWER_STATE_ID_SHIFT)   |
-               ((state.type & PSCI_POWER_STATE_TYPE_MASK)
-                       << PSCI_POWER_STATE_TYPE_SHIFT) |
-               ((state.affinity_level & PSCI_POWER_STATE_AFFL_MASK)
-                       << PSCI_POWER_STATE_AFFL_SHIFT);
+       return ((state.id << PSCI_0_2_POWER_STATE_ID_SHIFT)
+                       & PSCI_0_2_POWER_STATE_ID_MASK) |
+               ((state.type << PSCI_0_2_POWER_STATE_TYPE_SHIFT)
+                & PSCI_0_2_POWER_STATE_TYPE_MASK) |
+               ((state.affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT)
+                & PSCI_0_2_POWER_STATE_AFFL_MASK);
 }
 
 /*
@@ -110,6 +105,14 @@ static noinline int __invoke_psci_fn_smc(u32 function_id, u32 arg0, u32 arg1,
        return function_id;
 }
 
+static int psci_get_version(void)
+{
+       int err;
+
+       err = invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
+       return err;
+}
+
 static int psci_cpu_suspend(struct psci_power_state state,
                            unsigned long entry_point)
 {
@@ -153,26 +156,36 @@ static int psci_migrate(unsigned long cpuid)
        return psci_to_linux_errno(err);
 }
 
-static const struct of_device_id psci_of_match[] __initconst = {
-       { .compatible = "arm,psci",     },
-       {},
-};
+static int psci_affinity_info(unsigned long target_affinity,
+               unsigned long lowest_affinity_level)
+{
+       int err;
+       u32 fn;
+
+       fn = psci_function_id[PSCI_FN_AFFINITY_INFO];
+       err = invoke_psci_fn(fn, target_affinity, lowest_affinity_level, 0);
+       return err;
+}
 
-void __init psci_init(void)
+static int psci_migrate_info_type(void)
 {
-       struct device_node *np;
-       const char *method;
-       u32 id;
+       int err;
+       u32 fn;
 
-       np = of_find_matching_node(NULL, psci_of_match);
-       if (!np)
-               return;
+       fn = psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE];
+       err = invoke_psci_fn(fn, 0, 0, 0);
+       return err;
+}
+
+static int get_set_conduit_method(struct device_node *np)
+{
+       const char *method;
 
-       pr_info("probing function IDs from device-tree\n");
+       pr_info("probing for conduit method from DT.\n");
 
        if (of_property_read_string(np, "method", &method)) {
-               pr_warning("missing \"method\" property\n");
-               goto out_put_node;
+               pr_warn("missing \"method\" property\n");
+               return -ENXIO;
        }
 
        if (!strcmp("hvc", method)) {
@@ -180,10 +193,99 @@ void __init psci_init(void)
        } else if (!strcmp("smc", method)) {
                invoke_psci_fn = __invoke_psci_fn_smc;
        } else {
-               pr_warning("invalid \"method\" property: %s\n", method);
+               pr_warn("invalid \"method\" property: %s\n", method);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd)
+{
+       invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0);
+}
+
+static void psci_sys_poweroff(void)
+{
+       invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
+}
+
+/*
+ * PSCI Function IDs for v0.2+ are well defined so use
+ * standard values.
+ */
+static int psci_0_2_init(struct device_node *np)
+{
+       int err, ver;
+
+       err = get_set_conduit_method(np);
+
+       if (err)
+               goto out_put_node;
+
+       ver = psci_get_version();
+
+       if (ver == PSCI_RET_NOT_SUPPORTED) {
+               /* PSCI v0.2 mandates implementation of PSCI_ID_VERSION. */
+               pr_err("PSCI firmware does not comply with the v0.2 spec.\n");
+               err = -EOPNOTSUPP;
                goto out_put_node;
+       } else {
+               pr_info("PSCIv%d.%d detected in firmware.\n",
+                               PSCI_VERSION_MAJOR(ver),
+                               PSCI_VERSION_MINOR(ver));
+
+               if (PSCI_VERSION_MAJOR(ver) == 0 &&
+                               PSCI_VERSION_MINOR(ver) < 2) {
+                       err = -EINVAL;
+                       pr_err("Conflicting PSCI version detected.\n");
+                       goto out_put_node;
+               }
        }
 
+       pr_info("Using standard PSCI v0.2 function IDs\n");
+       psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN_CPU_SUSPEND;
+       psci_ops.cpu_suspend = psci_cpu_suspend;
+
+       psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
+       psci_ops.cpu_off = psci_cpu_off;
+
+       psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN_CPU_ON;
+       psci_ops.cpu_on = psci_cpu_on;
+
+       psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN_MIGRATE;
+       psci_ops.migrate = psci_migrate;
+
+       psci_function_id[PSCI_FN_AFFINITY_INFO] = PSCI_0_2_FN_AFFINITY_INFO;
+       psci_ops.affinity_info = psci_affinity_info;
+
+       psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE] =
+               PSCI_0_2_FN_MIGRATE_INFO_TYPE;
+       psci_ops.migrate_info_type = psci_migrate_info_type;
+
+       arm_pm_restart = psci_sys_reset;
+
+       pm_power_off = psci_sys_poweroff;
+
+out_put_node:
+       of_node_put(np);
+       return err;
+}
+
+/*
+ * PSCI < v0.2 get PSCI Function IDs via DT.
+ */
+static int psci_0_1_init(struct device_node *np)
+{
+       u32 id;
+       int err;
+
+       err = get_set_conduit_method(np);
+
+       if (err)
+               goto out_put_node;
+
+       pr_info("Using PSCI v0.1 Function IDs from DT\n");
+
        if (!of_property_read_u32(np, "cpu_suspend", &id)) {
                psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
                psci_ops.cpu_suspend = psci_cpu_suspend;
@@ -206,5 +308,25 @@ void __init psci_init(void)
 
 out_put_node:
        of_node_put(np);
-       return;
+       return err;
+}
+
+static const struct of_device_id psci_of_match[] __initconst = {
+       { .compatible = "arm,psci", .data = psci_0_1_init},
+       { .compatible = "arm,psci-0.2", .data = psci_0_2_init},
+       {},
+};
+
+int __init psci_init(void)
+{
+       struct device_node *np;
+       const struct of_device_id *matched_np;
+       psci_initcall_t init_fn;
+
+       np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
+       if (!np)
+               return -ENODEV;
+
+       init_fn = (psci_initcall_t)matched_np->data;
+       return init_fn(np);
 }
index 570a48cc3d64b1714bd711c4b4cc4a6d1ab1358c..28a1db4da70428cbc286f80359eeae5bbdcd8057 100644 (file)
@@ -16,6 +16,8 @@
 #include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/of.h>
+#include <linux/delay.h>
+#include <uapi/linux/psci.h>
 
 #include <asm/psci.h>
 #include <asm/smp_plat.h>
@@ -66,6 +68,36 @@ void __ref psci_cpu_die(unsigned int cpu)
        /* We should never return */
        panic("psci: cpu %d failed to shutdown\n", cpu);
 }
+
+int __ref psci_cpu_kill(unsigned int cpu)
+{
+       int err, i;
+
+       if (!psci_ops.affinity_info)
+               return 1;
+       /*
+        * cpu_kill could race with cpu_die and we can
+        * potentially end up declaring this cpu undead
+        * while it is dying. So, try again a few times.
+        */
+
+       for (i = 0; i < 10; i++) {
+               err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
+               if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) {
+                       pr_info("CPU%d killed.\n", cpu);
+                       return 1;
+               }
+
+               msleep(10);
+               pr_info("Retrying again to check for CPU kill\n");
+       }
+
+       pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n",
+                       cpu, err);
+       /* Make platform_cpu_kill() fail. */
+       return 0;
+}
+
 #endif
 
 bool __init psci_smp_available(void)
@@ -78,5 +110,6 @@ struct smp_operations __initdata psci_smp_ops = {
        .smp_boot_secondary     = psci_boot_secondary,
 #ifdef CONFIG_HOTPLUG_CPU
        .cpu_die                = psci_cpu_die,
+       .cpu_kill               = psci_cpu_kill,
 #endif
 };
index f0e50a0f3a65b1c0476ec18db3ff914594c1061f..3c82b37c0f9edbe031718a67ac3c8426139f8862 100644 (file)
@@ -197,6 +197,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
        case KVM_CAP_ONE_REG:
        case KVM_CAP_ARM_PSCI:
+       case KVM_CAP_ARM_PSCI_0_2:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
index 0de91fc6de0ff06e671357430c0544581127ea80..4c979d466cc1681c4b3efc70623345eee5974b78 100644 (file)
@@ -38,14 +38,18 @@ static int handle_svc_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
+       int ret;
+
        trace_kvm_hvc(*vcpu_pc(vcpu), *vcpu_reg(vcpu, 0),
                      kvm_vcpu_hvc_get_imm(vcpu));
 
-       if (kvm_psci_call(vcpu))
+       ret = kvm_psci_call(vcpu);
+       if (ret < 0) {
+               kvm_inject_undefined(vcpu);
                return 1;
+       }
 
-       kvm_inject_undefined(vcpu);
-       return 1;
+       return ret;
 }
 
 static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
index 448f60e8d23ca0c9886baa72ea437087b4fea41e..09cf37737ee2ad24bda1251541689ea2f8bdb535 100644 (file)
  * as described in ARM document number ARM DEN 0022A.
  */
 
+#define AFFINITY_MASK(level)   ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
+
+static unsigned long psci_affinity_mask(unsigned long affinity_level)
+{
+       if (affinity_level <= 3)
+               return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);
+
+       return 0;
+}
+
+static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
+{
+       /*
+        * NOTE: For simplicity, we make VCPU suspend emulation to be
+        * same-as WFI (Wait-for-interrupt) emulation.
+        *
+        * This means for KVM the wakeup events are interrupts and
+        * this is consistent with intended use of StateID as described
+        * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A).
+        *
+        * Further, we also treat power-down request to be same as
+        * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2
+        * specification (ARM DEN 0022A). This means all suspend states
+        * for KVM will preserve the register state.
+        */
+       kvm_vcpu_block(vcpu);
+
+       return PSCI_RET_SUCCESS;
+}
+
 static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.pause = true;
@@ -38,6 +68,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
        struct kvm_vcpu *vcpu = NULL, *tmp;
        wait_queue_head_t *wq;
        unsigned long cpu_id;
+       unsigned long context_id;
        unsigned long mpidr;
        phys_addr_t target_pc;
        int i;
@@ -58,10 +89,17 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
         * Make sure the caller requested a valid CPU and that the CPU is
         * turned off.
         */
-       if (!vcpu || !vcpu->arch.pause)
-               return KVM_PSCI_RET_INVAL;
+       if (!vcpu)
+               return PSCI_RET_INVALID_PARAMS;
+       if (!vcpu->arch.pause) {
+               if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
+                       return PSCI_RET_ALREADY_ON;
+               else
+                       return PSCI_RET_INVALID_PARAMS;
+       }
 
        target_pc = *vcpu_reg(source_vcpu, 2);
+       context_id = *vcpu_reg(source_vcpu, 3);
 
        kvm_reset_vcpu(vcpu);
 
@@ -76,26 +114,160 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
                kvm_vcpu_set_be(vcpu);
 
        *vcpu_pc(vcpu) = target_pc;
+       /*
+        * NOTE: We always update r0 (or x0) because for PSCI v0.1
+        * the general purpose registers are undefined upon CPU_ON.
+        */
+       *vcpu_reg(vcpu, 0) = context_id;
        vcpu->arch.pause = false;
        smp_mb();               /* Make sure the above is visible */
 
        wq = kvm_arch_vcpu_wq(vcpu);
        wake_up_interruptible(wq);
 
-       return KVM_PSCI_RET_SUCCESS;
+       return PSCI_RET_SUCCESS;
 }
 
-/**
- * kvm_psci_call - handle PSCI call if r0 value is in range
- * @vcpu: Pointer to the VCPU struct
- *
- * Handle PSCI calls from guests through traps from HVC instructions.
- * The calling convention is similar to SMC calls to the secure world where
- * the function number is placed in r0 and this function returns true if the
- * function number specified in r0 is withing the PSCI range, and false
- * otherwise.
- */
-bool kvm_psci_call(struct kvm_vcpu *vcpu)
+static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
+{
+       int i;
+       unsigned long mpidr;
+       unsigned long target_affinity;
+       unsigned long target_affinity_mask;
+       unsigned long lowest_affinity_level;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_vcpu *tmp;
+
+       target_affinity = *vcpu_reg(vcpu, 1);
+       lowest_affinity_level = *vcpu_reg(vcpu, 2);
+
+       /* Determine target affinity mask */
+       target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
+       if (!target_affinity_mask)
+               return PSCI_RET_INVALID_PARAMS;
+
+       /* Ignore other bits of target affinity */
+       target_affinity &= target_affinity_mask;
+
+       /*
+        * If one or more VCPU matching target affinity are running
+        * then ON else OFF
+        */
+       kvm_for_each_vcpu(i, tmp, kvm) {
+               mpidr = kvm_vcpu_get_mpidr(tmp);
+               if (((mpidr & target_affinity_mask) == target_affinity) &&
+                   !tmp->arch.pause) {
+                       return PSCI_0_2_AFFINITY_LEVEL_ON;
+               }
+       }
+
+       return PSCI_0_2_AFFINITY_LEVEL_OFF;
+}
+
+static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
+{
+       memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
+       vcpu->run->system_event.type = type;
+       vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+}
+
+static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
+{
+       kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN);
+}
+
+static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
+{
+       kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
+}
+
+int kvm_psci_version(struct kvm_vcpu *vcpu)
+{
+       if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
+               return KVM_ARM_PSCI_0_2;
+
+       return KVM_ARM_PSCI_0_1;
+}
+
+static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
+{
+       int ret = 1;
+       unsigned long psci_fn = *vcpu_reg(vcpu, 0) & ~((u32) 0);
+       unsigned long val;
+
+       switch (psci_fn) {
+       case PSCI_0_2_FN_PSCI_VERSION:
+               /*
+                * Bits[31:16] = Major Version = 0
+                * Bits[15:0] = Minor Version = 2
+                */
+               val = 2;
+               break;
+       case PSCI_0_2_FN_CPU_SUSPEND:
+       case PSCI_0_2_FN64_CPU_SUSPEND:
+               val = kvm_psci_vcpu_suspend(vcpu);
+               break;
+       case PSCI_0_2_FN_CPU_OFF:
+               kvm_psci_vcpu_off(vcpu);
+               val = PSCI_RET_SUCCESS;
+               break;
+       case PSCI_0_2_FN_CPU_ON:
+       case PSCI_0_2_FN64_CPU_ON:
+               val = kvm_psci_vcpu_on(vcpu);
+               break;
+       case PSCI_0_2_FN_AFFINITY_INFO:
+       case PSCI_0_2_FN64_AFFINITY_INFO:
+               val = kvm_psci_vcpu_affinity_info(vcpu);
+               break;
+       case PSCI_0_2_FN_MIGRATE:
+       case PSCI_0_2_FN64_MIGRATE:
+               val = PSCI_RET_NOT_SUPPORTED;
+               break;
+       case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+               /*
+                * Trusted OS is MP hence does not require migration
+                * or
+                * Trusted OS is not present
+                */
+               val = PSCI_0_2_TOS_MP;
+               break;
+       case PSCI_0_2_FN_MIGRATE_INFO_UP_CPU:
+       case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU:
+               val = PSCI_RET_NOT_SUPPORTED;
+               break;
+       case PSCI_0_2_FN_SYSTEM_OFF:
+               kvm_psci_system_off(vcpu);
+               /*
+                * We shouldn't be going back to guest VCPU after
+                * receiving SYSTEM_OFF request.
+                *
+                * If user space accidentally/deliberately resumes
+                * guest VCPU after SYSTEM_OFF request then guest
+                * VCPU should see internal failure from PSCI return
+                * value. To achieve this, we preload r0 (or x0) with
+                * PSCI return value INTERNAL_FAILURE.
+                */
+               val = PSCI_RET_INTERNAL_FAILURE;
+               ret = 0;
+               break;
+       case PSCI_0_2_FN_SYSTEM_RESET:
+               kvm_psci_system_reset(vcpu);
+               /*
+                * Same reason as SYSTEM_OFF for preloading r0 (or x0)
+                * with PSCI return value INTERNAL_FAILURE.
+                */
+               val = PSCI_RET_INTERNAL_FAILURE;
+               ret = 0;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       *vcpu_reg(vcpu, 0) = val;
+       return ret;
+}
+
+static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
 {
        unsigned long psci_fn = *vcpu_reg(vcpu, 0) & ~((u32) 0);
        unsigned long val;
@@ -103,20 +275,45 @@ bool kvm_psci_call(struct kvm_vcpu *vcpu)
        switch (psci_fn) {
        case KVM_PSCI_FN_CPU_OFF:
                kvm_psci_vcpu_off(vcpu);
-               val = KVM_PSCI_RET_SUCCESS;
+               val = PSCI_RET_SUCCESS;
                break;
        case KVM_PSCI_FN_CPU_ON:
                val = kvm_psci_vcpu_on(vcpu);
                break;
        case KVM_PSCI_FN_CPU_SUSPEND:
        case KVM_PSCI_FN_MIGRATE:
-               val = KVM_PSCI_RET_NI;
+               val = PSCI_RET_NOT_SUPPORTED;
                break;
-
        default:
-               return false;
+               return -EINVAL;
        }
 
        *vcpu_reg(vcpu, 0) = val;
-       return true;
+       return 1;
+}
+
+/**
+ * kvm_psci_call - handle PSCI call if r0 value is in range
+ * @vcpu: Pointer to the VCPU struct
+ *
+ * Handle PSCI calls from guests through traps from HVC instructions.
+ * The calling convention is similar to SMC calls to the secure world
+ * where the function number is placed in r0.
+ *
+ * This function returns: > 0 (success), 0 (success but exit to user
+ * space), and < 0 (errors)
+ *
+ * Errors:
+ * -EINVAL: Unrecognized PSCI function
+ */
+int kvm_psci_call(struct kvm_vcpu *vcpu)
+{
+       switch (kvm_psci_version(vcpu)) {
+       case KVM_ARM_PSCI_0_2:
+               return kvm_psci_0_2_call(vcpu);
+       case KVM_ARM_PSCI_0_1:
+               return kvm_psci_0_1_call(vcpu);
+       default:
+               return -EINVAL;
+       };
 }
index 152413076503ba4731ce13d693ae2fa70d197795..d7b4b38a8e8625f25131bfc5d72c79aecfbbc4f8 100644 (file)
@@ -39,6 +39,7 @@ struct device_node;
  *             from the cpu to be killed.
  * @cpu_die:   Makes a cpu leave the kernel. Must not fail. Called from the
  *             cpu being killed.
+ * @cpu_kill:  Ensures a cpu has left the kernel. Called from another cpu.
  * @cpu_suspend: Suspends a cpu and saves the required context. May fail owing
  *               to wrong parameters or error conditions. Called from the
  *               CPU being suspended. Must be called with IRQs disabled.
@@ -52,6 +53,7 @@ struct cpu_operations {
 #ifdef CONFIG_HOTPLUG_CPU
        int             (*cpu_disable)(unsigned int cpu);
        void            (*cpu_die)(unsigned int cpu);
+       int             (*cpu_kill)(unsigned int cpu);
 #endif
 #ifdef CONFIG_ARM64_CPU_SUSPEND
        int             (*cpu_suspend)(unsigned long);
index c404fb0df3a673710285603c8ba7571fa42a86f8..27f54a7cc81b3b0d524b33afa20ef69f25b6ae29 100644 (file)
@@ -41,6 +41,7 @@
 
 #define ARM_CPU_PART_AEM_V8    0xD0F0
 #define ARM_CPU_PART_FOUNDATION        0xD000
+#define ARM_CPU_PART_CORTEX_A53        0xD030
 #define ARM_CPU_PART_CORTEX_A57        0xD070
 
 #define APM_CPU_PART_POTENZA   0x0000
index 0a1d69751562d15ec7657a9e6c9e68aa54359b83..92242ce06309c790f60ae1ec3a372ccbc673726e 100644 (file)
@@ -39,7 +39,7 @@
 #include <kvm/arm_vgic.h>
 #include <kvm/arm_arch_timer.h>
 
-#define KVM_VCPU_MAX_FEATURES 2
+#define KVM_VCPU_MAX_FEATURES 3
 
 struct kvm_vcpu;
 int kvm_target_cpu(void);
index e301a48163553c54017701a12e4dbaa8e0b84467..bc39e557c56c83e163e04c560ed45a3852e2d34f 100644 (file)
 #ifndef __ARM64_KVM_PSCI_H__
 #define __ARM64_KVM_PSCI_H__
 
-bool kvm_psci_call(struct kvm_vcpu *vcpu);
+#define KVM_ARM_PSCI_0_1       1
+#define KVM_ARM_PSCI_0_2       2
+
+int kvm_psci_version(struct kvm_vcpu *vcpu);
+int kvm_psci_call(struct kvm_vcpu *vcpu);
 
 #endif /* __ARM64_KVM_PSCI_H__ */
index d15ab8b463360869553f4ecfa4cc255d4c8455e3..e5312ea0ec1a59bdd92934926da81155a6ce3e12 100644 (file)
@@ -14,6 +14,6 @@
 #ifndef __ASM_PSCI_H
 #define __ASM_PSCI_H
 
-void psci_init(void);
+int psci_init(void);
 
 #endif /* __ASM_PSCI_H */
index eaf54a30bedcef3ee1b88b93e2e0f4741f0ef10d..e633ff8cdec8d8bc93bfcb4f486f03a74b3f9f59 100644 (file)
@@ -31,6 +31,7 @@
 #define KVM_NR_SPSR    5
 
 #ifndef __ASSEMBLY__
+#include <linux/psci.h>
 #include <asm/types.h>
 #include <asm/ptrace.h>
 
@@ -56,8 +57,9 @@ struct kvm_regs {
 #define KVM_ARM_TARGET_FOUNDATION_V8   1
 #define KVM_ARM_TARGET_CORTEX_A57      2
 #define KVM_ARM_TARGET_XGENE_POTENZA   3
+#define KVM_ARM_TARGET_CORTEX_A53      4
 
-#define KVM_ARM_NUM_TARGETS            4
+#define KVM_ARM_NUM_TARGETS            5
 
 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
 #define KVM_ARM_DEVICE_TYPE_SHIFT      0
@@ -77,6 +79,7 @@ struct kvm_regs {
 
 #define KVM_ARM_VCPU_POWER_OFF         0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_EL1_32BIT         1 /* CPU running a 32bit VM */
+#define KVM_ARM_VCPU_PSCI_0_2          2 /* CPU uses PSCI v0.2 */
 
 struct kvm_vcpu_init {
        __u32 target;
@@ -186,10 +189,10 @@ struct kvm_arch_memory_slot {
 #define KVM_PSCI_FN_CPU_ON             KVM_PSCI_FN(2)
 #define KVM_PSCI_FN_MIGRATE            KVM_PSCI_FN(3)
 
-#define KVM_PSCI_RET_SUCCESS           0
-#define KVM_PSCI_RET_NI                        ((unsigned long)-1)
-#define KVM_PSCI_RET_INVAL             ((unsigned long)-2)
-#define KVM_PSCI_RET_DENIED            ((unsigned long)-3)
+#define KVM_PSCI_RET_SUCCESS           PSCI_RET_SUCCESS
+#define KVM_PSCI_RET_NI                        PSCI_RET_NOT_SUPPORTED
+#define KVM_PSCI_RET_INVAL             PSCI_RET_INVALID_PARAMS
+#define KVM_PSCI_RET_DENIED            PSCI_RET_DENIED
 
 #endif
 
index ea4828a4aa96729993d85145603ed45afba7ac03..9e9798f91172ecb9d340c975f6260b1889ddec99 100644 (file)
 #include <linux/init.h>
 #include <linux/of.h>
 #include <linux/smp.h>
+#include <linux/reboot.h>
+#include <linux/pm.h>
+#include <linux/delay.h>
+#include <uapi/linux/psci.h>
 
 #include <asm/compiler.h>
 #include <asm/cpu_ops.h>
 #include <asm/errno.h>
 #include <asm/psci.h>
 #include <asm/smp_plat.h>
+#include <asm/system_misc.h>
 
 #define PSCI_POWER_STATE_TYPE_STANDBY          0
 #define PSCI_POWER_STATE_TYPE_POWER_DOWN       1
@@ -40,58 +45,52 @@ struct psci_operations {
        int (*cpu_off)(struct psci_power_state state);
        int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
        int (*migrate)(unsigned long cpuid);
+       int (*affinity_info)(unsigned long target_affinity,
+                       unsigned long lowest_affinity_level);
+       int (*migrate_info_type)(void);
 };
 
 static struct psci_operations psci_ops;
 
 static int (*invoke_psci_fn)(u64, u64, u64, u64);
+typedef int (*psci_initcall_t)(const struct device_node *);
 
 enum psci_function {
        PSCI_FN_CPU_SUSPEND,
        PSCI_FN_CPU_ON,
        PSCI_FN_CPU_OFF,
        PSCI_FN_MIGRATE,
+       PSCI_FN_AFFINITY_INFO,
+       PSCI_FN_MIGRATE_INFO_TYPE,
        PSCI_FN_MAX,
 };
 
 static u32 psci_function_id[PSCI_FN_MAX];
 
-#define PSCI_RET_SUCCESS               0
-#define PSCI_RET_EOPNOTSUPP            -1
-#define PSCI_RET_EINVAL                        -2
-#define PSCI_RET_EPERM                 -3
-
 static int psci_to_linux_errno(int errno)
 {
        switch (errno) {
        case PSCI_RET_SUCCESS:
                return 0;
-       case PSCI_RET_EOPNOTSUPP:
+       case PSCI_RET_NOT_SUPPORTED:
                return -EOPNOTSUPP;
-       case PSCI_RET_EINVAL:
+       case PSCI_RET_INVALID_PARAMS:
                return -EINVAL;
-       case PSCI_RET_EPERM:
+       case PSCI_RET_DENIED:
                return -EPERM;
        };
 
        return -EINVAL;
 }
 
-#define PSCI_POWER_STATE_ID_MASK       0xffff
-#define PSCI_POWER_STATE_ID_SHIFT      0
-#define PSCI_POWER_STATE_TYPE_MASK     0x1
-#define PSCI_POWER_STATE_TYPE_SHIFT    16
-#define PSCI_POWER_STATE_AFFL_MASK     0x3
-#define PSCI_POWER_STATE_AFFL_SHIFT    24
-
 static u32 psci_power_state_pack(struct psci_power_state state)
 {
-       return  ((state.id & PSCI_POWER_STATE_ID_MASK)
-                       << PSCI_POWER_STATE_ID_SHIFT)   |
-               ((state.type & PSCI_POWER_STATE_TYPE_MASK)
-                       << PSCI_POWER_STATE_TYPE_SHIFT) |
-               ((state.affinity_level & PSCI_POWER_STATE_AFFL_MASK)
-                       << PSCI_POWER_STATE_AFFL_SHIFT);
+       return ((state.id << PSCI_0_2_POWER_STATE_ID_SHIFT)
+                       & PSCI_0_2_POWER_STATE_ID_MASK) |
+               ((state.type << PSCI_0_2_POWER_STATE_TYPE_SHIFT)
+                & PSCI_0_2_POWER_STATE_TYPE_MASK) |
+               ((state.affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT)
+                & PSCI_0_2_POWER_STATE_AFFL_MASK);
 }
 
 /*
@@ -128,6 +127,14 @@ static noinline int __invoke_psci_fn_smc(u64 function_id, u64 arg0, u64 arg1,
        return function_id;
 }
 
+static int psci_get_version(void)
+{
+       int err;
+
+       err = invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
+       return err;
+}
+
 static int psci_cpu_suspend(struct psci_power_state state,
                            unsigned long entry_point)
 {
@@ -171,26 +178,36 @@ static int psci_migrate(unsigned long cpuid)
        return psci_to_linux_errno(err);
 }
 
-static const struct of_device_id psci_of_match[] __initconst = {
-       { .compatible = "arm,psci",     },
-       {},
-};
+static int psci_affinity_info(unsigned long target_affinity,
+               unsigned long lowest_affinity_level)
+{
+       int err;
+       u32 fn;
+
+       fn = psci_function_id[PSCI_FN_AFFINITY_INFO];
+       err = invoke_psci_fn(fn, target_affinity, lowest_affinity_level, 0);
+       return err;
+}
 
-void __init psci_init(void)
+static int psci_migrate_info_type(void)
 {
-       struct device_node *np;
-       const char *method;
-       u32 id;
+       int err;
+       u32 fn;
 
-       np = of_find_matching_node(NULL, psci_of_match);
-       if (!np)
-               return;
+       fn = psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE];
+       err = invoke_psci_fn(fn, 0, 0, 0);
+       return err;
+}
 
-       pr_info("probing function IDs from device-tree\n");
+static int get_set_conduit_method(struct device_node *np)
+{
+       const char *method;
+
+       pr_info("probing for conduit method from DT.\n");
 
        if (of_property_read_string(np, "method", &method)) {
-               pr_warning("missing \"method\" property\n");
-               goto out_put_node;
+               pr_warn("missing \"method\" property\n");
+               return -ENXIO;
        }
 
        if (!strcmp("hvc", method)) {
@@ -198,10 +215,99 @@ void __init psci_init(void)
        } else if (!strcmp("smc", method)) {
                invoke_psci_fn = __invoke_psci_fn_smc;
        } else {
-               pr_warning("invalid \"method\" property: %s\n", method);
+               pr_warn("invalid \"method\" property: %s\n", method);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd)
+{
+       invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0);
+}
+
+static void psci_sys_poweroff(void)
+{
+       invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
+}
+
+/*
+ * PSCI Function IDs for v0.2+ are well defined so use
+ * standard values.
+ */
+static int psci_0_2_init(struct device_node *np)
+{
+       int err, ver;
+
+       err = get_set_conduit_method(np);
+
+       if (err)
+               goto out_put_node;
+
+       ver = psci_get_version();
+
+       if (ver == PSCI_RET_NOT_SUPPORTED) {
+               /* PSCI v0.2 mandates implementation of PSCI_VERSION. */
+               pr_err("PSCI firmware does not comply with the v0.2 spec.\n");
+               err = -EOPNOTSUPP;
                goto out_put_node;
+       } else {
+               pr_info("PSCIv%d.%d detected in firmware.\n",
+                               PSCI_VERSION_MAJOR(ver),
+                               PSCI_VERSION_MINOR(ver));
+
+               if (PSCI_VERSION_MAJOR(ver) == 0 &&
+                               PSCI_VERSION_MINOR(ver) < 2) {
+                       err = -EINVAL;
+                       pr_err("Conflicting PSCI version detected.\n");
+                       goto out_put_node;
+               }
        }
 
+       pr_info("Using standard PSCI v0.2 function IDs\n");
+       psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN64_CPU_SUSPEND;
+       psci_ops.cpu_suspend = psci_cpu_suspend;
+
+       psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
+       psci_ops.cpu_off = psci_cpu_off;
+
+       psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN64_CPU_ON;
+       psci_ops.cpu_on = psci_cpu_on;
+
+       psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN64_MIGRATE;
+       psci_ops.migrate = psci_migrate;
+
+       psci_function_id[PSCI_FN_AFFINITY_INFO] = PSCI_0_2_FN64_AFFINITY_INFO;
+       psci_ops.affinity_info = psci_affinity_info;
+
+       psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE] =
+               PSCI_0_2_FN_MIGRATE_INFO_TYPE;
+       psci_ops.migrate_info_type = psci_migrate_info_type;
+
+       arm_pm_restart = psci_sys_reset;
+
+       pm_power_off = psci_sys_poweroff;
+
+out_put_node:
+       of_node_put(np);
+       return err;
+}
+
+/*
+ * PSCI < v0.2 get PSCI Function IDs via DT.
+ */
+static int psci_0_1_init(struct device_node *np)
+{
+       u32 id;
+       int err;
+
+       err = get_set_conduit_method(np);
+
+       if (err)
+               goto out_put_node;
+
+       pr_info("Using PSCI v0.1 Function IDs from DT\n");
+
        if (!of_property_read_u32(np, "cpu_suspend", &id)) {
                psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
                psci_ops.cpu_suspend = psci_cpu_suspend;
@@ -224,7 +330,28 @@ void __init psci_init(void)
 
 out_put_node:
        of_node_put(np);
-       return;
+       return err;
+}
+
+static const struct of_device_id psci_of_match[] __initconst = {
+       { .compatible = "arm,psci",     .data = psci_0_1_init},
+       { .compatible = "arm,psci-0.2", .data = psci_0_2_init},
+       {},
+};
+
+int __init psci_init(void)
+{
+       struct device_node *np;
+       const struct of_device_id *matched_np;
+       psci_initcall_t init_fn;
+
+       np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
+
+       if (!np)
+               return -ENODEV;
+
+       init_fn = (psci_initcall_t)matched_np->data;
+       return init_fn(np);
 }
 
 #ifdef CONFIG_SMP
@@ -277,6 +404,35 @@ static void cpu_psci_cpu_die(unsigned int cpu)
 
        pr_crit("unable to power off CPU%u (%d)\n", cpu, ret);
 }
+
+static int cpu_psci_cpu_kill(unsigned int cpu)
+{
+       int err, i;
+
+       if (!psci_ops.affinity_info)
+               return 1;
+       /*
+        * cpu_kill could race with cpu_die and we can
+        * potentially end up declaring this cpu undead
+        * while it is dying. So, try again a few times.
+        */
+
+       for (i = 0; i < 10; i++) {
+               err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
+               if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) {
+                       pr_info("CPU%d killed.\n", cpu);
+                       return 1;
+               }
+
+               msleep(10);
+               pr_info("Retrying again to check for CPU kill\n");
+       }
+
+       pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n",
+                       cpu, err);
+       /* Make op_cpu_kill() fail. */
+       return 0;
+}
 #endif
 
 const struct cpu_operations cpu_psci_ops = {
@@ -287,6 +443,7 @@ const struct cpu_operations cpu_psci_ops = {
 #ifdef CONFIG_HOTPLUG_CPU
        .cpu_disable    = cpu_psci_cpu_disable,
        .cpu_die        = cpu_psci_cpu_die,
+       .cpu_kill       = cpu_psci_cpu_kill,
 #endif
 };
 
index f0a141dd5655817171605293d45429c740495661..c3cb160edc697fd7d996040233188c90f78b81f4 100644 (file)
@@ -228,6 +228,19 @@ int __cpu_disable(void)
        return 0;
 }
 
+static int op_cpu_kill(unsigned int cpu)
+{
+       /*
+        * If we have no means of synchronising with the dying CPU, then assume
+        * that it is really dead. We can only wait for an arbitrary length of
+        * time and hope that it's dead, so let's skip the wait and just hope.
+        */
+       if (!cpu_ops[cpu]->cpu_kill)
+               return 1;
+
+       return cpu_ops[cpu]->cpu_kill(cpu);
+}
+
 static DECLARE_COMPLETION(cpu_died);
 
 /*
@@ -241,6 +254,15 @@ void __cpu_die(unsigned int cpu)
                return;
        }
        pr_notice("CPU%u: shutdown\n", cpu);
+
+       /*
+        * Now that the dying CPU is beyond the point of no return w.r.t.
+        * in-kernel synchronisation, try to get the firmware to help us to
+        * verify that it has really left the kernel before we consider
+        * clobbering anything it might still be using.
+        */
+       if (!op_cpu_kill(cpu))
+               pr_warn("CPU%d may not have shut down cleanly\n", cpu);
 }
 
 /*
index 08745578d54de84466f6bbc800e4dc984e0e1c4f..60b5c31f3c10e58e72029d8854f13f527d84a097 100644 (file)
@@ -214,6 +214,8 @@ int __attribute_const__ kvm_target_cpu(void)
                        return KVM_ARM_TARGET_AEM_V8;
                case ARM_CPU_PART_FOUNDATION:
                        return KVM_ARM_TARGET_FOUNDATION_V8;
+               case ARM_CPU_PART_CORTEX_A53:
+                       return KVM_ARM_TARGET_CORTEX_A53;
                case ARM_CPU_PART_CORTEX_A57:
                        return KVM_ARM_TARGET_CORTEX_A57;
                };
index 7bc41eab4c649b8903a27793a8e19ab662b72f8a..182415e1a952bf5e54918b14f1534faaee731bba 100644 (file)
@@ -30,11 +30,15 @@ typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
 
 static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-       if (kvm_psci_call(vcpu))
+       int ret;
+
+       ret = kvm_psci_call(vcpu);
+       if (ret < 0) {
+               kvm_inject_undefined(vcpu);
                return 1;
+       }
 
-       kvm_inject_undefined(vcpu);
-       return 1;
+       return ret;
 }
 
 static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
index 8fe6f76b0edce8b80adc3640b82605a116c66315..475fd29293102649ea45ee411c077cca64f37782 100644 (file)
@@ -88,6 +88,8 @@ static int __init sys_reg_genericv8_init(void)
                                          &genericv8_target_table);
        kvm_register_target_sys_reg_table(KVM_ARM_TARGET_FOUNDATION_V8,
                                          &genericv8_target_table);
+       kvm_register_target_sys_reg_table(KVM_ARM_TARGET_CORTEX_A53,
+                                         &genericv8_target_table);
        kvm_register_target_sys_reg_table(KVM_ARM_TARGET_CORTEX_A57,
                                          &genericv8_target_table);
        kvm_register_target_sys_reg_table(KVM_ARM_TARGET_XGENE_POTENZA,
index 5cd695f905a1c424b02fed524a9169eaaa5d63f3..5e0014e864f355d7c27cfa9d6ccad07dd4fd8e1b 100644 (file)
@@ -1756,14 +1756,14 @@ config KVM_GUEST
        help
          Select this option if building a guest kernel for KVM (Trap & Emulate) mode
 
-config KVM_HOST_FREQ
-       int "KVM Host Processor Frequency (MHz)"
+config KVM_GUEST_TIMER_FREQ
+       int "Count/Compare Timer Frequency (MHz)"
        depends on KVM_GUEST
-       default 500
+       default 100
        help
-         Select this option if building a guest kernel for KVM to skip
-         RTC emulation when determining guest CPU Frequency.  Instead, the guest
-         processor frequency is automatically derived from the host frequency.
+         Set this to non-zero if building a guest kernel for KVM to skip RTC
+         emulation when determining guest CPU Frequency. Instead, the guest's
+         timer frequency is specified directly.
 
 choice
        prompt "Kernel page size"
index 060aaa6348d7e8cd39fa2acda69e30bf9be6e7fa..b0aa95565752a8d79212ecfd0bc56be0bacd2e42 100644 (file)
 #include <linux/threads.h>
 #include <linux/spinlock.h>
 
+/* MIPS KVM register ids */
+#define MIPS_CP0_32(_R, _S)                                    \
+       (KVM_REG_MIPS | KVM_REG_SIZE_U32 | 0x10000 | (8 * (_R) + (_S)))
+
+#define MIPS_CP0_64(_R, _S)                                    \
+       (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 0x10000 | (8 * (_R) + (_S)))
+
+#define KVM_REG_MIPS_CP0_INDEX         MIPS_CP0_32(0, 0)
+#define KVM_REG_MIPS_CP0_ENTRYLO0      MIPS_CP0_64(2, 0)
+#define KVM_REG_MIPS_CP0_ENTRYLO1      MIPS_CP0_64(3, 0)
+#define KVM_REG_MIPS_CP0_CONTEXT       MIPS_CP0_64(4, 0)
+#define KVM_REG_MIPS_CP0_USERLOCAL     MIPS_CP0_64(4, 2)
+#define KVM_REG_MIPS_CP0_PAGEMASK      MIPS_CP0_32(5, 0)
+#define KVM_REG_MIPS_CP0_PAGEGRAIN     MIPS_CP0_32(5, 1)
+#define KVM_REG_MIPS_CP0_WIRED         MIPS_CP0_32(6, 0)
+#define KVM_REG_MIPS_CP0_HWRENA                MIPS_CP0_32(7, 0)
+#define KVM_REG_MIPS_CP0_BADVADDR      MIPS_CP0_64(8, 0)
+#define KVM_REG_MIPS_CP0_COUNT         MIPS_CP0_32(9, 0)
+#define KVM_REG_MIPS_CP0_ENTRYHI       MIPS_CP0_64(10, 0)
+#define KVM_REG_MIPS_CP0_COMPARE       MIPS_CP0_32(11, 0)
+#define KVM_REG_MIPS_CP0_STATUS                MIPS_CP0_32(12, 0)
+#define KVM_REG_MIPS_CP0_CAUSE         MIPS_CP0_32(13, 0)
+#define KVM_REG_MIPS_CP0_EPC           MIPS_CP0_64(14, 0)
+#define KVM_REG_MIPS_CP0_EBASE         MIPS_CP0_64(15, 1)
+#define KVM_REG_MIPS_CP0_CONFIG                MIPS_CP0_32(16, 0)
+#define KVM_REG_MIPS_CP0_CONFIG1       MIPS_CP0_32(16, 1)
+#define KVM_REG_MIPS_CP0_CONFIG2       MIPS_CP0_32(16, 2)
+#define KVM_REG_MIPS_CP0_CONFIG3       MIPS_CP0_32(16, 3)
+#define KVM_REG_MIPS_CP0_CONFIG7       MIPS_CP0_32(16, 7)
+#define KVM_REG_MIPS_CP0_XCONTEXT      MIPS_CP0_64(20, 0)
+#define KVM_REG_MIPS_CP0_ERROREPC      MIPS_CP0_64(30, 0)
+
 
 #define KVM_MAX_VCPUS          1
 #define KVM_USER_MEM_SLOTS     8
@@ -372,8 +404,19 @@ struct kvm_vcpu_arch {
 
        u32 io_gpr;             /* GPR used as IO source/target */
 
-       /* Used to calibrate the virutal count register for the guest */
-       int32_t host_cp0_count;
+       struct hrtimer comparecount_timer;
+       /* Count timer control KVM register */
+       uint32_t count_ctl;
+       /* Count bias from the raw time */
+       uint32_t count_bias;
+       /* Frequency of timer in Hz */
+       uint32_t count_hz;
+       /* Dynamic nanosecond bias (multiple of count_period) to avoid overflow */
+       s64 count_dyn_bias;
+       /* Resume time */
+       ktime_t count_resume;
+       /* Period of timer tick in ns */
+       u64 count_period;
 
        /* Bitmask of exceptions that are pending */
        unsigned long pending_exceptions;
@@ -394,8 +437,6 @@ struct kvm_vcpu_arch {
        uint32_t guest_kernel_asid[NR_CPUS];
        struct mm_struct guest_kernel_mm, guest_user_mm;
 
-       struct hrtimer comparecount_timer;
-
        int last_sched_cpu;
 
        /* WAIT executed */
@@ -410,6 +451,7 @@ struct kvm_vcpu_arch {
 #define kvm_read_c0_guest_context(cop0)                (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
 #define kvm_write_c0_guest_context(cop0, val)  (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
 #define kvm_read_c0_guest_userlocal(cop0)      (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
+#define kvm_write_c0_guest_userlocal(cop0, val)        (cop0->reg[MIPS_CP0_TLB_CONTEXT][2] = (val))
 #define kvm_read_c0_guest_pagemask(cop0)       (cop0->reg[MIPS_CP0_TLB_PG_MASK][0])
 #define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val))
 #define kvm_read_c0_guest_wired(cop0)          (cop0->reg[MIPS_CP0_TLB_WIRED][0])
@@ -449,15 +491,74 @@ struct kvm_vcpu_arch {
 #define kvm_read_c0_guest_errorepc(cop0)       (cop0->reg[MIPS_CP0_ERROR_PC][0])
 #define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
 
+/*
+ * Some of the guest registers may be modified asynchronously (e.g. from a
+ * hrtimer callback in hard irq context) and therefore need stronger atomicity
+ * guarantees than other registers.
+ */
+
+static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg,
+                                               unsigned long val)
+{
+       unsigned long temp;
+       do {
+               __asm__ __volatile__(
+               "       .set    mips3                           \n"
+               "       " __LL "%0, %1                          \n"
+               "       or      %0, %2                          \n"
+               "       " __SC  "%0, %1                         \n"
+               "       .set    mips0                           \n"
+               : "=&r" (temp), "+m" (*reg)
+               : "r" (val));
+       } while (unlikely(!temp));
+}
+
+static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg,
+                                                 unsigned long val)
+{
+       unsigned long temp;
+       do {
+               __asm__ __volatile__(
+               "       .set    mips3                           \n"
+               "       " __LL "%0, %1                          \n"
+               "       and     %0, %2                          \n"
+               "       " __SC  "%0, %1                         \n"
+               "       .set    mips0                           \n"
+               : "=&r" (temp), "+m" (*reg)
+               : "r" (~val));
+       } while (unlikely(!temp));
+}
+
+static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
+                                                  unsigned long change,
+                                                  unsigned long val)
+{
+       unsigned long temp;
+       do {
+               __asm__ __volatile__(
+               "       .set    mips3                           \n"
+               "       " __LL "%0, %1                          \n"
+               "       and     %0, %2                          \n"
+               "       or      %0, %3                          \n"
+               "       " __SC  "%0, %1                         \n"
+               "       .set    mips0                           \n"
+               : "=&r" (temp), "+m" (*reg)
+               : "r" (~change), "r" (val & change));
+       } while (unlikely(!temp));
+}
+
 #define kvm_set_c0_guest_status(cop0, val)     (cop0->reg[MIPS_CP0_STATUS][0] |= (val))
 #define kvm_clear_c0_guest_status(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val))
-#define kvm_set_c0_guest_cause(cop0, val)      (cop0->reg[MIPS_CP0_CAUSE][0] |= (val))
-#define kvm_clear_c0_guest_cause(cop0, val)    (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val))
+
+/* Cause can be modified asynchronously from hardirq hrtimer callback */
+#define kvm_set_c0_guest_cause(cop0, val)                              \
+       _kvm_atomic_set_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val)
+#define kvm_clear_c0_guest_cause(cop0, val)                            \
+       _kvm_atomic_clear_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val)
 #define kvm_change_c0_guest_cause(cop0, change, val)                   \
-{                                                                      \
-       kvm_clear_c0_guest_cause(cop0, change);                         \
-       kvm_set_c0_guest_cause(cop0, ((val) & (change)));               \
-}
+       _kvm_atomic_change_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0],  \
+                                       change, val)
+
 #define kvm_set_c0_guest_ebase(cop0, val)      (cop0->reg[MIPS_CP0_PRID][1] |= (val))
 #define kvm_clear_c0_guest_ebase(cop0, val)    (cop0->reg[MIPS_CP0_PRID][1] &= ~(val))
 #define kvm_change_c0_guest_ebase(cop0, change, val)                   \
@@ -468,29 +569,33 @@ struct kvm_vcpu_arch {
 
 
 struct kvm_mips_callbacks {
-       int (*handle_cop_unusable) (struct kvm_vcpu *vcpu);
-       int (*handle_tlb_mod) (struct kvm_vcpu *vcpu);
-       int (*handle_tlb_ld_miss) (struct kvm_vcpu *vcpu);
-       int (*handle_tlb_st_miss) (struct kvm_vcpu *vcpu);
-       int (*handle_addr_err_st) (struct kvm_vcpu *vcpu);
-       int (*handle_addr_err_ld) (struct kvm_vcpu *vcpu);
-       int (*handle_syscall) (struct kvm_vcpu *vcpu);
-       int (*handle_res_inst) (struct kvm_vcpu *vcpu);
-       int (*handle_break) (struct kvm_vcpu *vcpu);
-       int (*vm_init) (struct kvm *kvm);
-       int (*vcpu_init) (struct kvm_vcpu *vcpu);
-       int (*vcpu_setup) (struct kvm_vcpu *vcpu);
-        gpa_t(*gva_to_gpa) (gva_t gva);
-       void (*queue_timer_int) (struct kvm_vcpu *vcpu);
-       void (*dequeue_timer_int) (struct kvm_vcpu *vcpu);
-       void (*queue_io_int) (struct kvm_vcpu *vcpu,
-                             struct kvm_mips_interrupt *irq);
-       void (*dequeue_io_int) (struct kvm_vcpu *vcpu,
-                               struct kvm_mips_interrupt *irq);
-       int (*irq_deliver) (struct kvm_vcpu *vcpu, unsigned int priority,
-                           uint32_t cause);
-       int (*irq_clear) (struct kvm_vcpu *vcpu, unsigned int priority,
-                         uint32_t cause);
+       int (*handle_cop_unusable)(struct kvm_vcpu *vcpu);
+       int (*handle_tlb_mod)(struct kvm_vcpu *vcpu);
+       int (*handle_tlb_ld_miss)(struct kvm_vcpu *vcpu);
+       int (*handle_tlb_st_miss)(struct kvm_vcpu *vcpu);
+       int (*handle_addr_err_st)(struct kvm_vcpu *vcpu);
+       int (*handle_addr_err_ld)(struct kvm_vcpu *vcpu);
+       int (*handle_syscall)(struct kvm_vcpu *vcpu);
+       int (*handle_res_inst)(struct kvm_vcpu *vcpu);
+       int (*handle_break)(struct kvm_vcpu *vcpu);
+       int (*vm_init)(struct kvm *kvm);
+       int (*vcpu_init)(struct kvm_vcpu *vcpu);
+       int (*vcpu_setup)(struct kvm_vcpu *vcpu);
+       gpa_t (*gva_to_gpa)(gva_t gva);
+       void (*queue_timer_int)(struct kvm_vcpu *vcpu);
+       void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
+       void (*queue_io_int)(struct kvm_vcpu *vcpu,
+                            struct kvm_mips_interrupt *irq);
+       void (*dequeue_io_int)(struct kvm_vcpu *vcpu,
+                              struct kvm_mips_interrupt *irq);
+       int (*irq_deliver)(struct kvm_vcpu *vcpu, unsigned int priority,
+                          uint32_t cause);
+       int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority,
+                        uint32_t cause);
+       int (*get_one_reg)(struct kvm_vcpu *vcpu,
+                          const struct kvm_one_reg *reg, s64 *v);
+       int (*set_one_reg)(struct kvm_vcpu *vcpu,
+                          const struct kvm_one_reg *reg, s64 v);
 };
 extern struct kvm_mips_callbacks *kvm_mips_callbacks;
 int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
@@ -609,7 +714,16 @@ extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
 extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
                                                         struct kvm_run *run);
 
-enum emulation_result kvm_mips_emulate_count(struct kvm_vcpu *vcpu);
+uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu);
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count);
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare);
+void kvm_mips_init_count(struct kvm_vcpu *vcpu);
+int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
+int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
+int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz);
+void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu);
+void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu);
+enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu);
 
 enum emulation_result kvm_mips_check_privilege(unsigned long cause,
                                               uint32_t *opc,
@@ -646,7 +760,6 @@ extern int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc,
                               struct kvm_vcpu *vcpu);
 
 /* Misc */
-extern void mips32_SyncICache(unsigned long addr, unsigned long size);
 extern int kvm_mips_dump_stats(struct kvm_vcpu *vcpu);
 extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm);
 
index f09ff5ae20593497f9888796c143895122f2b1c4..2c04b6d9ff85380de722745e934944411a5e33d3 100644 (file)
@@ -106,6 +106,41 @@ struct kvm_fpu {
 #define KVM_REG_MIPS_LO (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 33)
 #define KVM_REG_MIPS_PC (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 34)
 
+/* KVM specific control registers */
+
+/*
+ * CP0_Count control
+ * DC:    Set 1: Master disable CP0_Count and set COUNT_RESUME to now
+ *        Set 0: Master re-enable CP0_Count with unchanged bias, handling timer
+ *               interrupts since COUNT_RESUME
+ *        This can be used to freeze the timer to get a consistent snapshot of
+ *        the CP0_Count and timer interrupt pending state, while also resuming
+ *        safely without losing time or guest timer interrupts.
+ * Other: Reserved, do not change.
+ */
+#define KVM_REG_MIPS_COUNT_CTL         (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \
+                                        0x20000 | 0)
+#define KVM_REG_MIPS_COUNT_CTL_DC      0x00000001
+
+/*
+ * CP0_Count resume monotonic nanoseconds
+ * The monotonic nanosecond time of the last set of COUNT_CTL.DC (master
+ * disable). Any reads and writes of Count related registers while
+ * COUNT_CTL.DC=1 will appear to occur at this time. When COUNT_CTL.DC is
+ * cleared again (master enable) any timer interrupts since this time will be
+ * emulated.
+ * Modifications to times in the future are rejected.
+ */
+#define KVM_REG_MIPS_COUNT_RESUME      (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \
+                                        0x20000 | 1)
+/*
+ * CP0_Count rate in Hz
+ * Specifies the rate of the CP0_Count timer in Hz. Modifications occur without
+ * discontinuities in CP0_Count.
+ */
+#define KVM_REG_MIPS_COUNT_HZ          (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \
+                                        0x20000 | 2)
+
 /*
  * KVM MIPS specific structures and definitions
  *
index bbace092ad0addef5277391a14944d5590b97be4..033ac343e72c81ca7a3a09cf1626359760a529a4 100644 (file)
@@ -611,35 +611,3 @@ MIPSX(exceptions):
        .word _C_LABEL(MIPSX(GuestException))   # 29
        .word _C_LABEL(MIPSX(GuestException))   # 30
        .word _C_LABEL(MIPSX(GuestException))   # 31
-
-
-/* This routine makes changes to the instruction stream effective to the hardware.
- * It should be called after the instruction stream is written.
- * On return, the new instructions are effective.
- * Inputs:
- * a0 = Start address of new instruction stream
- * a1 = Size, in bytes, of new instruction stream
- */
-
-#define HW_SYNCI_Step       $1
-LEAF(MIPSX(SyncICache))
-       .set    push
-       .set    mips32r2
-       beq     a1, zero, 20f
-        nop
-       REG_ADDU a1, a0, a1
-       rdhwr   v0, HW_SYNCI_Step
-       beq     v0, zero, 20f
-        nop
-10:
-       synci   0(a0)
-       REG_ADDU a0, a0, v0
-       sltu    v1, a0, a1
-       bne     v1, zero, 10b
-        nop
-       sync
-20:
-       jr.hb   ra
-        nop
-       .set    pop
-END(MIPSX(SyncICache))
index da5186fbd77a3a6946d7ffd125805f92d407fa65..cd5e4f568439e3dff4cc46a53b31f19d4ab082aa 100644 (file)
@@ -61,11 +61,6 @@ static int kvm_mips_reset_vcpu(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-       return gfn;
-}
-
 /* XXXKYMA: We are simulatoring a processor that has the WII bit set in Config7, so we
  * are "runnable" if interrupts are pending
  */
@@ -130,8 +125,8 @@ static void kvm_mips_init_vm_percpu(void *arg)
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
        if (atomic_inc_return(&kvm_mips_instance) == 1) {
-               kvm_info("%s: 1st KVM instance, setup host TLB parameters\n",
-                        __func__);
+               kvm_debug("%s: 1st KVM instance, setup host TLB parameters\n",
+                         __func__);
                on_each_cpu(kvm_mips_init_vm_percpu, kvm, 1);
        }
 
@@ -149,9 +144,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
                if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
                        kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]);
        }
-
-       if (kvm->arch.guest_pmap)
-               kfree(kvm->arch.guest_pmap);
+       kfree(kvm->arch.guest_pmap);
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
                kvm_arch_vcpu_free(vcpu);
@@ -186,8 +179,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 
        /* If this is the last instance, restore wired count */
        if (atomic_dec_return(&kvm_mips_instance) == 0) {
-               kvm_info("%s: last KVM instance, restoring TLB parameters\n",
-                        __func__);
+               kvm_debug("%s: last KVM instance, restoring TLB parameters\n",
+                         __func__);
                on_each_cpu(kvm_mips_uninit_tlbs, NULL, 1);
        }
 }
@@ -249,9 +242,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                                goto out;
                        }
 
-                       kvm_info
-                           ("Allocated space for Guest PMAP Table (%ld pages) @ %p\n",
-                            npages, kvm->arch.guest_pmap);
+                       kvm_debug("Allocated space for Guest PMAP Table (%ld pages) @ %p\n",
+                                 npages, kvm->arch.guest_pmap);
 
                        /* Now setup the page table */
                        for (i = 0; i < npages; i++) {
@@ -296,7 +288,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
        if (err)
                goto out_free_cpu;
 
-       kvm_info("kvm @ %p: create cpu %d at %p\n", kvm, id, vcpu);
+       kvm_debug("kvm @ %p: create cpu %d at %p\n", kvm, id, vcpu);
 
        /* Allocate space for host mode exception handlers that handle
         * guest mode exits
@@ -304,7 +296,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
        if (cpu_has_veic || cpu_has_vint) {
                size = 0x200 + VECTORSPACING * 64;
        } else {
-               size = 0x200;
+               size = 0x4000;
        }
 
        /* Save Linux EBASE */
@@ -316,8 +308,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
                err = -ENOMEM;
                goto out_free_cpu;
        }
-       kvm_info("Allocated %d bytes for KVM Exception Handlers @ %p\n",
-                ALIGN(size, PAGE_SIZE), gebase);
+       kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n",
+                 ALIGN(size, PAGE_SIZE), gebase);
 
        /* Save new ebase */
        vcpu->arch.guest_ebase = gebase;
@@ -342,15 +334,16 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 
        /* General handler, relocate to unmapped space for sanity's sake */
        offset = 0x2000;
-       kvm_info("Installing KVM Exception handlers @ %p, %#x bytes\n",
-                gebase + offset,
-                mips32_GuestExceptionEnd - mips32_GuestException);
+       kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n",
+                 gebase + offset,
+                 mips32_GuestExceptionEnd - mips32_GuestException);
 
        memcpy(gebase + offset, mips32_GuestException,
               mips32_GuestExceptionEnd - mips32_GuestException);
 
        /* Invalidate the icache for these ranges */
-       mips32_SyncICache((unsigned long) gebase, ALIGN(size, PAGE_SIZE));
+       local_flush_icache_range((unsigned long)gebase,
+                               (unsigned long)gebase + ALIGN(size, PAGE_SIZE));
 
        /* Allocate comm page for guest kernel, a TLB will be reserved for mapping GVA @ 0xFFFF8000 to this page */
        vcpu->arch.kseg0_commpage = kzalloc(PAGE_SIZE << 1, GFP_KERNEL);
@@ -360,14 +353,14 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
                goto out_free_gebase;
        }
 
-       kvm_info("Allocated COMM page @ %p\n", vcpu->arch.kseg0_commpage);
+       kvm_debug("Allocated COMM page @ %p\n", vcpu->arch.kseg0_commpage);
        kvm_mips_commpage_init(vcpu);
 
        /* Init */
        vcpu->arch.last_sched_cpu = -1;
 
        /* Start off the timer */
-       kvm_mips_emulate_count(vcpu);
+       kvm_mips_init_count(vcpu);
 
        return vcpu;
 
@@ -389,12 +382,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 
        kvm_mips_dump_stats(vcpu);
 
-       if (vcpu->arch.guest_ebase)
-               kfree(vcpu->arch.guest_ebase);
-
-       if (vcpu->arch.kseg0_commpage)
-               kfree(vcpu->arch.kseg0_commpage);
-
+       kfree(vcpu->arch.guest_ebase);
+       kfree(vcpu->arch.kseg0_commpage);
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -423,11 +412,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                vcpu->mmio_needed = 0;
        }
 
+       local_irq_disable();
        /* Check if we have any exceptions/interrupts pending */
        kvm_mips_deliver_interrupts(vcpu,
                                    kvm_read_c0_guest_cause(vcpu->arch.cop0));
 
-       local_irq_disable();
        kvm_guest_enter();
 
        r = __kvm_mips_vcpu_run(run, vcpu);
@@ -490,36 +479,6 @@ kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
        return -ENOIOCTLCMD;
 }
 
-#define MIPS_CP0_32(_R, _S)                                    \
-       (KVM_REG_MIPS | KVM_REG_SIZE_U32 | 0x10000 | (8 * (_R) + (_S)))
-
-#define MIPS_CP0_64(_R, _S)                                    \
-       (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 0x10000 | (8 * (_R) + (_S)))
-
-#define KVM_REG_MIPS_CP0_INDEX         MIPS_CP0_32(0, 0)
-#define KVM_REG_MIPS_CP0_ENTRYLO0      MIPS_CP0_64(2, 0)
-#define KVM_REG_MIPS_CP0_ENTRYLO1      MIPS_CP0_64(3, 0)
-#define KVM_REG_MIPS_CP0_CONTEXT       MIPS_CP0_64(4, 0)
-#define KVM_REG_MIPS_CP0_USERLOCAL     MIPS_CP0_64(4, 2)
-#define KVM_REG_MIPS_CP0_PAGEMASK      MIPS_CP0_32(5, 0)
-#define KVM_REG_MIPS_CP0_PAGEGRAIN     MIPS_CP0_32(5, 1)
-#define KVM_REG_MIPS_CP0_WIRED         MIPS_CP0_32(6, 0)
-#define KVM_REG_MIPS_CP0_HWRENA                MIPS_CP0_32(7, 0)
-#define KVM_REG_MIPS_CP0_BADVADDR      MIPS_CP0_64(8, 0)
-#define KVM_REG_MIPS_CP0_COUNT         MIPS_CP0_32(9, 0)
-#define KVM_REG_MIPS_CP0_ENTRYHI       MIPS_CP0_64(10, 0)
-#define KVM_REG_MIPS_CP0_COMPARE       MIPS_CP0_32(11, 0)
-#define KVM_REG_MIPS_CP0_STATUS                MIPS_CP0_32(12, 0)
-#define KVM_REG_MIPS_CP0_CAUSE         MIPS_CP0_32(13, 0)
-#define KVM_REG_MIPS_CP0_EBASE         MIPS_CP0_64(15, 1)
-#define KVM_REG_MIPS_CP0_CONFIG                MIPS_CP0_32(16, 0)
-#define KVM_REG_MIPS_CP0_CONFIG1       MIPS_CP0_32(16, 1)
-#define KVM_REG_MIPS_CP0_CONFIG2       MIPS_CP0_32(16, 2)
-#define KVM_REG_MIPS_CP0_CONFIG3       MIPS_CP0_32(16, 3)
-#define KVM_REG_MIPS_CP0_CONFIG7       MIPS_CP0_32(16, 7)
-#define KVM_REG_MIPS_CP0_XCONTEXT      MIPS_CP0_64(20, 0)
-#define KVM_REG_MIPS_CP0_ERROREPC      MIPS_CP0_64(30, 0)
-
 static u64 kvm_mips_get_one_regs[] = {
        KVM_REG_MIPS_R0,
        KVM_REG_MIPS_R1,
@@ -560,25 +519,34 @@ static u64 kvm_mips_get_one_regs[] = {
 
        KVM_REG_MIPS_CP0_INDEX,
        KVM_REG_MIPS_CP0_CONTEXT,
+       KVM_REG_MIPS_CP0_USERLOCAL,
        KVM_REG_MIPS_CP0_PAGEMASK,
        KVM_REG_MIPS_CP0_WIRED,
+       KVM_REG_MIPS_CP0_HWRENA,
        KVM_REG_MIPS_CP0_BADVADDR,
+       KVM_REG_MIPS_CP0_COUNT,
        KVM_REG_MIPS_CP0_ENTRYHI,
+       KVM_REG_MIPS_CP0_COMPARE,
        KVM_REG_MIPS_CP0_STATUS,
        KVM_REG_MIPS_CP0_CAUSE,
-       /* EPC set via kvm_regs, et al. */
+       KVM_REG_MIPS_CP0_EPC,
        KVM_REG_MIPS_CP0_CONFIG,
        KVM_REG_MIPS_CP0_CONFIG1,
        KVM_REG_MIPS_CP0_CONFIG2,
        KVM_REG_MIPS_CP0_CONFIG3,
        KVM_REG_MIPS_CP0_CONFIG7,
-       KVM_REG_MIPS_CP0_ERROREPC
+       KVM_REG_MIPS_CP0_ERROREPC,
+
+       KVM_REG_MIPS_COUNT_CTL,
+       KVM_REG_MIPS_COUNT_RESUME,
+       KVM_REG_MIPS_COUNT_HZ,
 };
 
 static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
                            const struct kvm_one_reg *reg)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
+       int ret;
        s64 v;
 
        switch (reg->id) {
@@ -601,24 +569,36 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_CONTEXT:
                v = (long)kvm_read_c0_guest_context(cop0);
                break;
+       case KVM_REG_MIPS_CP0_USERLOCAL:
+               v = (long)kvm_read_c0_guest_userlocal(cop0);
+               break;
        case KVM_REG_MIPS_CP0_PAGEMASK:
                v = (long)kvm_read_c0_guest_pagemask(cop0);
                break;
        case KVM_REG_MIPS_CP0_WIRED:
                v = (long)kvm_read_c0_guest_wired(cop0);
                break;
+       case KVM_REG_MIPS_CP0_HWRENA:
+               v = (long)kvm_read_c0_guest_hwrena(cop0);
+               break;
        case KVM_REG_MIPS_CP0_BADVADDR:
                v = (long)kvm_read_c0_guest_badvaddr(cop0);
                break;
        case KVM_REG_MIPS_CP0_ENTRYHI:
                v = (long)kvm_read_c0_guest_entryhi(cop0);
                break;
+       case KVM_REG_MIPS_CP0_COMPARE:
+               v = (long)kvm_read_c0_guest_compare(cop0);
+               break;
        case KVM_REG_MIPS_CP0_STATUS:
                v = (long)kvm_read_c0_guest_status(cop0);
                break;
        case KVM_REG_MIPS_CP0_CAUSE:
                v = (long)kvm_read_c0_guest_cause(cop0);
                break;
+       case KVM_REG_MIPS_CP0_EPC:
+               v = (long)kvm_read_c0_guest_epc(cop0);
+               break;
        case KVM_REG_MIPS_CP0_ERROREPC:
                v = (long)kvm_read_c0_guest_errorepc(cop0);
                break;
@@ -637,6 +617,15 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_CONFIG7:
                v = (long)kvm_read_c0_guest_config7(cop0);
                break;
+       /* registers to be handled specially */
+       case KVM_REG_MIPS_CP0_COUNT:
+       case KVM_REG_MIPS_COUNT_CTL:
+       case KVM_REG_MIPS_COUNT_RESUME:
+       case KVM_REG_MIPS_COUNT_HZ:
+               ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
+               if (ret)
+                       return ret;
+               break;
        default:
                return -EINVAL;
        }
@@ -697,12 +686,18 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_CONTEXT:
                kvm_write_c0_guest_context(cop0, v);
                break;
+       case KVM_REG_MIPS_CP0_USERLOCAL:
+               kvm_write_c0_guest_userlocal(cop0, v);
+               break;
        case KVM_REG_MIPS_CP0_PAGEMASK:
                kvm_write_c0_guest_pagemask(cop0, v);
                break;
        case KVM_REG_MIPS_CP0_WIRED:
                kvm_write_c0_guest_wired(cop0, v);
                break;
+       case KVM_REG_MIPS_CP0_HWRENA:
+               kvm_write_c0_guest_hwrena(cop0, v);
+               break;
        case KVM_REG_MIPS_CP0_BADVADDR:
                kvm_write_c0_guest_badvaddr(cop0, v);
                break;
@@ -712,12 +707,20 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_STATUS:
                kvm_write_c0_guest_status(cop0, v);
                break;
-       case KVM_REG_MIPS_CP0_CAUSE:
-               kvm_write_c0_guest_cause(cop0, v);
+       case KVM_REG_MIPS_CP0_EPC:
+               kvm_write_c0_guest_epc(cop0, v);
                break;
        case KVM_REG_MIPS_CP0_ERROREPC:
                kvm_write_c0_guest_errorepc(cop0, v);
                break;
+       /* registers to be handled specially */
+       case KVM_REG_MIPS_CP0_COUNT:
+       case KVM_REG_MIPS_CP0_COMPARE:
+       case KVM_REG_MIPS_CP0_CAUSE:
+       case KVM_REG_MIPS_COUNT_CTL:
+       case KVM_REG_MIPS_COUNT_RESUME:
+       case KVM_REG_MIPS_COUNT_HZ:
+               return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
        default:
                return -EINVAL;
        }
@@ -920,7 +923,7 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu)
                return -1;
 
        printk("VCPU Register Dump:\n");
-       printk("\tpc = 0x%08lx\n", vcpu->arch.pc);;
+       printk("\tpc = 0x%08lx\n", vcpu->arch.pc);
        printk("\texceptions: %08lx\n", vcpu->arch.pending_exceptions);
 
        for (i = 0; i < 32; i += 4) {
@@ -969,7 +972,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        return 0;
 }
 
-void kvm_mips_comparecount_func(unsigned long data)
+static void kvm_mips_comparecount_func(unsigned long data)
 {
        struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
 
@@ -984,15 +987,13 @@ void kvm_mips_comparecount_func(unsigned long data)
 /*
  * low level hrtimer wake routine.
  */
-enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer)
+static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer)
 {
        struct kvm_vcpu *vcpu;
 
        vcpu = container_of(timer, struct kvm_vcpu, arch.comparecount_timer);
        kvm_mips_comparecount_func((unsigned long) vcpu);
-       hrtimer_forward_now(&vcpu->arch.comparecount_timer,
-                           ktime_set(0, MS_TO_NS(10)));
-       return HRTIMER_RESTART;
+       return kvm_mips_count_timeout(vcpu);
 }
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
index 96528e2d1ea678dd335232eed86556f00963882d..b80e41d858fd7150423285968908ed7c4f3677fc 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
 #include <linux/bootmem.h>
+#include <asm/cacheflush.h>
 
 #include "kvm_mips_comm.h"
 
@@ -40,7 +41,7 @@ kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
            CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
                       (vcpu, (unsigned long) opc));
        memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
-       mips32_SyncICache(kseg0_opc, 32);
+       local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
 
        return result;
 }
@@ -66,7 +67,7 @@ kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
            CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
                       (vcpu, (unsigned long) opc));
        memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
-       mips32_SyncICache(kseg0_opc, 32);
+       local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
 
        return result;
 }
@@ -99,11 +100,12 @@ kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
                    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
                               (vcpu, (unsigned long) opc));
                memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(uint32_t));
-               mips32_SyncICache(kseg0_opc, 32);
+               local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
        } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
                local_irq_save(flags);
                memcpy((void *)opc, (void *)&mfc0_inst, sizeof(uint32_t));
-               mips32_SyncICache((unsigned long) opc, 32);
+               local_flush_icache_range((unsigned long)opc,
+                                        (unsigned long)opc + 32);
                local_irq_restore(flags);
        } else {
                kvm_err("%s: Invalid address: %p\n", __func__, opc);
@@ -134,11 +136,12 @@ kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
                    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
                               (vcpu, (unsigned long) opc));
                memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(uint32_t));
-               mips32_SyncICache(kseg0_opc, 32);
+               local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
        } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
                local_irq_save(flags);
                memcpy((void *)opc, (void *)&mtc0_inst, sizeof(uint32_t));
-               mips32_SyncICache((unsigned long) opc, 32);
+               local_flush_icache_range((unsigned long)opc,
+                                        (unsigned long)opc + 32);
                local_irq_restore(flags);
        } else {
                kvm_err("%s: Invalid address: %p\n", __func__, opc);
index e3fec99941a7de2caaae5eb6bd28563f48c9aac3..8d484009008283e9afdd0da7344c99cf4b8ec8e6 100644 (file)
@@ -11,6 +11,7 @@
 
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/ktime.h>
 #include <linux/kvm_host.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
@@ -228,25 +229,520 @@ enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause)
        return er;
 }
 
-/* Everytime the compare register is written to, we need to decide when to fire
- * the timer that represents timer ticks to the GUEST.
+/**
+ * kvm_mips_count_disabled() - Find whether the CP0_Count timer is disabled.
+ * @vcpu:      Virtual CPU.
  *
+ * Returns:    1 if the CP0_Count timer is disabled by either the guest
+ *             CP0_Cause.DC bit or the count_ctl.DC bit.
+ *             0 otherwise (in which case CP0_Count timer is running).
  */
-enum emulation_result kvm_mips_emulate_count(struct kvm_vcpu *vcpu)
+static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       enum emulation_result er = EMULATE_DONE;
+       return  (vcpu->arch.count_ctl & KVM_REG_MIPS_COUNT_CTL_DC) ||
+               (kvm_read_c0_guest_cause(cop0) & CAUSEF_DC);
+}
+
+/**
+ * kvm_mips_ktime_to_count() - Scale ktime_t to a 32-bit count.
+ *
+ * Caches the dynamic nanosecond bias in vcpu->arch.count_dyn_bias.
+ *
+ * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
+ */
+static uint32_t kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
+{
+       s64 now_ns, periods;
+       u64 delta;
+
+       now_ns = ktime_to_ns(now);
+       delta = now_ns + vcpu->arch.count_dyn_bias;
+
+       if (delta >= vcpu->arch.count_period) {
+               /* If delta is out of safe range the bias needs adjusting */
+               periods = div64_s64(now_ns, vcpu->arch.count_period);
+               vcpu->arch.count_dyn_bias = -periods * vcpu->arch.count_period;
+               /* Recalculate delta with new bias */
+               delta = now_ns + vcpu->arch.count_dyn_bias;
+       }
+
+       /*
+        * We've ensured that:
+        *   delta < count_period
+        *
+        * Therefore the intermediate delta*count_hz will never overflow since
+        * at the boundary condition:
+        *   delta = count_period
+        *   delta = NSEC_PER_SEC * 2^32 / count_hz
+        *   delta * count_hz = NSEC_PER_SEC * 2^32
+        */
+       return div_u64(delta * vcpu->arch.count_hz, NSEC_PER_SEC);
+}
+
+/**
+ * kvm_mips_count_time() - Get effective current time.
+ * @vcpu:      Virtual CPU.
+ *
+ * Get effective monotonic ktime. This is usually a straightforward ktime_get(),
+ * except when the master disable bit is set in count_ctl, in which case it is
+ * count_resume, i.e. the time that the count was disabled.
+ *
+ * Returns:    Effective monotonic ktime for CP0_Count.
+ */
+static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(vcpu->arch.count_ctl & KVM_REG_MIPS_COUNT_CTL_DC))
+               return vcpu->arch.count_resume;
+
+       return ktime_get();
+}
+
+/**
+ * kvm_mips_read_count_running() - Read the current count value as if running.
+ * @vcpu:      Virtual CPU.
+ * @now:       Kernel time to read CP0_Count at.
+ *
+ * Returns the current guest CP0_Count register at time @now and handles if the
+ * timer interrupt is pending and hasn't been handled yet.
+ *
+ * Returns:    The current value of the guest CP0_Count register.
+ */
+static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
+{
+       ktime_t expires;
+       int running;
+
+       /* Is the hrtimer pending? */
+       expires = hrtimer_get_expires(&vcpu->arch.comparecount_timer);
+       if (ktime_compare(now, expires) >= 0) {
+               /*
+                * Cancel it while we handle it so there's no chance of
+                * interference with the timeout handler.
+                */
+               running = hrtimer_cancel(&vcpu->arch.comparecount_timer);
+
+               /* Nothing should be waiting on the timeout */
+               kvm_mips_callbacks->queue_timer_int(vcpu);
+
+               /*
+                * Restart the timer if it was running based on the expiry time
+                * we read, so that we don't push it back 2 periods.
+                */
+               if (running) {
+                       expires = ktime_add_ns(expires,
+                                              vcpu->arch.count_period);
+                       hrtimer_start(&vcpu->arch.comparecount_timer, expires,
+                                     HRTIMER_MODE_ABS);
+               }
+       }
+
+       /* Return the biased and scaled guest CP0_Count */
+       return vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now);
+}
+
+/**
+ * kvm_mips_read_count() - Read the current count value.
+ * @vcpu:      Virtual CPU.
+ *
+ * Read the current guest CP0_Count value, taking into account whether the timer
+ * is stopped.
+ *
+ * Returns:    The current guest CP0_Count value.
+ */
+uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+       /* If count disabled just read static copy of count */
+       if (kvm_mips_count_disabled(vcpu))
+               return kvm_read_c0_guest_count(cop0);
+
+       return kvm_mips_read_count_running(vcpu, ktime_get());
+}
+
+/**
+ * kvm_mips_freeze_hrtimer() - Safely stop the hrtimer.
+ * @vcpu:      Virtual CPU.
+ * @count:     Output pointer for CP0_Count value at point of freeze.
+ *
+ * Freeze the hrtimer safely and return both the ktime and the CP0_Count value
+ * at the point it was frozen. It is guaranteed that any pending interrupts at
+ * the point it was frozen are handled, and none after that point.
+ *
+ * This is useful where the time/CP0_Count is needed in the calculation of the
+ * new parameters.
+ *
+ * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
+ *
+ * Returns:    The ktime at the point of freeze.
+ */
+static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
+                                      uint32_t *count)
+{
+       ktime_t now;
+
+       /* stop hrtimer before finding time */
+       hrtimer_cancel(&vcpu->arch.comparecount_timer);
+       now = ktime_get();
+
+       /* find count at this point and handle pending hrtimer */
+       *count = kvm_mips_read_count_running(vcpu, now);
+
+       return now;
+}
+
 
-       /* If COUNT is enabled */
-       if (!(kvm_read_c0_guest_cause(cop0) & CAUSEF_DC)) {
-               hrtimer_try_to_cancel(&vcpu->arch.comparecount_timer);
-               hrtimer_start(&vcpu->arch.comparecount_timer,
-                             ktime_set(0, MS_TO_NS(10)), HRTIMER_MODE_REL);
+/**
+ * kvm_mips_resume_hrtimer() - Resume hrtimer, updating expiry.
+ * @vcpu:      Virtual CPU.
+ * @now:       ktime at point of resume.
+ * @count:     CP0_Count at point of resume.
+ *
+ * Resumes the timer and updates the timer expiry based on @now and @count.
+ * This can be used in conjunction with kvm_mips_freeze_hrtimer() when timer
+ * parameters need to be changed.
+ *
+ * It is guaranteed that a timer interrupt immediately after resume will be
+ * handled, but not if CP0_Compare is exactly at @count. That case is already
+ * handled by kvm_mips_freeze_hrtimer().
+ *
+ * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
+ */
+static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
+                                   ktime_t now, uint32_t count)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       uint32_t compare;
+       u64 delta;
+       ktime_t expire;
+
+       /* Calculate timeout (wrap 0 to 2^32) */
+       compare = kvm_read_c0_guest_compare(cop0);
+       delta = (u64)(uint32_t)(compare - count - 1) + 1;
+       delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz);
+       expire = ktime_add_ns(now, delta);
+
+       /* Update hrtimer to use new timeout */
+       hrtimer_cancel(&vcpu->arch.comparecount_timer);
+       hrtimer_start(&vcpu->arch.comparecount_timer, expire, HRTIMER_MODE_ABS);
+}
+
+/**
+ * kvm_mips_update_hrtimer() - Update next expiry time of hrtimer.
+ * @vcpu:      Virtual CPU.
+ *
+ * Recalculates and updates the expiry time of the hrtimer. This can be used
+ * after timer parameters have been altered which do not depend on the time that
+ * the change occurs (in those cases kvm_mips_freeze_hrtimer() and
+ * kvm_mips_resume_hrtimer() are used directly).
+ *
+ * It is guaranteed that no timer interrupts will be lost in the process.
+ *
+ * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
+ */
+static void kvm_mips_update_hrtimer(struct kvm_vcpu *vcpu)
+{
+       ktime_t now;
+       uint32_t count;
+
+       /*
+        * freeze_hrtimer takes care of timer interrupts <= count, and
+        * resume_hrtimer takes care of timer interrupts > count.
+        */
+       now = kvm_mips_freeze_hrtimer(vcpu, &count);
+       kvm_mips_resume_hrtimer(vcpu, now, count);
+}
+
+/**
+ * kvm_mips_write_count() - Modify the count and update timer.
+ * @vcpu:      Virtual CPU.
+ * @count:     Guest CP0_Count value to set.
+ *
+ * Sets the CP0_Count value and updates the timer accordingly.
+ */
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       ktime_t now;
+
+       /* Calculate bias */
+       now = kvm_mips_count_time(vcpu);
+       vcpu->arch.count_bias = count - kvm_mips_ktime_to_count(vcpu, now);
+
+       if (kvm_mips_count_disabled(vcpu))
+               /* The timer's disabled, adjust the static count */
+               kvm_write_c0_guest_count(cop0, count);
+       else
+               /* Update timeout */
+               kvm_mips_resume_hrtimer(vcpu, now, count);
+}
+
+/**
+ * kvm_mips_init_count() - Initialise timer.
+ * @vcpu:      Virtual CPU.
+ *
+ * Initialise the timer to a sensible frequency, namely 100MHz, zero it, and set
+ * it going if it's enabled.
+ */
+void kvm_mips_init_count(struct kvm_vcpu *vcpu)
+{
+       /* 100 MHz */
+       vcpu->arch.count_hz = 100*1000*1000;
+       vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32,
+                                         vcpu->arch.count_hz);
+       vcpu->arch.count_dyn_bias = 0;
+
+       /* Starting at 0 */
+       kvm_mips_write_count(vcpu, 0);
+}
+
+/**
+ * kvm_mips_set_count_hz() - Update the frequency of the timer.
+ * @vcpu:      Virtual CPU.
+ * @count_hz:  Frequency of CP0_Count timer in Hz.
+ *
+ * Change the frequency of the CP0_Count timer. This is done atomically so that
+ * CP0_Count is continuous and no timer interrupt is lost.
+ *
+ * Returns:    -EINVAL if @count_hz is out of range.
+ *             0 on success.
+ */
+int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       int dc;
+       ktime_t now;
+       u32 count;
+
+       /* ensure the frequency is in a sensible range... */
+       if (count_hz <= 0 || count_hz > NSEC_PER_SEC)
+               return -EINVAL;
+       /* ... and has actually changed */
+       if (vcpu->arch.count_hz == count_hz)
+               return 0;
+
+       /* Safely freeze timer so we can keep it continuous */
+       dc = kvm_mips_count_disabled(vcpu);
+       if (dc) {
+               now = kvm_mips_count_time(vcpu);
+               count = kvm_read_c0_guest_count(cop0);
        } else {
-               hrtimer_try_to_cancel(&vcpu->arch.comparecount_timer);
+               now = kvm_mips_freeze_hrtimer(vcpu, &count);
        }
 
-       return er;
+       /* Update the frequency */
+       vcpu->arch.count_hz = count_hz;
+       vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32, count_hz);
+       vcpu->arch.count_dyn_bias = 0;
+
+       /* Calculate adjusted bias so dynamic count is unchanged */
+       vcpu->arch.count_bias = count - kvm_mips_ktime_to_count(vcpu, now);
+
+       /* Update and resume hrtimer */
+       if (!dc)
+               kvm_mips_resume_hrtimer(vcpu, now, count);
+       return 0;
+}
+
+/**
+ * kvm_mips_write_compare() - Modify compare and update timer.
+ * @vcpu:      Virtual CPU.
+ * @compare:   New CP0_Compare value.
+ *
+ * Update CP0_Compare to a new value and update the timeout.
+ */
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+       /* if unchanged, must just be an ack */
+       if (kvm_read_c0_guest_compare(cop0) == compare)
+               return;
+
+       /* Update compare */
+       kvm_write_c0_guest_compare(cop0, compare);
+
+       /* Update timeout if count enabled */
+       if (!kvm_mips_count_disabled(vcpu))
+               kvm_mips_update_hrtimer(vcpu);
+}
+
+/**
+ * kvm_mips_count_disable() - Disable count.
+ * @vcpu:      Virtual CPU.
+ *
+ * Disable the CP0_Count timer. A timer interrupt on or before the final stop
+ * time will be handled but not after.
+ *
+ * Assumes CP0_Count was previously enabled but now Guest.CP0_Cause.DC or
+ * count_ctl.DC has been set (count disabled).
+ *
+ * Returns:    The time that the timer was stopped.
+ */
+static ktime_t kvm_mips_count_disable(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       uint32_t count;
+       ktime_t now;
+
+       /* Stop hrtimer */
+       hrtimer_cancel(&vcpu->arch.comparecount_timer);
+
+       /* Set the static count from the dynamic count, handling pending TI */
+       now = ktime_get();
+       count = kvm_mips_read_count_running(vcpu, now);
+       kvm_write_c0_guest_count(cop0, count);
+
+       return now;
+}
+
+/**
+ * kvm_mips_count_disable_cause() - Disable count using CP0_Cause.DC.
+ * @vcpu:      Virtual CPU.
+ *
+ * Disable the CP0_Count timer and set CP0_Cause.DC. A timer interrupt on or
+ * before the final stop time will be handled if the timer isn't disabled by
+ * count_ctl.DC, but not after.
+ *
+ * Assumes CP0_Cause.DC is clear (count enabled).
+ */
+void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+       kvm_set_c0_guest_cause(cop0, CAUSEF_DC);
+       if (!(vcpu->arch.count_ctl & KVM_REG_MIPS_COUNT_CTL_DC))
+               kvm_mips_count_disable(vcpu);
+}
+
+/**
+ * kvm_mips_count_enable_cause() - Enable count using CP0_Cause.DC.
+ * @vcpu:      Virtual CPU.
+ *
+ * Enable the CP0_Count timer and clear CP0_Cause.DC. A timer interrupt after
+ * the start time will be handled if the timer isn't disabled by count_ctl.DC,
+ * potentially before even returning, so the caller should be careful with
+ * ordering of CP0_Cause modifications so as not to lose it.
+ *
+ * Assumes CP0_Cause.DC is set (count disabled).
+ */
+void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       uint32_t count;
+
+       kvm_clear_c0_guest_cause(cop0, CAUSEF_DC);
+
+       /*
+        * Set the dynamic count to match the static count.
+        * This starts the hrtimer if count_ctl.DC allows it.
+        * Otherwise it conveniently updates the biases.
+        */
+       count = kvm_read_c0_guest_count(cop0);
+       kvm_mips_write_count(vcpu, count);
+}
+
+/**
+ * kvm_mips_set_count_ctl() - Update the count control KVM register.
+ * @vcpu:      Virtual CPU.
+ * @count_ctl: Count control register new value.
+ *
+ * Set the count control KVM register. The timer is updated accordingly.
+ *
+ * Returns:    -EINVAL if reserved bits are set.
+ *             0 on success.
+ */
+int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       s64 changed = count_ctl ^ vcpu->arch.count_ctl;
+       s64 delta;
+       ktime_t expire, now;
+       uint32_t count, compare;
+
+       /* Only allow defined bits to be changed */
+       if (changed & ~(s64)(KVM_REG_MIPS_COUNT_CTL_DC))
+               return -EINVAL;
+
+       /* Apply new value */
+       vcpu->arch.count_ctl = count_ctl;
+
+       /* Master CP0_Count disable */
+       if (changed & KVM_REG_MIPS_COUNT_CTL_DC) {
+               /* Is CP0_Cause.DC already disabling CP0_Count? */
+               if (kvm_read_c0_guest_cause(cop0) & CAUSEF_DC) {
+                       if (count_ctl & KVM_REG_MIPS_COUNT_CTL_DC)
+                               /* Just record the current time */
+                               vcpu->arch.count_resume = ktime_get();
+               } else if (count_ctl & KVM_REG_MIPS_COUNT_CTL_DC) {
+                       /* disable timer and record current time */
+                       vcpu->arch.count_resume = kvm_mips_count_disable(vcpu);
+               } else {
+                       /*
+                        * Calculate timeout relative to static count at resume
+                        * time (wrap 0 to 2^32).
+                        */
+                       count = kvm_read_c0_guest_count(cop0);
+                       compare = kvm_read_c0_guest_compare(cop0);
+                       delta = (u64)(uint32_t)(compare - count - 1) + 1;
+                       delta = div_u64(delta * NSEC_PER_SEC,
+                                       vcpu->arch.count_hz);
+                       expire = ktime_add_ns(vcpu->arch.count_resume, delta);
+
+                       /* Handle pending interrupt */
+                       now = ktime_get();
+                       if (ktime_compare(now, expire) >= 0)
+                               /* Nothing should be waiting on the timeout */
+                               kvm_mips_callbacks->queue_timer_int(vcpu);
+
+                       /* Resume hrtimer without changing bias */
+                       count = kvm_mips_read_count_running(vcpu, now);
+                       kvm_mips_resume_hrtimer(vcpu, now, count);
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * kvm_mips_set_count_resume() - Update the count resume KVM register.
+ * @vcpu:              Virtual CPU.
+ * @count_resume:      Count resume register new value.
+ *
+ * Set the count resume KVM register.
+ *
+ * Returns:    -EINVAL if out of valid range (0..now).
+ *             0 on success.
+ */
+int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume)
+{
+       /*
+        * It doesn't make sense for the resume time to be in the future, as it
+        * would be possible for the next interrupt to be more than a full
+        * period in the future.
+        */
+       if (count_resume < 0 || count_resume > ktime_to_ns(ktime_get()))
+               return -EINVAL;
+
+       vcpu->arch.count_resume = ns_to_ktime(count_resume);
+       return 0;
+}
+
+/**
+ * kvm_mips_count_timeout() - Push timer forward on timeout.
+ * @vcpu:      Virtual CPU.
+ *
+ * Handle an hrtimer event by pushing the hrtimer forward a period.
+ *
+ * Returns:    The hrtimer_restart value to return to the hrtimer subsystem.
+ */
+enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu)
+{
+       /* Add the Count period to the current expiry time */
+       hrtimer_add_expires_ns(&vcpu->arch.comparecount_timer,
+                              vcpu->arch.count_period);
+       return HRTIMER_RESTART;
 }
 
 enum emulation_result kvm_mips_emul_eret(struct kvm_vcpu *vcpu)
@@ -471,8 +967,7 @@ kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, uint32_t cause,
 #endif
                        /* Get reg */
                        if ((rd == MIPS_CP0_COUNT) && (sel == 0)) {
-                               /* XXXKYMA: Run the Guest count register @ 1/4 the rate of the host */
-                               vcpu->arch.gprs[rt] = (read_c0_count() >> 2);
+                               vcpu->arch.gprs[rt] = kvm_mips_read_count(vcpu);
                        } else if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
                                vcpu->arch.gprs[rt] = 0x0;
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
@@ -539,10 +1034,7 @@ kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, uint32_t cause,
                        }
                        /* Are we writing to COUNT */
                        else if ((rd == MIPS_CP0_COUNT) && (sel == 0)) {
-                               /* Linux doesn't seem to write into COUNT, we throw an error
-                                * if we notice a write to COUNT
-                                */
-                               /*er = EMULATE_FAIL; */
+                               kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
                                goto done;
                        } else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) {
                                kvm_debug("[%#x] MTCz, COMPARE %#lx <- %#lx\n",
@@ -552,8 +1044,8 @@ kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, uint32_t cause,
                                /* If we are writing to COMPARE */
                                /* Clear pending timer interrupt, if any */
                                kvm_mips_callbacks->dequeue_timer_int(vcpu);
-                               kvm_write_c0_guest_compare(cop0,
-                                                          vcpu->arch.gprs[rt]);
+                               kvm_mips_write_compare(vcpu,
+                                                      vcpu->arch.gprs[rt]);
                        } else if ((rd == MIPS_CP0_STATUS) && (sel == 0)) {
                                kvm_write_c0_guest_status(cop0,
                                                          vcpu->arch.gprs[rt]);
@@ -564,6 +1056,20 @@ kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, uint32_t cause,
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
                                kvm_mips_trans_mtc0(inst, opc, vcpu);
 #endif
+                       } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
+                               uint32_t old_cause, new_cause;
+                               old_cause = kvm_read_c0_guest_cause(cop0);
+                               new_cause = vcpu->arch.gprs[rt];
+                               /* Update R/W bits */
+                               kvm_change_c0_guest_cause(cop0, 0x08800300,
+                                                         new_cause);
+                               /* DC bit enabling/disabling timer? */
+                               if ((old_cause ^ new_cause) & CAUSEF_DC) {
+                                       if (new_cause & CAUSEF_DC)
+                                               kvm_mips_count_disable_cause(vcpu);
+                                       else
+                                               kvm_mips_count_enable_cause(vcpu);
+                               }
                        } else {
                                cop0->reg[rd][sel] = vcpu->arch.gprs[rt];
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
@@ -887,7 +1393,7 @@ int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
 
        printk("%s: va: %#lx, unmapped: %#x\n", __func__, va, CKSEG0ADDR(pa));
 
-       mips32_SyncICache(CKSEG0ADDR(pa), 32);
+       local_flush_icache_range(CKSEG0ADDR(pa), 32);
        return 0;
 }
 
@@ -1325,8 +1831,12 @@ kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
                       struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
        enum emulation_result er = EMULATE_DONE;
-
 #ifdef DEBUG
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
+                               (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+       int index;
+
        /*
         * If address not in the guest TLB, then we are in trouble
         */
@@ -1553,8 +2063,7 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
                                             current_cpu_data.icache.linesz);
                        break;
                case 2: /* Read count register */
-                       printk("RDHWR: Cont register\n");
-                       arch->gprs[rt] = kvm_read_c0_guest_count(cop0);
+                       arch->gprs[rt] = kvm_mips_read_count(vcpu);
                        break;
                case 3: /* Count register resolution */
                        switch (current_cpu_data.cputype) {
@@ -1810,11 +2319,9 @@ kvm_mips_handle_tlbmiss(unsigned long cause, uint32_t *opc,
                                er = EMULATE_FAIL;
                        }
                } else {
-#ifdef DEBUG
                        kvm_debug
                            ("Injecting hi: %#lx, lo0: %#lx, lo1: %#lx into shadow host TLB\n",
                             tlb->tlb_hi, tlb->tlb_lo0, tlb->tlb_lo1);
-#endif
                        /* OK we have a Guest TLB entry, now inject it into the shadow host TLB */
                        kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL,
                                                             NULL);
index 50ab9c4d4a5dc6d2cf1d98270314b8aeb102623b..8a5a700ad8deed1cf2a40b37ccdf759b1440203b 100644 (file)
@@ -222,26 +222,19 @@ kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
                return -1;
        }
 
-       if (idx < 0) {
-               idx = read_c0_random() % current_cpu_data.tlbsize;
-               write_c0_index(idx);
-               mtc0_tlbw_hazard();
-       }
        write_c0_entrylo0(entrylo0);
        write_c0_entrylo1(entrylo1);
        mtc0_tlbw_hazard();
 
-       tlb_write_indexed();
+       if (idx < 0)
+               tlb_write_random();
+       else
+               tlb_write_indexed();
        tlbw_use_hazard();
 
-#ifdef DEBUG
-       if (debug) {
-               kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] "
-                         "entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n",
-                         vcpu->arch.pc, idx, read_c0_entryhi(),
-                         read_c0_entrylo0(), read_c0_entrylo1());
-       }
-#endif
+       kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n",
+                 vcpu->arch.pc, idx, read_c0_entryhi(),
+                 read_c0_entrylo0(), read_c0_entrylo1());
 
        /* Flush D-cache */
        if (flush_dcache_mask) {
@@ -348,11 +341,9 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
        mtc0_tlbw_hazard();
        tlbw_use_hazard();
 
-#ifdef DEBUG
        kvm_debug ("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n",
             vcpu->arch.pc, read_c0_index(), read_c0_entryhi(),
             read_c0_entrylo0(), read_c0_entrylo1());
-#endif
 
        /* Restore old ASID */
        write_c0_entryhi(old_entryhi);
@@ -400,10 +391,8 @@ kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
        entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
                        (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V);
 
-#ifdef DEBUG
        kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
                  tlb->tlb_lo0, tlb->tlb_lo1);
-#endif
 
        return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
                                       tlb->tlb_mask);
@@ -424,10 +413,8 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
                }
        }
 
-#ifdef DEBUG
        kvm_debug("%s: entryhi: %#lx, index: %d lo0: %#lx, lo1: %#lx\n",
                  __func__, entryhi, index, tlb[i].tlb_lo0, tlb[i].tlb_lo1);
-#endif
 
        return index;
 }
@@ -461,9 +448,7 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr)
 
        local_irq_restore(flags);
 
-#ifdef DEBUG
        kvm_debug("Host TLB lookup, %#lx, idx: %2d\n", vaddr, idx);
-#endif
 
        return idx;
 }
@@ -508,12 +493,9 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
 
        local_irq_restore(flags);
 
-#ifdef DEBUG
-       if (idx > 0) {
+       if (idx > 0)
                kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__,
-                         (va & VPN2_MASK) | (vcpu->arch.asid_map[va & ASID_MASK] & ASID_MASK), idx);
-       }
-#endif
+                         (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx);
 
        return 0;
 }
@@ -658,15 +640,30 @@ void kvm_local_flush_tlb_all(void)
        local_irq_restore(flags);
 }
 
+/**
+ * kvm_mips_migrate_count() - Migrate timer.
+ * @vcpu:      Virtual CPU.
+ *
+ * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
+ * if it was running prior to being cancelled.
+ *
+ * Must be called when the VCPU is migrated to a different CPU to ensure that
+ * timer expiry during guest execution interrupts the guest and causes the
+ * interrupt to be delivered in a timely manner.
+ */
+static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
+{
+       if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
+               hrtimer_restart(&vcpu->arch.comparecount_timer);
+}
+
 /* Restore ASID once we are scheduled back after preemption */
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        unsigned long flags;
        int newasid = 0;
 
-#ifdef DEBUG
        kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
-#endif
 
        /* Alocate new kernel and user ASIDs if needed */
 
@@ -682,17 +679,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                    vcpu->arch.guest_user_mm.context.asid[cpu];
                newasid++;
 
-               kvm_info("[%d]: cpu_context: %#lx\n", cpu,
-                        cpu_context(cpu, current->mm));
-               kvm_info("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
-                        cpu, vcpu->arch.guest_kernel_asid[cpu]);
-               kvm_info("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
-                        vcpu->arch.guest_user_asid[cpu]);
+               kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+                         cpu_context(cpu, current->mm));
+               kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
+                         cpu, vcpu->arch.guest_kernel_asid[cpu]);
+               kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
+                         vcpu->arch.guest_user_asid[cpu]);
        }
 
        if (vcpu->arch.last_sched_cpu != cpu) {
-               kvm_info("[%d->%d]KVM VCPU[%d] switch\n",
-                        vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
+               kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
+                         vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
+               /*
+                * Migrate the timer interrupt to the current CPU so that it
+                * always interrupts the guest and synchronously triggers a
+                * guest timer interrupt.
+                */
+               kvm_mips_migrate_count(vcpu);
        }
 
        if (!newasid) {
index 30d725321db1e23dcaddd642b42edf97739448e6..693f952b2fbb532d93e76c648172dc6211c8e201 100644 (file)
@@ -32,9 +32,7 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
                gpa = KVM_INVALID_ADDR;
        }
 
-#ifdef DEBUG
        kvm_debug("%s: gva %#lx, gpa: %#llx\n", __func__, gva, gpa);
-#endif
 
        return gpa;
 }
@@ -85,11 +83,9 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
 
        if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
            || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-#ifdef DEBUG
                kvm_debug
                    ("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
                     cause, opc, badvaddr);
-#endif
                er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu);
 
                if (er == EMULATE_DONE)
@@ -138,11 +134,9 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
                }
        } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
                   || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-#ifdef DEBUG
                kvm_debug
                    ("USER ADDR TLB LD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
                     cause, opc, badvaddr);
-#endif
                er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu);
                if (er == EMULATE_DONE)
                        ret = RESUME_GUEST;
@@ -188,10 +182,8 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
                }
        } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
                   || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-#ifdef DEBUG
                kvm_debug("USER ADDR TLB ST fault: PC: %#lx, BadVaddr: %#lx\n",
                          vcpu->arch.pc, badvaddr);
-#endif
 
                /* User Address (UA) fault, this could happen if
                 * (1) TLB entry not present/valid in both Guest and shadow host TLBs, in this
@@ -236,9 +228,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 
        if (KVM_GUEST_KERNEL_MODE(vcpu)
            && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
-#ifdef DEBUG
                kvm_debug("Emulate Store to MMIO space\n");
-#endif
                er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
                if (er == EMULATE_FAIL) {
                        printk("Emulate Store to MMIO space failed\n");
@@ -268,9 +258,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
        int ret = RESUME_GUEST;
 
        if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) {
-#ifdef DEBUG
                kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr);
-#endif
                er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
                if (er == EMULATE_FAIL) {
                        printk("Emulate Load from MMIO space failed\n");
@@ -401,6 +389,78 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
+                                    const struct kvm_one_reg *reg,
+                                    s64 *v)
+{
+       switch (reg->id) {
+       case KVM_REG_MIPS_CP0_COUNT:
+               *v = kvm_mips_read_count(vcpu);
+               break;
+       case KVM_REG_MIPS_COUNT_CTL:
+               *v = vcpu->arch.count_ctl;
+               break;
+       case KVM_REG_MIPS_COUNT_RESUME:
+               *v = ktime_to_ns(vcpu->arch.count_resume);
+               break;
+       case KVM_REG_MIPS_COUNT_HZ:
+               *v = vcpu->arch.count_hz;
+               break;
+       default:
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
+                                    const struct kvm_one_reg *reg,
+                                    s64 v)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       int ret = 0;
+
+       switch (reg->id) {
+       case KVM_REG_MIPS_CP0_COUNT:
+               kvm_mips_write_count(vcpu, v);
+               break;
+       case KVM_REG_MIPS_CP0_COMPARE:
+               kvm_mips_write_compare(vcpu, v);
+               break;
+       case KVM_REG_MIPS_CP0_CAUSE:
+               /*
+                * If the timer is stopped or started (DC bit) it must look
+                * atomic with changes to the interrupt pending bits (TI, IRQ5).
+                * A timer interrupt should not happen in between.
+                */
+               if ((kvm_read_c0_guest_cause(cop0) ^ v) & CAUSEF_DC) {
+                       if (v & CAUSEF_DC) {
+                               /* disable timer first */
+                               kvm_mips_count_disable_cause(vcpu);
+                               kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v);
+                       } else {
+                               /* enable timer last */
+                               kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v);
+                               kvm_mips_count_enable_cause(vcpu);
+                       }
+               } else {
+                       kvm_write_c0_guest_cause(cop0, v);
+               }
+               break;
+       case KVM_REG_MIPS_COUNT_CTL:
+               ret = kvm_mips_set_count_ctl(vcpu, v);
+               break;
+       case KVM_REG_MIPS_COUNT_RESUME:
+               ret = kvm_mips_set_count_resume(vcpu, v);
+               break;
+       case KVM_REG_MIPS_COUNT_HZ:
+               ret = kvm_mips_set_count_hz(vcpu, v);
+               break;
+       default:
+               return -EINVAL;
+       }
+       return ret;
+}
+
 static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        /* exit handlers */
        .handle_cop_unusable = kvm_trap_emul_handle_cop_unusable,
@@ -423,6 +483,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        .dequeue_io_int = kvm_mips_dequeue_io_int_cb,
        .irq_deliver = kvm_mips_irq_deliver_cb,
        .irq_clear = kvm_mips_irq_clear_cb,
+       .get_one_reg = kvm_trap_emul_get_one_reg,
+       .set_one_reg = kvm_trap_emul_set_one_reg,
 };
 
 int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
index 9e67cdea3c7484eb46739f3f2799ef5aae58d27b..f7b91d3a371dd07e7c69c38290f1f0c77b5b3a0a 100644 (file)
@@ -31,6 +31,7 @@ void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page,
 void (*flush_icache_range)(unsigned long start, unsigned long end);
 EXPORT_SYMBOL_GPL(flush_icache_range);
 void (*local_flush_icache_range)(unsigned long start, unsigned long end);
+EXPORT_SYMBOL_GPL(local_flush_icache_range);
 
 void (*__flush_cache_vmap)(void);
 void (*__flush_cache_vunmap)(void);
index 319009912142414c72862a04adf083136f1cdd45..3778a359f3ad98279fe32f952b725ea521eb6e0a 100644 (file)
@@ -74,18 +74,8 @@ static void __init estimate_frequencies(void)
        unsigned int giccount = 0, gicstart = 0;
 #endif
 
-#if defined (CONFIG_KVM_GUEST) && defined (CONFIG_KVM_HOST_FREQ)
-       unsigned int prid = read_c0_prid() & (PRID_COMP_MASK | PRID_IMP_MASK);
-
-       /*
-        * XXXKYMA: hardwire the CPU frequency to Host Freq/4
-        */
-       count = (CONFIG_KVM_HOST_FREQ * 1000000) >> 3;
-       if ((prid != (PRID_COMP_MIPS | PRID_IMP_20KC)) &&
-           (prid != (PRID_COMP_MIPS | PRID_IMP_25KF)))
-               count *= 2;
-
-       mips_hpt_frequency = count;
+#if defined(CONFIG_KVM_GUEST) && CONFIG_KVM_GUEST_TIMER_FREQ
+       mips_hpt_frequency = CONFIG_KVM_GUEST_TIMER_FREQ * 1000000;
        return;
 #endif
 
index 856f8deb557ab9d0ef2fef9cff50d3cf67ddeb47..6330a61b875a3d145396142294e0917c7b242905 100644 (file)
@@ -81,4 +81,38 @@ static inline unsigned int get_oc(u32 inst)
 {
        return (inst >> 11) & 0x7fff;
 }
+
+#define IS_XFORM(inst) (get_op(inst)  == 31)
+#define IS_DSFORM(inst)        (get_op(inst) >= 56)
+
+/*
+ * Create a DSISR value from the instruction
+ */
+static inline unsigned make_dsisr(unsigned instr)
+{
+       unsigned dsisr;
+
+
+       /* bits  6:15 --> 22:31 */
+       dsisr = (instr & 0x03ff0000) >> 16;
+
+       if (IS_XFORM(instr)) {
+               /* bits 29:30 --> 15:16 */
+               dsisr |= (instr & 0x00000006) << 14;
+               /* bit     25 -->    17 */
+               dsisr |= (instr & 0x00000040) << 8;
+               /* bits 21:24 --> 18:21 */
+               dsisr |= (instr & 0x00000780) << 3;
+       } else {
+               /* bit      5 -->    17 */
+               dsisr |= (instr & 0x04000000) >> 12;
+               /* bits  1: 4 --> 18:21 */
+               dsisr |= (instr & 0x78000000) >> 17;
+               /* bits 30:31 --> 12:13 */
+               if (IS_DSFORM(instr))
+                       dsisr |= (instr & 0x00000003) << 18;
+       }
+
+       return dsisr;
+}
 #endif /* __ASM_PPC_DISASSEMBLE_H__ */
index 19eb74a95b592f8c41b2562d32244906debe0468..9601741080e51ca7e9611f3ecb928b05464a2965 100644 (file)
 #define BOOK3S_INTERRUPT_PERFMON       0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC       0xf20
 #define BOOK3S_INTERRUPT_VSX           0xf40
+#define BOOK3S_INTERRUPT_FAC_UNAVAIL   0xf60
 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80
 
 #define BOOK3S_IRQPRIO_SYSTEM_RESET            0
 #define BOOK3S_IRQPRIO_FP_UNAVAIL              7
 #define BOOK3S_IRQPRIO_ALTIVEC                 8
 #define BOOK3S_IRQPRIO_VSX                     9
-#define BOOK3S_IRQPRIO_SYSCALL                 10
-#define BOOK3S_IRQPRIO_MACHINE_CHECK           11
-#define BOOK3S_IRQPRIO_DEBUG                   12
-#define BOOK3S_IRQPRIO_EXTERNAL                        13
-#define BOOK3S_IRQPRIO_DECREMENTER             14
-#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR     15
-#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL          16
-#define BOOK3S_IRQPRIO_MAX                     17
+#define BOOK3S_IRQPRIO_FAC_UNAVAIL             10
+#define BOOK3S_IRQPRIO_SYSCALL                 11
+#define BOOK3S_IRQPRIO_MACHINE_CHECK           12
+#define BOOK3S_IRQPRIO_DEBUG                   13
+#define BOOK3S_IRQPRIO_EXTERNAL                        14
+#define BOOK3S_IRQPRIO_DECREMENTER             15
+#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR     16
+#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL          17
+#define BOOK3S_IRQPRIO_MAX                     18
 
 #define BOOK3S_HFLAG_DCBZ32                    0x1
 #define BOOK3S_HFLAG_SLB                       0x2
index bb1e38a23ac76753cb24162b07a7c03a517f7126..f52f65694527ea7b42ceb1df485039fc9abba623 100644 (file)
@@ -268,9 +268,10 @@ static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
        return vcpu->arch.pc;
 }
 
+static inline u64 kvmppc_get_msr(struct kvm_vcpu *vcpu);
 static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu)
 {
-       return (vcpu->arch.shared->msr & MSR_LE) != (MSR_KERNEL & MSR_LE);
+       return (kvmppc_get_msr(vcpu) & MSR_LE) != (MSR_KERNEL & MSR_LE);
 }
 
 static inline u32 kvmppc_get_last_inst_internal(struct kvm_vcpu *vcpu, ulong pc)
index 51388befeddb37d454e33ab10cb196e2930f93fb..fddb72b48ce9bd2a9a4e25460c2ff5cc4bc15ad5 100644 (file)
@@ -77,34 +77,122 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
        return old == 0;
 }
 
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+       int i, shift;
+       unsigned int mask;
+
+       /* start from 1 ignoring MMU_PAGE_4K */
+       for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+               /* invalid penc */
+               if (mmu_psize_defs[psize].penc[i] == -1)
+                       continue;
+               /*
+                * encoding bits per actual page size
+                *        PTE LP     actual page size
+                *    rrrr rrrz         >=8KB
+                *    rrrr rrzz         >=16KB
+                *    rrrr rzzz         >=32KB
+                *    rrrr zzzz         >=64KB
+                * .......
+                */
+               shift = mmu_psize_defs[i].shift - LP_SHIFT;
+               if (shift > LP_BITS)
+                       shift = LP_BITS;
+               mask = (1 << shift) - 1;
+               if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+                       return i;
+       }
+       return -1;
+}
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
                                             unsigned long pte_index)
 {
-       unsigned long rb, va_low;
+       int b_psize, a_psize;
+       unsigned int penc;
+       unsigned long rb = 0, va_low, sllp;
+       unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+       if (!(v & HPTE_V_LARGE)) {
+               /* both base and actual psize is 4k */
+               b_psize = MMU_PAGE_4K;
+               a_psize = MMU_PAGE_4K;
+       } else {
+               for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) {
+
+                       /* valid entries have a shift value */
+                       if (!mmu_psize_defs[b_psize].shift)
+                               continue;
 
+                       a_psize = __hpte_actual_psize(lp, b_psize);
+                       if (a_psize != -1)
+                               break;
+               }
+       }
+       /*
+        * Ignore the top 14 bits of va
+        * v have top two bits covering segment size, hence move
+        * by 16 bits, Also clear the lower HPTE_V_AVPN_SHIFT (7) bits.
+        * AVA field in v also have the lower 23 bits ignored.
+        * For base page size 4K we need 14 .. 65 bits (so need to
+        * collect extra 11 bits)
+        * For others we need 14..14+i
+        */
+       /* This covers 14..54 bits of va*/
        rb = (v & ~0x7fUL) << 16;               /* AVA field */
+       /*
+        * AVA in v had cleared lower 23 bits. We need to derive
+        * that from pteg index
+        */
        va_low = pte_index >> 3;
        if (v & HPTE_V_SECONDARY)
                va_low = ~va_low;
-       /* xor vsid from AVA */
+       /*
+        * get the vpn bits from va_low using reverse of hashing.
+        * In v we have va with 23 bits dropped and then left shifted
+        * HPTE_V_AVPN_SHIFT (7) bits. Now to find vsid we need
+        * right shift it with (SID_SHIFT - (23 - 7))
+        */
        if (!(v & HPTE_V_1TB_SEG))
-               va_low ^= v >> 12;
+               va_low ^= v >> (SID_SHIFT - 16);
        else
-               va_low ^= v >> 24;
+               va_low ^= v >> (SID_SHIFT_1T - 16);
        va_low &= 0x7ff;
-       if (v & HPTE_V_LARGE) {
-               rb |= 1;                        /* L field */
-               if (cpu_has_feature(CPU_FTR_ARCH_206) &&
-                   (r & 0xff000)) {
-                       /* non-16MB large page, must be 64k */
-                       /* (masks depend on page size) */
-                       rb |= 0x1000;           /* page encoding in LP field */
-                       rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-                       rb |= ((va_low << 4) & 0xf0);   /* AVAL field (P7 doesn't seem to care) */
-               }
-       } else {
-               /* 4kB page */
-               rb |= (va_low & 0x7ff) << 12;   /* remaining 11b of VA */
+
+       switch (b_psize) {
+       case MMU_PAGE_4K:
+               sllp = ((mmu_psize_defs[a_psize].sllp & SLB_VSID_L) >> 6) |
+                       ((mmu_psize_defs[a_psize].sllp & SLB_VSID_LP) >> 4);
+               rb |= sllp << 5;        /*  AP field */
+               rb |= (va_low & 0x7ff) << 12;   /* remaining 11 bits of AVA */
+               break;
+       default:
+       {
+               int aval_shift;
+               /*
+                * remaining 7bits of AVA/LP fields
+                * Also contain the rr bits of LP
+                */
+               rb |= (va_low & 0x7f) << 16;
+               /*
+                * Now clear not needed LP bits based on actual psize
+                */
+               rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1);
+               /*
+                * AVAL field 58..77 - base_page_shift bits of va
+                * we have space for 58..64 bits, Missing bits should
+                * be zero filled. +1 is to take care of L bit shift
+                */
+               aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1;
+               rb |= ((va_low << aval_shift) & 0xfe);
+
+               rb |= 1;                /* L field */
+               penc = mmu_psize_defs[b_psize].penc[a_psize];
+               rb |= penc << 12;       /* LP field */
+               break;
+       }
        }
        rb |= (v >> 54) & 0x300;                /* B field */
        return rb;
@@ -112,14 +200,26 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 
 static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
 {
+       int size, a_psize;
+       /* Look at the 8 bit LP value */
+       unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
        /* only handle 4k, 64k and 16M pages for now */
        if (!(h & HPTE_V_LARGE))
-               return 1ul << 12;               /* 4k page */
-       if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
-               return 1ul << 16;               /* 64k page */
-       if ((l & 0xff000) == 0)
-               return 1ul << 24;               /* 16M page */
-       return 0;                               /* error */
+               return 1ul << 12;
+       else {
+               for (size = 0; size < MMU_PAGE_COUNT; size++) {
+                       /* valid entries have a shift value */
+                       if (!mmu_psize_defs[size].shift)
+                               continue;
+
+                       a_psize = __hpte_actual_psize(lp, size);
+                       if (a_psize != -1)
+                               return 1ul << mmu_psize_defs[a_psize].shift;
+               }
+
+       }
+       return 0;
 }
 
 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
index 821725c1bf4624d218cd54c290eb1ed411fe1f1a..5bdfb5dd34002348578b8b9c0a6164a04b3d4820 100644 (file)
@@ -104,6 +104,7 @@ struct kvmppc_host_state {
 #ifdef CONFIG_PPC_BOOK3S_64
        u64 cfar;
        u64 ppr;
+       u64 host_fscr;
 #endif
 };
 
@@ -133,6 +134,7 @@ struct kvmppc_book3s_shadow_vcpu {
                u64     esid;
                u64     vsid;
        } slb[64];                      /* guest SLB */
+       u64 shadow_fscr;
 #endif
 };
 
index 80d46b5a7efb49cc6de436e45da6b0b5e690d9cf..c7aed6105ff98901fba31e5771ae9b6257107ee6 100644 (file)
@@ -108,9 +108,4 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.fault_dear;
 }
-
-static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.shared->msr;
-}
 #endif /* __ASM_KVM_BOOKE_H__ */
index 1eaea2dea1745e6310eb3918f2a99c0e2d8bd767..bb66d8b8efdf073fb2a54c39b9d1dd984be713cd 100644 (file)
@@ -449,7 +449,9 @@ struct kvm_vcpu_arch {
        ulong pc;
        ulong ctr;
        ulong lr;
+#ifdef CONFIG_PPC_BOOK3S
        ulong tar;
+#endif
 
        ulong xer;
        u32 cr;
@@ -475,6 +477,7 @@ struct kvm_vcpu_arch {
        ulong ppr;
        ulong pspb;
        ulong fscr;
+       ulong shadow_fscr;
        ulong ebbhr;
        ulong ebbrr;
        ulong bescr;
@@ -562,6 +565,7 @@ struct kvm_vcpu_arch {
 #ifdef CONFIG_PPC_BOOK3S
        ulong fault_dar;
        u32 fault_dsisr;
+       unsigned long intr_msr;
 #endif
 
 #ifdef CONFIG_BOOKE
@@ -622,8 +626,12 @@ struct kvm_vcpu_arch {
        wait_queue_head_t cpu_run;
 
        struct kvm_vcpu_arch_shared *shared;
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+       bool shared_big_endian;
+#endif
        unsigned long magic_page_pa; /* phys addr to map the magic page to */
        unsigned long magic_page_ea; /* effect. addr to map the magic page to */
+       bool disable_kernel_nx;
 
        int irq_type;           /* one of KVM_IRQ_* */
        int irq_cpu_id;
@@ -654,7 +662,6 @@ struct kvm_vcpu_arch {
        spinlock_t tbacct_lock;
        u64 busy_stolen;
        u64 busy_preempt;
-       unsigned long intr_msr;
 #endif
 };
 
index 4096f16502a9becab431cb8e3f7c9f51b6989b4f..4a7cc453be0b0b8193a52a9900d71eea3d2d27fc 100644 (file)
@@ -448,6 +448,84 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
        }
 }
 
+/*
+ * Shared struct helpers. The shared struct can be little or big endian,
+ * depending on the guest endianness. So expose helpers to all of them.
+ */
+static inline bool kvmppc_shared_big_endian(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+       /* Only Book3S_64 PR supports bi-endian for now */
+       return vcpu->arch.shared_big_endian;
+#elif defined(CONFIG_PPC_BOOK3S_64) && defined(__LITTLE_ENDIAN__)
+       /* Book3s_64 HV on little endian is always little endian */
+       return false;
+#else
+       return true;
+#endif
+}
+
+#define SHARED_WRAPPER_GET(reg, size)                                  \
+static inline u##size kvmppc_get_##reg(struct kvm_vcpu *vcpu)  \
+{                                                                      \
+       if (kvmppc_shared_big_endian(vcpu))                             \
+              return be##size##_to_cpu(vcpu->arch.shared->reg);        \
+       else                                                            \
+              return le##size##_to_cpu(vcpu->arch.shared->reg);        \
+}                                                                      \
+
+#define SHARED_WRAPPER_SET(reg, size)                                  \
+static inline void kvmppc_set_##reg(struct kvm_vcpu *vcpu, u##size val)        \
+{                                                                      \
+       if (kvmppc_shared_big_endian(vcpu))                             \
+              vcpu->arch.shared->reg = cpu_to_be##size(val);           \
+       else                                                            \
+              vcpu->arch.shared->reg = cpu_to_le##size(val);           \
+}                                                                      \
+
+#define SHARED_WRAPPER(reg, size)                                      \
+       SHARED_WRAPPER_GET(reg, size)                                   \
+       SHARED_WRAPPER_SET(reg, size)                                   \
+
+SHARED_WRAPPER(critical, 64)
+SHARED_WRAPPER(sprg0, 64)
+SHARED_WRAPPER(sprg1, 64)
+SHARED_WRAPPER(sprg2, 64)
+SHARED_WRAPPER(sprg3, 64)
+SHARED_WRAPPER(srr0, 64)
+SHARED_WRAPPER(srr1, 64)
+SHARED_WRAPPER(dar, 64)
+SHARED_WRAPPER_GET(msr, 64)
+static inline void kvmppc_set_msr_fast(struct kvm_vcpu *vcpu, u64 val)
+{
+       if (kvmppc_shared_big_endian(vcpu))
+              vcpu->arch.shared->msr = cpu_to_be64(val);
+       else
+              vcpu->arch.shared->msr = cpu_to_le64(val);
+}
+SHARED_WRAPPER(dsisr, 32)
+SHARED_WRAPPER(int_pending, 32)
+SHARED_WRAPPER(sprg4, 64)
+SHARED_WRAPPER(sprg5, 64)
+SHARED_WRAPPER(sprg6, 64)
+SHARED_WRAPPER(sprg7, 64)
+
+static inline u32 kvmppc_get_sr(struct kvm_vcpu *vcpu, int nr)
+{
+       if (kvmppc_shared_big_endian(vcpu))
+              return be32_to_cpu(vcpu->arch.shared->sr[nr]);
+       else
+              return le32_to_cpu(vcpu->arch.shared->sr[nr]);
+}
+
+static inline void kvmppc_set_sr(struct kvm_vcpu *vcpu, int nr, u32 val)
+{
+       if (kvmppc_shared_big_endian(vcpu))
+              vcpu->arch.shared->sr[nr] = cpu_to_be32(val);
+       else
+              vcpu->arch.shared->sr[nr] = cpu_to_le32(val);
+}
+
 /*
  * Please call after prepare_to_enter. This function puts the lazy ee and irq
  * disabled tracking state back to normal mode, without actually enabling
@@ -485,7 +563,7 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
        msr_64bit = MSR_SF;
 #endif
 
-       if (!(vcpu->arch.shared->msr & msr_64bit))
+       if (!(kvmppc_get_msr(vcpu) & msr_64bit))
                ea = (uint32_t)ea;
 
        return ea;
index e5d2e0bc7e032b64890701cdc91d88e18cd665c6..4852bcf270f37a30b68519e8b0ce3b52c5fe7d8c 100644 (file)
 #define   MMCR0_PROBLEM_DISABLE MMCR0_FCP
 #define   MMCR0_FCM1   0x10000000UL /* freeze counters while MSR mark = 1 */
 #define   MMCR0_FCM0   0x08000000UL /* freeze counters while MSR mark = 0 */
-#define   MMCR0_PMXE   0x04000000UL /* performance monitor exception enable */
-#define   MMCR0_FCECE  0x02000000UL /* freeze ctrs on enabled cond or event */
+#define   MMCR0_PMXE   ASM_CONST(0x04000000) /* perf mon exception enable */
+#define   MMCR0_FCECE  ASM_CONST(0x02000000) /* freeze ctrs on enabled cond or event */
 #define   MMCR0_TBEE   0x00400000UL /* time base exception enable */
 #define   MMCR0_BHRBA  0x00200000UL /* BHRB Access allowed in userspace */
 #define   MMCR0_EBE    0x00100000UL /* Event based branch enable */
 #define   MMCR0_PMCC   0x000c0000UL /* PMC control */
 #define   MMCR0_PMCC_U6        0x00080000UL /* PMC1-6 are R/W by user (PR) */
 #define   MMCR0_PMC1CE 0x00008000UL /* PMC1 count enable*/
-#define   MMCR0_PMCjCE 0x00004000UL /* PMCj count enable*/
+#define   MMCR0_PMCjCE ASM_CONST(0x00004000) /* PMCj count enable*/
 #define   MMCR0_TRIGGER        0x00002000UL /* TRIGGER enable */
-#define   MMCR0_PMAO_SYNC 0x00000800UL /* PMU interrupt is synchronous */
-#define   MMCR0_PMAO   0x00000080UL /* performance monitor alert has occurred, set to 0 after handling exception */
+#define   MMCR0_PMAO_SYNC ASM_CONST(0x00000800) /* PMU intr is synchronous */
+#define   MMCR0_C56RUN ASM_CONST(0x00000100) /* PMC5/6 count when RUN=0 */
+/* performance monitor alert has occurred, set to 0 after handling exception */
+#define   MMCR0_PMAO   ASM_CONST(0x00000080)
 #define   MMCR0_SHRFC  0x00000040UL /* SHRre freeze conditions between threads */
 #define   MMCR0_FC56   0x00000010UL /* freeze counters 5 and 6 */
 #define   MMCR0_FCTI   0x00000008UL /* freeze counters in tags inactive mode */
index 163c3b05a76e9d42f1a2067f510fd44cc7232dde..464f1089b532ab2dc0d3624330c20dd844b1270d 100644 (file)
 
 /* Bit definitions for L1CSR0. */
 #define L1CSR0_CPE     0x00010000      /* Data Cache Parity Enable */
+#define L1CSR0_CUL     0x00000400      /* Data Cache Unable to Lock */
 #define L1CSR0_CLFC    0x00000100      /* Cache Lock Bits Flash Clear */
 #define L1CSR0_DCFI    0x00000002      /* Data Cache Flash Invalidate */
 #define L1CSR0_CFI     0x00000002      /* Cache Flash Invalidate */
index a6665be4f3ab81dfd35ab1f08b8f6a3cce6a8493..2bc4a9409a934e4e7416e5a058191eca9dfe2e72 100644 (file)
@@ -545,7 +545,6 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_TCSCR      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb1)
 #define KVM_REG_PPC_PID                (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb2)
 #define KVM_REG_PPC_ACOP       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb3)
-#define KVM_REG_PPC_WORT       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb4)
 
 #define KVM_REG_PPC_VRSAVE     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb4)
 #define KVM_REG_PPC_LPCR       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb5)
@@ -555,6 +554,7 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_ARCH_COMPAT        (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb7)
 
 #define KVM_REG_PPC_DABRX      (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb8)
+#define KVM_REG_PPC_WORT       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb9)
 
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
index e3af3286a06801643a760fcbfa2afd116ad01053..91e42f09b323fac3199386eb58659f00e4fcdc7f 100644 (file)
@@ -82,10 +82,16 @@ struct kvm_vcpu_arch_shared {
 
 #define KVM_FEATURE_MAGIC_PAGE 1
 
+/* Magic page flags from host to guest */
+
 #define KVM_MAGIC_FEAT_SR              (1 << 0)
 
 /* MASn, ESR, PIR, and high SPRGs */
 #define KVM_MAGIC_FEAT_MAS0_TO_SPRG7   (1 << 1)
 
+/* Magic page flags from guest to host */
+
+#define MAGIC_PAGE_FLAG_NOT_MAPPED_NX  (1 << 0)
+
 
 #endif /* _UAPI__POWERPC_KVM_PARA_H__ */
index 94908af308d80423dd3cf18455887aadb5d9b7bf..34f55524d4564bd83db2dce7b8455080c952987d 100644 (file)
 #include <asm/cputable.h>
 #include <asm/emulated_ops.h>
 #include <asm/switch_to.h>
+#include <asm/disassemble.h>
 
 struct aligninfo {
        unsigned char len;
        unsigned char flags;
 };
 
-#define IS_XFORM(inst) (((inst) >> 26) == 31)
-#define IS_DSFORM(inst)        (((inst) >> 26) >= 56)
 
 #define INVALID        { 0, 0 }
 
@@ -191,37 +190,6 @@ static struct aligninfo aligninfo[128] = {
        INVALID,                /* 11 1 1111 */
 };
 
-/*
- * Create a DSISR value from the instruction
- */
-static inline unsigned make_dsisr(unsigned instr)
-{
-       unsigned dsisr;
-
-
-       /* bits  6:15 --> 22:31 */
-       dsisr = (instr & 0x03ff0000) >> 16;
-
-       if (IS_XFORM(instr)) {
-               /* bits 29:30 --> 15:16 */
-               dsisr |= (instr & 0x00000006) << 14;
-               /* bit     25 -->    17 */
-               dsisr |= (instr & 0x00000040) << 8;
-               /* bits 21:24 --> 18:21 */
-               dsisr |= (instr & 0x00000780) << 3;
-       } else {
-               /* bit      5 -->    17 */
-               dsisr |= (instr & 0x04000000) >> 12;
-               /* bits  1: 4 --> 18:21 */
-               dsisr |= (instr & 0x78000000) >> 17;
-               /* bits 30:31 --> 12:13 */
-               if (IS_DSFORM(instr))
-                       dsisr |= (instr & 0x00000003) << 18;
-       }
-
-       return dsisr;
-}
-
 /*
  * The dcbz (data cache block zero) instruction
  * gives an alignment fault if used on non-cacheable
index dba8140ebc20b8d425bafabe5a0edf57bea2e1bb..93e1465c849681729b0b158097562246cf8cd685 100644 (file)
@@ -54,6 +54,7 @@
 #endif
 #if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S)
 #include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
 #endif
 
 #ifdef CONFIG_PPC32
@@ -445,7 +446,9 @@ int main(void)
        DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
        DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
        DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+#ifdef CONFIG_PPC_BOOK3S
        DEFINE(VCPU_TAR, offsetof(struct kvm_vcpu, arch.tar));
+#endif
        DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
        DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -467,6 +470,9 @@ int main(void)
        DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
        DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
        DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+       DEFINE(VCPU_SHAREDBE, offsetof(struct kvm_vcpu, arch.shared_big_endian));
+#endif
 
        DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0));
        DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1));
@@ -493,7 +499,6 @@ int main(void)
        DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
        DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
        DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
-       DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
        DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -528,11 +533,13 @@ int main(void)
        DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
        DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
        DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+       DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
        DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
        DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
        DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
        DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
        DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr));
+       DEFINE(VCPU_SHADOW_FSCR, offsetof(struct kvm_vcpu, arch.shadow_fscr));
        DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb));
        DEFINE(VCPU_EBBHR, offsetof(struct kvm_vcpu, arch.ebbhr));
        DEFINE(VCPU_EBBRR, offsetof(struct kvm_vcpu, arch.ebbrr));
@@ -614,6 +621,7 @@ int main(void)
 #ifdef CONFIG_PPC64
        SVCPU_FIELD(SVCPU_SLB, slb);
        SVCPU_FIELD(SVCPU_SLB_MAX, slb_max);
+       SVCPU_FIELD(SVCPU_SHADOW_FSCR, shadow_fscr);
 #endif
 
        HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
@@ -649,6 +657,7 @@ int main(void)
 #ifdef CONFIG_PPC_BOOK3S_64
        HSTATE_FIELD(HSTATE_CFAR, cfar);
        HSTATE_FIELD(HSTATE_PPR, ppr);
+       HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
index 7898be90f2dcd013ecec5d17dcb4897e9a1b62f7..d9b79358b833469be1c25d6a55cc4043877341d5 100644 (file)
@@ -47,9 +47,10 @@ static int __init early_init_dt_scan_epapr(unsigned long node,
                return -1;
 
        for (i = 0; i < (len / 4); i++) {
-               patch_instruction(epapr_hypercall_start + i, insts[i]);
+               u32 inst = be32_to_cpu(insts[i]);
+               patch_instruction(epapr_hypercall_start + i, inst);
 #if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
-               patch_instruction(epapr_ev_idle_start + i, insts[i]);
+               patch_instruction(epapr_ev_idle_start + i, inst);
 #endif
        }
 
index dd8695f6cb6d2c6de9bca80b073f4f40013cdeef..33aa4ddf597dc7f1a50ab6631b598935a6b86c31 100644 (file)
@@ -417,7 +417,7 @@ static void kvm_map_magic_page(void *data)
        ulong out[8];
 
        in[0] = KVM_MAGIC_PAGE;
-       in[1] = KVM_MAGIC_PAGE;
+       in[1] = KVM_MAGIC_PAGE | MAGIC_PAGE_FLAG_NOT_MAPPED_NX;
 
        epapr_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
 
index ad302f845e5d5a0a358b77381137f52f0aabc295..d6e195e8cd4c4e26953d929d8cc77ae7ff9cee9d 100644 (file)
@@ -98,6 +98,9 @@ static inline void free_lppacas(void) { }
 /*
  * 3 persistent SLBs are registered here.  The buffer will be zero
  * initially, hence will all be invaild until we actually write them.
+ *
+ * If you make the number of persistent SLB entries dynamic, please also
+ * update PR KVM to flush and restore them accordingly.
  */
 static struct slb_shadow *slb_shadow;
 
index 141b2027189a8add1c90dba3278c6727f7406798..d6a53b95de94a2afe544293e6f3a12ef37acee03 100644 (file)
@@ -6,7 +6,6 @@ source "virt/kvm/Kconfig"
 
 menuconfig VIRTUALIZATION
        bool "Virtualization"
-       depends on !CPU_LITTLE_ENDIAN
        ---help---
          Say Y here to get to see options for using your Linux host to run
          other operating systems inside virtual machines (guests).
@@ -76,6 +75,7 @@ config KVM_BOOK3S_64
 config KVM_BOOK3S_64_HV
        tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host"
        depends on KVM_BOOK3S_64
+       depends on !CPU_LITTLE_ENDIAN
        select KVM_BOOK3S_HV_POSSIBLE
        select MMU_NOTIFIER
        select CMA
index 7af190a266b388167693163f35eb5d249eee5d55..c254c27f240e11d2d245cf938a5f104c61b3d2f4 100644 (file)
@@ -85,9 +85,9 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
        if (is_kvmppc_hv_enabled(vcpu->kvm))
                return;
        if (pending_now)
-               vcpu->arch.shared->int_pending = 1;
+               kvmppc_set_int_pending(vcpu, 1);
        else if (old_pending)
-               vcpu->arch.shared->int_pending = 0;
+               kvmppc_set_int_pending(vcpu, 0);
 }
 
 static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
@@ -99,11 +99,11 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
        if (is_kvmppc_hv_enabled(vcpu->kvm))
                return false;
 
-       crit_raw = vcpu->arch.shared->critical;
+       crit_raw = kvmppc_get_critical(vcpu);
        crit_r1 = kvmppc_get_gpr(vcpu, 1);
 
        /* Truncate crit indicators in 32 bit mode */
-       if (!(vcpu->arch.shared->msr & MSR_SF)) {
+       if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
                crit_raw &= 0xffffffff;
                crit_r1 &= 0xffffffff;
        }
@@ -111,15 +111,15 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
        /* Critical section when crit == r1 */
        crit = (crit_raw == crit_r1);
        /* ... and we're in supervisor mode */
-       crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+       crit = crit && !(kvmppc_get_msr(vcpu) & MSR_PR);
 
        return crit;
 }
 
 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
-       vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
-       vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
+       kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
+       kvmppc_set_srr1(vcpu, kvmppc_get_msr(vcpu) | flags);
        kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);
        vcpu->arch.mmu.reset_msr(vcpu);
 }
@@ -145,6 +145,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
        case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG;                break;
        case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC;              break;
        case 0xf40: prio = BOOK3S_IRQPRIO_VSX;                  break;
+       case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL;          break;
        default:    prio = BOOK3S_IRQPRIO_MAX;                  break;
        }
 
@@ -225,12 +226,12 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 
        switch (priority) {
        case BOOK3S_IRQPRIO_DECREMENTER:
-               deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
+               deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
                vec = BOOK3S_INTERRUPT_DECREMENTER;
                break;
        case BOOK3S_IRQPRIO_EXTERNAL:
        case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
-               deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
+               deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
                vec = BOOK3S_INTERRUPT_EXTERNAL;
                break;
        case BOOK3S_IRQPRIO_SYSTEM_RESET:
@@ -275,6 +276,9 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
        case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR:
                vec = BOOK3S_INTERRUPT_PERFMON;
                break;
+       case BOOK3S_IRQPRIO_FAC_UNAVAIL:
+               vec = BOOK3S_INTERRUPT_FAC_UNAVAIL;
+               break;
        default:
                deliver = 0;
                printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority);
@@ -343,7 +347,7 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing,
 {
        ulong mp_pa = vcpu->arch.magic_page_pa;
 
-       if (!(vcpu->arch.shared->msr & MSR_SF))
+       if (!(kvmppc_get_msr(vcpu) & MSR_SF))
                mp_pa = (uint32_t)mp_pa;
 
        /* Magic page override */
@@ -367,7 +371,7 @@ EXPORT_SYMBOL_GPL(kvmppc_gfn_to_pfn);
 static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
                        bool iswrite, struct kvmppc_pte *pte)
 {
-       int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR));
+       int relocated = (kvmppc_get_msr(vcpu) & (data ? MSR_DR : MSR_IR));
        int r;
 
        if (relocated) {
@@ -498,18 +502,18 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        regs->ctr = kvmppc_get_ctr(vcpu);
        regs->lr = kvmppc_get_lr(vcpu);
        regs->xer = kvmppc_get_xer(vcpu);
-       regs->msr = vcpu->arch.shared->msr;
-       regs->srr0 = vcpu->arch.shared->srr0;
-       regs->srr1 = vcpu->arch.shared->srr1;
+       regs->msr = kvmppc_get_msr(vcpu);
+       regs->srr0 = kvmppc_get_srr0(vcpu);
+       regs->srr1 = kvmppc_get_srr1(vcpu);
        regs->pid = vcpu->arch.pid;
-       regs->sprg0 = vcpu->arch.shared->sprg0;
-       regs->sprg1 = vcpu->arch.shared->sprg1;
-       regs->sprg2 = vcpu->arch.shared->sprg2;
-       regs->sprg3 = vcpu->arch.shared->sprg3;
-       regs->sprg4 = vcpu->arch.shared->sprg4;
-       regs->sprg5 = vcpu->arch.shared->sprg5;
-       regs->sprg6 = vcpu->arch.shared->sprg6;
-       regs->sprg7 = vcpu->arch.shared->sprg7;
+       regs->sprg0 = kvmppc_get_sprg0(vcpu);
+       regs->sprg1 = kvmppc_get_sprg1(vcpu);
+       regs->sprg2 = kvmppc_get_sprg2(vcpu);
+       regs->sprg3 = kvmppc_get_sprg3(vcpu);
+       regs->sprg4 = kvmppc_get_sprg4(vcpu);
+       regs->sprg5 = kvmppc_get_sprg5(vcpu);
+       regs->sprg6 = kvmppc_get_sprg6(vcpu);
+       regs->sprg7 = kvmppc_get_sprg7(vcpu);
 
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@@ -527,16 +531,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        kvmppc_set_lr(vcpu, regs->lr);
        kvmppc_set_xer(vcpu, regs->xer);
        kvmppc_set_msr(vcpu, regs->msr);
-       vcpu->arch.shared->srr0 = regs->srr0;
-       vcpu->arch.shared->srr1 = regs->srr1;
-       vcpu->arch.shared->sprg0 = regs->sprg0;
-       vcpu->arch.shared->sprg1 = regs->sprg1;
-       vcpu->arch.shared->sprg2 = regs->sprg2;
-       vcpu->arch.shared->sprg3 = regs->sprg3;
-       vcpu->arch.shared->sprg4 = regs->sprg4;
-       vcpu->arch.shared->sprg5 = regs->sprg5;
-       vcpu->arch.shared->sprg6 = regs->sprg6;
-       vcpu->arch.shared->sprg7 = regs->sprg7;
+       kvmppc_set_srr0(vcpu, regs->srr0);
+       kvmppc_set_srr1(vcpu, regs->srr1);
+       kvmppc_set_sprg0(vcpu, regs->sprg0);
+       kvmppc_set_sprg1(vcpu, regs->sprg1);
+       kvmppc_set_sprg2(vcpu, regs->sprg2);
+       kvmppc_set_sprg3(vcpu, regs->sprg3);
+       kvmppc_set_sprg4(vcpu, regs->sprg4);
+       kvmppc_set_sprg5(vcpu, regs->sprg5);
+       kvmppc_set_sprg6(vcpu, regs->sprg6);
+       kvmppc_set_sprg7(vcpu, regs->sprg7);
 
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
@@ -570,10 +574,10 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                r = 0;
                switch (reg->id) {
                case KVM_REG_PPC_DAR:
-                       val = get_reg_val(reg->id, vcpu->arch.shared->dar);
+                       val = get_reg_val(reg->id, kvmppc_get_dar(vcpu));
                        break;
                case KVM_REG_PPC_DSISR:
-                       val = get_reg_val(reg->id, vcpu->arch.shared->dsisr);
+                       val = get_reg_val(reg->id, kvmppc_get_dsisr(vcpu));
                        break;
                case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
                        i = reg->id - KVM_REG_PPC_FPR0;
@@ -627,6 +631,21 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                        val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu));
                        break;
 #endif /* CONFIG_KVM_XICS */
+               case KVM_REG_PPC_FSCR:
+                       val = get_reg_val(reg->id, vcpu->arch.fscr);
+                       break;
+               case KVM_REG_PPC_TAR:
+                       val = get_reg_val(reg->id, vcpu->arch.tar);
+                       break;
+               case KVM_REG_PPC_EBBHR:
+                       val = get_reg_val(reg->id, vcpu->arch.ebbhr);
+                       break;
+               case KVM_REG_PPC_EBBRR:
+                       val = get_reg_val(reg->id, vcpu->arch.ebbrr);
+                       break;
+               case KVM_REG_PPC_BESCR:
+                       val = get_reg_val(reg->id, vcpu->arch.bescr);
+                       break;
                default:
                        r = -EINVAL;
                        break;
@@ -660,10 +679,10 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                r = 0;
                switch (reg->id) {
                case KVM_REG_PPC_DAR:
-                       vcpu->arch.shared->dar = set_reg_val(reg->id, val);
+                       kvmppc_set_dar(vcpu, set_reg_val(reg->id, val));
                        break;
                case KVM_REG_PPC_DSISR:
-                       vcpu->arch.shared->dsisr = set_reg_val(reg->id, val);
+                       kvmppc_set_dsisr(vcpu, set_reg_val(reg->id, val));
                        break;
                case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
                        i = reg->id - KVM_REG_PPC_FPR0;
@@ -716,6 +735,21 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                                                set_reg_val(reg->id, val));
                        break;
 #endif /* CONFIG_KVM_XICS */
+               case KVM_REG_PPC_FSCR:
+                       vcpu->arch.fscr = set_reg_val(reg->id, val);
+                       break;
+               case KVM_REG_PPC_TAR:
+                       vcpu->arch.tar = set_reg_val(reg->id, val);
+                       break;
+               case KVM_REG_PPC_EBBHR:
+                       vcpu->arch.ebbhr = set_reg_val(reg->id, val);
+                       break;
+               case KVM_REG_PPC_EBBRR:
+                       vcpu->arch.ebbrr = set_reg_val(reg->id, val);
+                       break;
+               case KVM_REG_PPC_BESCR:
+                       vcpu->arch.bescr = set_reg_val(reg->id, val);
+                       break;
                default:
                        r = -EINVAL;
                        break;
index 76a64ce6a5b6c641207cfa40b6dd421299f788a0..93503bbdae43df28efe2b0ab90f52352e0600898 100644 (file)
@@ -91,7 +91,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 
 static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-       return vcpu->arch.shared->sr[(eaddr >> 28) & 0xf];
+       return kvmppc_get_sr(vcpu, (eaddr >> 28) & 0xf);
 }
 
 static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
@@ -131,7 +131,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu,
        pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash;
 
        dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n",
-               kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg,
+               kvmppc_get_pc(vcpu), eaddr, vcpu_book3s->sdr1, pteg,
                sr_vsid(sre));
 
        r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT);
@@ -160,7 +160,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
                else
                        bat = &vcpu_book3s->ibat[i];
 
-               if (vcpu->arch.shared->msr & MSR_PR) {
+               if (kvmppc_get_msr(vcpu) & MSR_PR) {
                        if (!bat->vp)
                                continue;
                } else {
@@ -208,6 +208,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
        u32 sre;
        hva_t ptegp;
        u32 pteg[16];
+       u32 pte0, pte1;
        u32 ptem = 0;
        int i;
        int found = 0;
@@ -233,14 +234,16 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
        }
 
        for (i=0; i<16; i+=2) {
-               if (ptem == pteg[i]) {
+               pte0 = be32_to_cpu(pteg[i]);
+               pte1 = be32_to_cpu(pteg[i + 1]);
+               if (ptem == pte0) {
                        u8 pp;
 
-                       pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF);
-                       pp = pteg[i+1] & 3;
+                       pte->raddr = (pte1 & ~(0xFFFULL)) | (eaddr & 0xFFF);
+                       pp = pte1 & 3;
 
-                       if ((sr_kp(sre) &&  (vcpu->arch.shared->msr & MSR_PR)) ||
-                           (sr_ks(sre) && !(vcpu->arch.shared->msr & MSR_PR)))
+                       if ((sr_kp(sre) &&  (kvmppc_get_msr(vcpu) & MSR_PR)) ||
+                           (sr_ks(sre) && !(kvmppc_get_msr(vcpu) & MSR_PR)))
                                pp |= 4;
 
                        pte->may_write = false;
@@ -260,7 +263,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
                        }
 
                        dprintk_pte("MMU: Found PTE -> %x %x - %x\n",
-                                   pteg[i], pteg[i+1], pp);
+                                   pte0, pte1, pp);
                        found = 1;
                        break;
                }
@@ -269,8 +272,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
        /* Update PTE C and A bits, so the guest's swapper knows we used the
           page */
        if (found) {
-               u32 pte_r = pteg[i+1];
-               char __user *addr = (char __user *) &pteg[i+1];
+               u32 pte_r = pte1;
+               char __user *addr = (char __user *) (ptegp + (i+1) * sizeof(u32));
 
                /*
                 * Use single-byte writes to update the HPTE, to
@@ -296,7 +299,8 @@ no_page_found:
                            to_book3s(vcpu)->sdr1, ptegp);
                for (i=0; i<16; i+=2) {
                        dprintk_pte("   %02d: 0x%x - 0x%x (0x%x)\n",
-                                   i, pteg[i], pteg[i+1], ptem);
+                                   i, be32_to_cpu(pteg[i]),
+                                   be32_to_cpu(pteg[i+1]), ptem);
                }
        }
 
@@ -316,7 +320,7 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
        /* Magic page override */
        if (unlikely(mp_ea) &&
            unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
-           !(vcpu->arch.shared->msr & MSR_PR)) {
+           !(kvmppc_get_msr(vcpu) & MSR_PR)) {
                pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
                pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff);
                pte->raddr &= KVM_PAM;
@@ -341,13 +345,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum)
 {
-       return vcpu->arch.shared->sr[srnum];
+       return kvmppc_get_sr(vcpu, srnum);
 }
 
 static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
                                        ulong value)
 {
-       vcpu->arch.shared->sr[srnum] = value;
+       kvmppc_set_sr(vcpu, srnum, value);
        kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT);
 }
 
@@ -367,8 +371,9 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
        ulong ea = esid << SID_SHIFT;
        u32 sr;
        u64 gvsid = esid;
+       u64 msr = kvmppc_get_msr(vcpu);
 
-       if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
+       if (msr & (MSR_DR|MSR_IR)) {
                sr = find_sr(vcpu, ea);
                if (sr_valid(sr))
                        gvsid = sr_vsid(sr);
@@ -377,7 +382,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
        /* In case we only have one of MSR_IR or MSR_DR set, let's put
           that in the real-mode context (and hope RM doesn't access
           high memory) */
-       switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
+       switch (msr & (MSR_DR|MSR_IR)) {
        case 0:
                *vsid = VSID_REAL | esid;
                break;
@@ -397,7 +402,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
                BUG();
        }
 
-       if (vcpu->arch.shared->msr & MSR_PR)
+       if (msr & MSR_PR)
                *vsid |= VSID_PR;
 
        return 0;
index 5fac89dfe4cdf2dd1673753d703dbf32c8b26b03..678e753704959775d85054293f10280843dd2339 100644 (file)
@@ -92,7 +92,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
        struct kvmppc_sid_map *map;
        u16 sid_map_mask;
 
-       if (vcpu->arch.shared->msr & MSR_PR)
+       if (kvmppc_get_msr(vcpu) & MSR_PR)
                gvsid |= VSID_PR;
 
        sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
@@ -279,7 +279,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
        u16 sid_map_mask;
        static int backwards_map = 0;
 
-       if (vcpu->arch.shared->msr & MSR_PR)
+       if (kvmppc_get_msr(vcpu) & MSR_PR)
                gvsid |= VSID_PR;
 
        /* We might get collisions that trap in preceding order, so let's
index 83da1f868fd5356dce92c721d963de0b794584f2..774a253ca4e1eaa2ab9c8277d22cbcae0d85f676 100644 (file)
@@ -38,7 +38,7 @@
 
 static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 {
-       kvmppc_set_msr(vcpu, MSR_SF);
+       kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
@@ -226,7 +226,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
        /* Magic page override */
        if (unlikely(mp_ea) &&
            unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
-           !(vcpu->arch.shared->msr & MSR_PR)) {
+           !(kvmppc_get_msr(vcpu) & MSR_PR)) {
                gpte->eaddr = eaddr;
                gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
                gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff);
@@ -269,18 +269,21 @@ do_second:
                goto no_page_found;
        }
 
-       if ((vcpu->arch.shared->msr & MSR_PR) && slbe->Kp)
+       if ((kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Kp)
                key = 4;
-       else if (!(vcpu->arch.shared->msr & MSR_PR) && slbe->Ks)
+       else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Ks)
                key = 4;
 
        for (i=0; i<16; i+=2) {
+               u64 pte0 = be64_to_cpu(pteg[i]);
+               u64 pte1 = be64_to_cpu(pteg[i + 1]);
+
                /* Check all relevant fields of 1st dword */
-               if ((pteg[i] & v_mask) == v_val) {
+               if ((pte0 & v_mask) == v_val) {
                        /* If large page bit is set, check pgsize encoding */
                        if (slbe->large &&
                            (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) {
-                               pgsize = decode_pagesize(slbe, pteg[i+1]);
+                               pgsize = decode_pagesize(slbe, pte1);
                                if (pgsize < 0)
                                        continue;
                        }
@@ -297,8 +300,8 @@ do_second:
                goto do_second;
        }
 
-       v = pteg[i];
-       r = pteg[i+1];
+       v = be64_to_cpu(pteg[i]);
+       r = be64_to_cpu(pteg[i+1]);
        pp = (r & HPTE_R_PP) | key;
        if (r & HPTE_R_PP0)
                pp |= 8;
@@ -310,6 +313,9 @@ do_second:
        gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask);
        gpte->page_size = pgsize;
        gpte->may_execute = ((r & HPTE_R_N) ? false : true);
+       if (unlikely(vcpu->arch.disable_kernel_nx) &&
+           !(kvmppc_get_msr(vcpu) & MSR_PR))
+               gpte->may_execute = true;
        gpte->may_read = false;
        gpte->may_write = false;
 
@@ -342,14 +348,14 @@ do_second:
                 * non-PAPR platforms such as mac99, and this is
                 * what real hardware does.
                 */
-               char __user *addr = (char __user *) &pteg[i+1];
+                char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64));
                r |= HPTE_R_R;
                put_user(r >> 8, addr + 6);
        }
        if (iswrite && gpte->may_write && !(r & HPTE_R_C)) {
                /* Set the dirty flag */
                /* Use a single byte write */
-               char __user *addr = (char __user *) &pteg[i+1];
+                char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64));
                r |= HPTE_R_C;
                put_user(r, addr + 7);
        }
@@ -479,7 +485,7 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
                vcpu->arch.slb[i].origv = 0;
        }
 
-       if (vcpu->arch.shared->msr & MSR_IR) {
+       if (kvmppc_get_msr(vcpu) & MSR_IR) {
                kvmppc_mmu_flush_segments(vcpu);
                kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
        }
@@ -563,7 +569,7 @@ static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid)
 {
        ulong mp_ea = vcpu->arch.magic_page_ea;
 
-       return mp_ea && !(vcpu->arch.shared->msr & MSR_PR) &&
+       return mp_ea && !(kvmppc_get_msr(vcpu) & MSR_PR) &&
                (mp_ea >> SID_SHIFT) == esid;
 }
 #endif
@@ -576,8 +582,9 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
        u64 gvsid = esid;
        ulong mp_ea = vcpu->arch.magic_page_ea;
        int pagesize = MMU_PAGE_64K;
+       u64 msr = kvmppc_get_msr(vcpu);
 
-       if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
+       if (msr & (MSR_DR|MSR_IR)) {
                slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
                if (slb) {
                        gvsid = slb->vsid;
@@ -590,7 +597,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
                }
        }
 
-       switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
+       switch (msr & (MSR_DR|MSR_IR)) {
        case 0:
                gvsid = VSID_REAL | esid;
                break;
@@ -623,7 +630,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
                gvsid |= VSID_64K;
 #endif
 
-       if (vcpu->arch.shared->msr & MSR_PR)
+       if (kvmppc_get_msr(vcpu) & MSR_PR)
                gvsid |= VSID_PR;
 
        *vsid = gvsid;
@@ -633,7 +640,7 @@ no_slb:
        /* Catch magic page case */
        if (unlikely(mp_ea) &&
            unlikely(esid == (mp_ea >> SID_SHIFT)) &&
-           !(vcpu->arch.shared->msr & MSR_PR)) {
+           !(kvmppc_get_msr(vcpu) & MSR_PR)) {
                *vsid = VSID_REAL | esid;
                return 0;
        }
index 0d513af62bba179ad7ccd777bb3f1aa4217b1cb3..0ac98392f3635f82152405f48af891a3ce27b3dc 100644 (file)
@@ -58,7 +58,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
        struct kvmppc_sid_map *map;
        u16 sid_map_mask;
 
-       if (vcpu->arch.shared->msr & MSR_PR)
+       if (kvmppc_get_msr(vcpu) & MSR_PR)
                gvsid |= VSID_PR;
 
        sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
@@ -230,7 +230,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
        u16 sid_map_mask;
        static int backwards_map = 0;
 
-       if (vcpu->arch.shared->msr & MSR_PR)
+       if (kvmppc_get_msr(vcpu) & MSR_PR)
                gvsid |= VSID_PR;
 
        /* We might get collisions that trap in preceding order, so let's
@@ -271,11 +271,8 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
        int found_inval = -1;
        int r;
 
-       if (!svcpu->slb_max)
-               svcpu->slb_max = 1;
-
        /* Are we overwriting? */
-       for (i = 1; i < svcpu->slb_max; i++) {
+       for (i = 0; i < svcpu->slb_max; i++) {
                if (!(svcpu->slb[i].esid & SLB_ESID_V))
                        found_inval = i;
                else if ((svcpu->slb[i].esid & ESID_MASK) == esid) {
@@ -285,7 +282,7 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
        }
 
        /* Found a spare entry that was invalidated before */
-       if (found_inval > 0) {
+       if (found_inval >= 0) {
                r = found_inval;
                goto out;
        }
@@ -359,7 +356,7 @@ void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size)
        ulong seg_mask = -seg_size;
        int i;
 
-       for (i = 1; i < svcpu->slb_max; i++) {
+       for (i = 0; i < svcpu->slb_max; i++) {
                if ((svcpu->slb[i].esid & SLB_ESID_V) &&
                    (svcpu->slb[i].esid & seg_mask) == ea) {
                        /* Invalidate this entry */
@@ -373,7 +370,7 @@ void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size)
 void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       svcpu->slb_max = 1;
+       svcpu->slb_max = 0;
        svcpu->slb[0].esid = 0;
        svcpu_put(svcpu);
 }
index fb25ebc0af0ce212469180eee027615bd20ea631..80561074078d01ca8d101f7997a55699e8589e3f 100644 (file)
@@ -52,7 +52,7 @@ static void kvmppc_rmap_reset(struct kvm *kvm);
 
 long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 {
-       unsigned long hpt;
+       unsigned long hpt = 0;
        struct revmap_entry *rev;
        struct page *page = NULL;
        long order = KVM_DEFAULT_HPT_ORDER;
@@ -64,22 +64,11 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
        }
 
        kvm->arch.hpt_cma_alloc = 0;
-       /*
-        * try first to allocate it from the kernel page allocator.
-        * We keep the CMA reserved for failed allocation.
-        */
-       hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT |
-                              __GFP_NOWARN, order - PAGE_SHIFT);
-
-       /* Next try to allocate from the preallocated pool */
-       if (!hpt) {
-               VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
-               page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
-               if (page) {
-                       hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
-                       kvm->arch.hpt_cma_alloc = 1;
-               } else
-                       --order;
+       VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
+       page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
+       if (page) {
+               hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+               kvm->arch.hpt_cma_alloc = 1;
        }
 
        /* Lastly try successively smaller sizes from the page allocator */
@@ -596,6 +585,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        struct kvm *kvm = vcpu->kvm;
        unsigned long *hptep, hpte[3], r;
        unsigned long mmu_seq, psize, pte_size;
+       unsigned long gpa_base, gfn_base;
        unsigned long gpa, gfn, hva, pfn;
        struct kvm_memory_slot *memslot;
        unsigned long *rmap;
@@ -634,7 +624,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        /* Translate the logical address and get the page */
        psize = hpte_page_size(hpte[0], r);
-       gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1));
+       gpa_base = r & HPTE_R_RPN & ~(psize - 1);
+       gfn_base = gpa_base >> PAGE_SHIFT;
+       gpa = gpa_base | (ea & (psize - 1));
        gfn = gpa >> PAGE_SHIFT;
        memslot = gfn_to_memslot(kvm, gfn);
 
@@ -646,6 +638,13 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        if (!kvm->arch.using_mmu_notifiers)
                return -EFAULT;         /* should never get here */
 
+       /*
+        * This should never happen, because of the slot_is_aligned()
+        * check in kvmppc_do_h_enter().
+        */
+       if (gfn_base < memslot->base_gfn)
+               return -EFAULT;
+
        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_notifier_seq;
        smp_rmb();
@@ -738,7 +737,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                goto out_unlock;
        hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 
-       rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+       /* Always put the HPTE in the rmap chain for the page base address */
+       rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
        lock_rmap(rmap);
 
        /* Check if we might have been invalidated; let the guest retry if so */
@@ -1060,22 +1060,33 @@ void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
        kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 }
 
-static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
+static int vcpus_running(struct kvm *kvm)
+{
+       return atomic_read(&kvm->arch.vcpus_running) != 0;
+}
+
+/*
+ * Returns the number of system pages that are dirty.
+ * This can be more than 1 if we find a huge-page HPTE.
+ */
+static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 {
        struct revmap_entry *rev = kvm->arch.revmap;
        unsigned long head, i, j;
+       unsigned long n;
+       unsigned long v, r;
        unsigned long *hptep;
-       int ret = 0;
+       int npages_dirty = 0;
 
  retry:
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_CHANGED) {
                *rmapp &= ~KVMPPC_RMAP_CHANGED;
-               ret = 1;
+               npages_dirty = 1;
        }
        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
                unlock_rmap(rmapp);
-               return ret;
+               return npages_dirty;
        }
 
        i = head = *rmapp & KVMPPC_RMAP_INDEX;
@@ -1083,7 +1094,22 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
                hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
                j = rev[i].forw;
 
-               if (!(hptep[1] & HPTE_R_C))
+               /*
+                * Checking the C (changed) bit here is racy since there
+                * is no guarantee about when the hardware writes it back.
+                * If the HPTE is not writable then it is stable since the
+                * page can't be written to, and we would have done a tlbie
+                * (which forces the hardware to complete any writeback)
+                * when making the HPTE read-only.
+                * If vcpus are running then this call is racy anyway
+                * since the page could get dirtied subsequently, so we
+                * expect there to be a further call which would pick up
+                * any delayed C bit writeback.
+                * Otherwise we need to do the tlbie even if C==0 in
+                * order to pick up any delayed writeback of C.
+                */
+               if (!(hptep[1] & HPTE_R_C) &&
+                   (!hpte_is_writable(hptep[1]) || vcpus_running(kvm)))
                        continue;
 
                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
@@ -1095,24 +1121,33 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
                }
 
                /* Now check and modify the HPTE */
-               if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
-                       /* need to make it temporarily absent to clear C */
-                       hptep[0] |= HPTE_V_ABSENT;
-                       kvmppc_invalidate_hpte(kvm, hptep, i);
-                       hptep[1] &= ~HPTE_R_C;
-                       eieio();
-                       hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+               if (!(hptep[0] & HPTE_V_VALID))
+                       continue;
+
+               /* need to make it temporarily absent so C is stable */
+               hptep[0] |= HPTE_V_ABSENT;
+               kvmppc_invalidate_hpte(kvm, hptep, i);
+               v = hptep[0];
+               r = hptep[1];
+               if (r & HPTE_R_C) {
+                       hptep[1] = r & ~HPTE_R_C;
                        if (!(rev[i].guest_rpte & HPTE_R_C)) {
                                rev[i].guest_rpte |= HPTE_R_C;
                                note_hpte_modification(kvm, &rev[i]);
                        }
-                       ret = 1;
+                       n = hpte_page_size(v, r);
+                       n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
+                       if (n > npages_dirty)
+                               npages_dirty = n;
+                       eieio();
                }
-               hptep[0] &= ~HPTE_V_HVLOCK;
+               v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+               v |= HPTE_V_VALID;
+               hptep[0] = v;
        } while ((i = j) != head);
 
        unlock_rmap(rmapp);
-       return ret;
+       return npages_dirty;
 }
 
 static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
@@ -1136,15 +1171,22 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
 long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
                             unsigned long *map)
 {
-       unsigned long i;
+       unsigned long i, j;
        unsigned long *rmapp;
        struct kvm_vcpu *vcpu;
 
        preempt_disable();
        rmapp = memslot->arch.rmap;
        for (i = 0; i < memslot->npages; ++i) {
-               if (kvm_test_clear_dirty(kvm, rmapp) && map)
-                       __set_bit_le(i, map);
+               int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
+               /*
+                * Note that if npages > 0 then i must be a multiple of npages,
+                * since we always put huge-page HPTEs in the rmap chain
+                * corresponding to their page base address.
+                */
+               if (npages && map)
+                       for (j = i; npages; ++j, --npages)
+                               __set_bit_le(j, map);
                ++rmapp;
        }
 
index 4f12e8f0c7187b3bf2887e4db6af39da36ce44d2..3589c4e3d49bbc62541538d19e030e9e4dc0ae41 100644 (file)
  * Authors: Alexander Graf <agraf@suse.de>
  */
 
-#ifdef __LITTLE_ENDIAN__
-#error Need to fix SLB shadow accesses in little endian mode
-#endif
-
-#define SHADOW_SLB_ESID(num)   (SLBSHADOW_SAVEAREA + (num * 0x10))
-#define SHADOW_SLB_VSID(num)   (SLBSHADOW_SAVEAREA + (num * 0x10) + 0x8)
-#define UNBOLT_SLB_ENTRY(num) \
-       ld      r9, SHADOW_SLB_ESID(num)(r12); \
-       /* Invalid? Skip. */; \
-       rldicl. r0, r9, 37, 63; \
-       beq     slb_entry_skip_ ## num; \
-       xoris   r9, r9, SLB_ESID_V@h; \
-       std     r9, SHADOW_SLB_ESID(num)(r12); \
-  slb_entry_skip_ ## num:
-
-#define REBOLT_SLB_ENTRY(num) \
-       ld      r10, SHADOW_SLB_ESID(num)(r11); \
-       cmpdi   r10, 0; \
-       beq     slb_exit_skip_ ## num; \
-       oris    r10, r10, SLB_ESID_V@h; \
-       ld      r9, SHADOW_SLB_VSID(num)(r11); \
-       slbmte  r9, r10; \
-       std     r10, SHADOW_SLB_ESID(num)(r11); \
-slb_exit_skip_ ## num:
+#define SHADOW_SLB_ENTRY_LEN   0x10
+#define OFFSET_ESID(x)         (SHADOW_SLB_ENTRY_LEN * x)
+#define OFFSET_VSID(x)         ((SHADOW_SLB_ENTRY_LEN * x) + 8)
 
 /******************************************************************************
  *                                                                            *
@@ -64,20 +43,15 @@ slb_exit_skip_ ## num:
         * SVCPU[LR]  = guest LR
         */
 
-       /* Remove LPAR shadow entries */
+BEGIN_FW_FTR_SECTION
 
-#if SLB_NUM_BOLTED == 3
+       /* Declare SLB shadow as 0 entries big */
 
-       ld      r12, PACA_SLBSHADOWPTR(r13)
+       ld      r11, PACA_SLBSHADOWPTR(r13)
+       li      r8, 0
+       stb     r8, 3(r11)
 
-       /* Remove bolted entries */
-       UNBOLT_SLB_ENTRY(0)
-       UNBOLT_SLB_ENTRY(1)
-       UNBOLT_SLB_ENTRY(2)
-       
-#else
-#error unknown number of bolted entries
-#endif
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR)
 
        /* Flush SLB */
 
@@ -100,7 +74,7 @@ slb_loop_enter:
 
        ld      r10, 0(r11)
 
-       rldicl. r0, r10, 37, 63
+       andis.  r9, r10, SLB_ESID_V@h
        beq     slb_loop_enter_skip
 
        ld      r9, 8(r11)
@@ -137,23 +111,42 @@ slb_do_enter:
         *
         */
 
-       /* Restore bolted entries from the shadow and fix it along the way */
+       /* Remove all SLB entries that are in use. */
 
-       /* We don't store anything in entry 0, so we don't need to take care of it */
+       li      r0, r0
+       slbmte  r0, r0
        slbia
-       isync
 
-#if SLB_NUM_BOLTED == 3
+       /* Restore bolted entries from the shadow */
 
        ld      r11, PACA_SLBSHADOWPTR(r13)
 
-       REBOLT_SLB_ENTRY(0)
-       REBOLT_SLB_ENTRY(1)
-       REBOLT_SLB_ENTRY(2)
-       
-#else
-#error unknown number of bolted entries
-#endif
+BEGIN_FW_FTR_SECTION
+
+       /* Declare SLB shadow as SLB_NUM_BOLTED entries big */
+
+       li      r8, SLB_NUM_BOLTED
+       stb     r8, 3(r11)
+
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR)
+
+       /* Manually load all entries from shadow SLB */
+
+       li      r8, SLBSHADOW_SAVEAREA
+       li      r7, SLBSHADOW_SAVEAREA + 8
+
+       .rept   SLB_NUM_BOLTED
+       LDX_BE  r10, r11, r8
+       cmpdi   r10, 0
+       beq     1f
+       LDX_BE  r9, r11, r7
+       slbmte  r9, r10
+1:     addi    r7, r7, SHADOW_SLB_ENTRY_LEN
+       addi    r8, r8, SHADOW_SLB_ENTRY_LEN
+       .endr
+
+       isync
+       sync
 
 slb_do_exit:
 
index 99d40f8977e8abcaaf54f499af4d15a1ae1f268e..3f295269af37e7625869d677241c3f12cf6e0d22 100644 (file)
@@ -80,7 +80,7 @@ static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level)
                return false;
 
        /* Limit user space to its own small SPR set */
-       if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM)
+       if ((kvmppc_get_msr(vcpu) & MSR_PR) && level > PRIV_PROBLEM)
                return false;
 
        return true;
@@ -94,14 +94,31 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
        int rs = get_rs(inst);
        int ra = get_ra(inst);
        int rb = get_rb(inst);
+       u32 inst_sc = 0x44000002;
 
        switch (get_op(inst)) {
+       case 0:
+               emulated = EMULATE_FAIL;
+               if ((kvmppc_get_msr(vcpu) & MSR_LE) &&
+                   (inst == swab32(inst_sc))) {
+                       /*
+                        * This is the byte reversed syscall instruction of our
+                        * hypercall handler. Early versions of LE Linux didn't
+                        * swap the instructions correctly and ended up in
+                        * illegal instructions.
+                        * Just always fail hypercalls on these broken systems.
+                        */
+                       kvmppc_set_gpr(vcpu, 3, EV_UNIMPLEMENTED);
+                       kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+                       emulated = EMULATE_DONE;
+               }
+               break;
        case 19:
                switch (get_xop(inst)) {
                case OP_19_XOP_RFID:
                case OP_19_XOP_RFI:
-                       kvmppc_set_pc(vcpu, vcpu->arch.shared->srr0);
-                       kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
+                       kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu));
+                       kvmppc_set_msr(vcpu, kvmppc_get_srr1(vcpu));
                        *advance = 0;
                        break;
 
@@ -113,16 +130,16 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
        case 31:
                switch (get_xop(inst)) {
                case OP_31_XOP_MFMSR:
-                       kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr);
+                       kvmppc_set_gpr(vcpu, rt, kvmppc_get_msr(vcpu));
                        break;
                case OP_31_XOP_MTMSRD:
                {
                        ulong rs_val = kvmppc_get_gpr(vcpu, rs);
                        if (inst & 0x10000) {
-                               ulong new_msr = vcpu->arch.shared->msr;
+                               ulong new_msr = kvmppc_get_msr(vcpu);
                                new_msr &= ~(MSR_RI | MSR_EE);
                                new_msr |= rs_val & (MSR_RI | MSR_EE);
-                               vcpu->arch.shared->msr = new_msr;
+                               kvmppc_set_msr_fast(vcpu, new_msr);
                        } else
                                kvmppc_set_msr(vcpu, rs_val);
                        break;
@@ -179,7 +196,7 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        ulong cmd = kvmppc_get_gpr(vcpu, 3);
                        int i;
 
-                       if ((vcpu->arch.shared->msr & MSR_PR) ||
+                       if ((kvmppc_get_msr(vcpu) & MSR_PR) ||
                            !vcpu->arch.papr_enabled) {
                                emulated = EMULATE_FAIL;
                                break;
@@ -261,14 +278,14 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                ra_val = kvmppc_get_gpr(vcpu, ra);
 
                        addr = (ra_val + rb_val) & ~31ULL;
-                       if (!(vcpu->arch.shared->msr & MSR_SF))
+                       if (!(kvmppc_get_msr(vcpu) & MSR_SF))
                                addr &= 0xffffffff;
                        vaddr = addr;
 
                        r = kvmppc_st(vcpu, &addr, 32, zeros, true);
                        if ((r == -ENOENT) || (r == -EPERM)) {
                                *advance = 0;
-                               vcpu->arch.shared->dar = vaddr;
+                               kvmppc_set_dar(vcpu, vaddr);
                                vcpu->arch.fault_dar = vaddr;
 
                                dsisr = DSISR_ISSTORE;
@@ -277,7 +294,7 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                else if (r == -EPERM)
                                        dsisr |= DSISR_PROTFAULT;
 
-                               vcpu->arch.shared->dsisr = dsisr;
+                               kvmppc_set_dsisr(vcpu, dsisr);
                                vcpu->arch.fault_dsisr = dsisr;
 
                                kvmppc_book3s_queue_irqprio(vcpu,
@@ -356,10 +373,10 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
                to_book3s(vcpu)->sdr1 = spr_val;
                break;
        case SPRN_DSISR:
-               vcpu->arch.shared->dsisr = spr_val;
+               kvmppc_set_dsisr(vcpu, spr_val);
                break;
        case SPRN_DAR:
-               vcpu->arch.shared->dar = spr_val;
+               kvmppc_set_dar(vcpu, spr_val);
                break;
        case SPRN_HIOR:
                to_book3s(vcpu)->hior = spr_val;
@@ -438,6 +455,31 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
        case SPRN_GQR7:
                to_book3s(vcpu)->gqr[sprn - SPRN_GQR0] = spr_val;
                break;
+       case SPRN_FSCR:
+               vcpu->arch.fscr = spr_val;
+               break;
+#ifdef CONFIG_PPC_BOOK3S_64
+       case SPRN_BESCR:
+               vcpu->arch.bescr = spr_val;
+               break;
+       case SPRN_EBBHR:
+               vcpu->arch.ebbhr = spr_val;
+               break;
+       case SPRN_EBBRR:
+               vcpu->arch.ebbrr = spr_val;
+               break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       case SPRN_TFHAR:
+               vcpu->arch.tfhar = spr_val;
+               break;
+       case SPRN_TEXASR:
+               vcpu->arch.texasr = spr_val;
+               break;
+       case SPRN_TFIAR:
+               vcpu->arch.tfiar = spr_val;
+               break;
+#endif
+#endif
        case SPRN_ICTC:
        case SPRN_THRM1:
        case SPRN_THRM2:
@@ -455,6 +497,13 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
        case SPRN_WPAR_GEKKO:
        case SPRN_MSSSR0:
        case SPRN_DABR:
+#ifdef CONFIG_PPC_BOOK3S_64
+       case SPRN_MMCRS:
+       case SPRN_MMCRA:
+       case SPRN_MMCR0:
+       case SPRN_MMCR1:
+       case SPRN_MMCR2:
+#endif
                break;
 unprivileged:
        default:
@@ -493,10 +542,10 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
                *spr_val = to_book3s(vcpu)->sdr1;
                break;
        case SPRN_DSISR:
-               *spr_val = vcpu->arch.shared->dsisr;
+               *spr_val = kvmppc_get_dsisr(vcpu);
                break;
        case SPRN_DAR:
-               *spr_val = vcpu->arch.shared->dar;
+               *spr_val = kvmppc_get_dar(vcpu);
                break;
        case SPRN_HIOR:
                *spr_val = to_book3s(vcpu)->hior;
@@ -538,6 +587,31 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
        case SPRN_GQR7:
                *spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0];
                break;
+       case SPRN_FSCR:
+               *spr_val = vcpu->arch.fscr;
+               break;
+#ifdef CONFIG_PPC_BOOK3S_64
+       case SPRN_BESCR:
+               *spr_val = vcpu->arch.bescr;
+               break;
+       case SPRN_EBBHR:
+               *spr_val = vcpu->arch.ebbhr;
+               break;
+       case SPRN_EBBRR:
+               *spr_val = vcpu->arch.ebbrr;
+               break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       case SPRN_TFHAR:
+               *spr_val = vcpu->arch.tfhar;
+               break;
+       case SPRN_TEXASR:
+               *spr_val = vcpu->arch.texasr;
+               break;
+       case SPRN_TFIAR:
+               *spr_val = vcpu->arch.tfiar;
+               break;
+#endif
+#endif
        case SPRN_THRM1:
        case SPRN_THRM2:
        case SPRN_THRM3:
@@ -553,6 +627,14 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
        case SPRN_WPAR_GEKKO:
        case SPRN_MSSSR0:
        case SPRN_DABR:
+#ifdef CONFIG_PPC_BOOK3S_64
+       case SPRN_MMCRS:
+       case SPRN_MMCRA:
+       case SPRN_MMCR0:
+       case SPRN_MMCR1:
+       case SPRN_MMCR2:
+       case SPRN_TIR:
+#endif
                *spr_val = 0;
                break;
        default:
@@ -569,48 +651,17 @@ unprivileged:
 
 u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
 {
-       u32 dsisr = 0;
-
-       /*
-        * This is what the spec says about DSISR bits (not mentioned = 0):
-        *
-        * 12:13                [DS]    Set to bits 30:31
-        * 15:16                [X]     Set to bits 29:30
-        * 17                   [X]     Set to bit 25
-        *                      [D/DS]  Set to bit 5
-        * 18:21                [X]     Set to bits 21:24
-        *                      [D/DS]  Set to bits 1:4
-        * 22:26                        Set to bits 6:10 (RT/RS/FRT/FRS)
-        * 27:31                        Set to bits 11:15 (RA)
-        */
-
-       switch (get_op(inst)) {
-       /* D-form */
-       case OP_LFS:
-       case OP_LFD:
-       case OP_STFD:
-       case OP_STFS:
-               dsisr |= (inst >> 12) & 0x4000; /* bit 17 */
-               dsisr |= (inst >> 17) & 0x3c00; /* bits 18:21 */
-               break;
-       /* X-form */
-       case 31:
-               dsisr |= (inst << 14) & 0x18000; /* bits 15:16 */
-               dsisr |= (inst << 8)  & 0x04000; /* bit 17 */
-               dsisr |= (inst << 3)  & 0x03c00; /* bits 18:21 */
-               break;
-       default:
-               printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst);
-               break;
-       }
-
-       dsisr |= (inst >> 16) & 0x03ff; /* bits 22:31 */
-
-       return dsisr;
+       return make_dsisr(inst);
 }
 
 ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
 {
+#ifdef CONFIG_PPC_BOOK3S_64
+       /*
+        * Linux's fix_alignment() assumes that DAR is valid, so can we
+        */
+       return vcpu->arch.fault_dar;
+#else
        ulong dar = 0;
        ulong ra = get_ra(inst);
        ulong rb = get_rb(inst);
@@ -635,4 +686,5 @@ ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
        }
 
        return dar;
+#endif
 }
index 20d4ea8e656d3eacf3202ef42f806168d44841f4..0d013fbc2e13f876588a3af04a7019a87d2d6da7 100644 (file)
@@ -18,6 +18,7 @@
  */
 
 #include <linux/export.h>
+#include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
index 8227dba5af0f4f3ebd70c051530ce98fa4a6e30d..aba05bbb3e744b6e516afbe9093923793762040d 100644 (file)
@@ -879,24 +879,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_IAMR:
                *val = get_reg_val(id, vcpu->arch.iamr);
                break;
-       case KVM_REG_PPC_FSCR:
-               *val = get_reg_val(id, vcpu->arch.fscr);
-               break;
        case KVM_REG_PPC_PSPB:
                *val = get_reg_val(id, vcpu->arch.pspb);
                break;
-       case KVM_REG_PPC_EBBHR:
-               *val = get_reg_val(id, vcpu->arch.ebbhr);
-               break;
-       case KVM_REG_PPC_EBBRR:
-               *val = get_reg_val(id, vcpu->arch.ebbrr);
-               break;
-       case KVM_REG_PPC_BESCR:
-               *val = get_reg_val(id, vcpu->arch.bescr);
-               break;
-       case KVM_REG_PPC_TAR:
-               *val = get_reg_val(id, vcpu->arch.tar);
-               break;
        case KVM_REG_PPC_DPDES:
                *val = get_reg_val(id, vcpu->arch.vcore->dpdes);
                break;
@@ -1091,24 +1076,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_IAMR:
                vcpu->arch.iamr = set_reg_val(id, *val);
                break;
-       case KVM_REG_PPC_FSCR:
-               vcpu->arch.fscr = set_reg_val(id, *val);
-               break;
        case KVM_REG_PPC_PSPB:
                vcpu->arch.pspb = set_reg_val(id, *val);
                break;
-       case KVM_REG_PPC_EBBHR:
-               vcpu->arch.ebbhr = set_reg_val(id, *val);
-               break;
-       case KVM_REG_PPC_EBBRR:
-               vcpu->arch.ebbrr = set_reg_val(id, *val);
-               break;
-       case KVM_REG_PPC_BESCR:
-               vcpu->arch.bescr = set_reg_val(id, *val);
-               break;
-       case KVM_REG_PPC_TAR:
-               vcpu->arch.tar = set_reg_val(id, *val);
-               break;
        case KVM_REG_PPC_DPDES:
                vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
                break;
@@ -1280,6 +1250,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
                goto free_vcpu;
 
        vcpu->arch.shared = &vcpu->arch.shregs;
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+       /*
+        * The shared struct is never shared on HV,
+        * so we can always use host endianness
+        */
+#ifdef __BIG_ENDIAN__
+       vcpu->arch.shared_big_endian = true;
+#else
+       vcpu->arch.shared_big_endian = false;
+#endif
+#endif
        vcpu->arch.mmcr[0] = MMCR0_FC;
        vcpu->arch.ctrl = CTRL_RUNLATCH;
        /* default to host PVR, since we can't spoof it */
@@ -1949,6 +1930,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
         * support pte_enc here
         */
        (*sps)->enc[0].pte_enc = def->penc[linux_psize];
+       /*
+        * Add 16MB MPSS support if host supports it
+        */
+       if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
+               (*sps)->enc[1].page_shift = 24;
+               (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+       }
        (*sps)++;
 }
 
index 8fcc36306a02153dab0234e9f60558587715333f..6e6224318c36aaf166e33e7c57c8a8e0dcd9544e 100644 (file)
@@ -42,13 +42,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags)
 
        /*
         * If there is only one vcore, and it's currently running,
+        * as indicated by local_paca->kvm_hstate.kvm_vcpu being set,
         * we can use tlbiel as long as we mark all other physical
         * cores as potentially having stale TLB entries for this lpid.
         * If we're not using MMU notifiers, we never take pages away
         * from the guest, so we can use tlbiel if requested.
         * Otherwise, don't use tlbiel.
         */
-       if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
+       if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu)
                global = 0;
        else if (kvm->arch.using_mmu_notifiers)
                global = 1;
index 07c8b5b0f9d256d80ef4853c741a882f7e4f560f..974793435a2e20e82a22780b3e7ad330c17ccf40 100644 (file)
@@ -86,6 +86,12 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        lbz     r4, LPPACA_PMCINUSE(r3)
        cmpwi   r4, 0
        beq     23f                     /* skip if not */
+BEGIN_FTR_SECTION
+       ld      r3, HSTATE_MMCR(r13)
+       andi.   r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+       cmpwi   r4, MMCR0_PMAO
+       beql    kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
        lwz     r3, HSTATE_PMC(r13)
        lwz     r4, HSTATE_PMC + 4(r13)
        lwz     r5, HSTATE_PMC + 8(r13)
@@ -737,6 +743,12 @@ skip_tm:
        sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
        mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
        isync
+BEGIN_FTR_SECTION
+       ld      r3, VCPU_MMCR(r4)
+       andi.   r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+       cmpwi   r5, MMCR0_PMAO
+       beql    kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
        lwz     r3, VCPU_PMC(r4)        /* always load up guest PMU registers */
        lwz     r5, VCPU_PMC + 4(r4)    /* to prevent information leak */
        lwz     r6, VCPU_PMC + 8(r4)
@@ -1439,6 +1451,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM)
 25:
        /* Save PMU registers if requested */
        /* r8 and cr0.eq are live here */
+BEGIN_FTR_SECTION
+       /*
+        * POWER8 seems to have a hardware bug where setting
+        * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
+        * when some counters are already negative doesn't seem
+        * to cause a performance monitor alert (and hence interrupt).
+        * The effect of this is that when saving the PMU state,
+        * if there is no PMU alert pending when we read MMCR0
+        * before freezing the counters, but one becomes pending
+        * before we read the counters, we lose it.
+        * To work around this, we need a way to freeze the counters
+        * before reading MMCR0.  Normally, freezing the counters
+        * is done by writing MMCR0 (to set MMCR0[FC]) which
+        * unavoidably writes MMCR0[PMAO] as well.  On POWER8,
+        * we can also freeze the counters using MMCR2, by writing
+        * 1s to all the counter freeze condition bits (there are
+        * 9 bits each for 6 counters).
+        */
+       li      r3, -1                  /* set all freeze bits */
+       clrrdi  r3, r3, 10
+       mfspr   r10, SPRN_MMCR2
+       mtspr   SPRN_MMCR2, r3
+       isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r3, 1
        sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
        mfspr   r4, SPRN_MMCR0          /* save MMCR0 */
@@ -1462,6 +1498,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
        std     r4, VCPU_MMCR(r9)
        std     r5, VCPU_MMCR + 8(r9)
        std     r6, VCPU_MMCR + 16(r9)
+BEGIN_FTR_SECTION
+       std     r10, VCPU_MMCR + 24(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        std     r7, VCPU_SIAR(r9)
        std     r8, VCPU_SDAR(r9)
        mfspr   r3, SPRN_PMC1
@@ -1485,12 +1524,10 @@ BEGIN_FTR_SECTION
        stw     r11, VCPU_PMC + 28(r9)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 BEGIN_FTR_SECTION
-       mfspr   r4, SPRN_MMCR2
        mfspr   r5, SPRN_SIER
        mfspr   r6, SPRN_SPMC1
        mfspr   r7, SPRN_SPMC2
        mfspr   r8, SPRN_MMCRS
-       std     r4, VCPU_MMCR + 24(r9)
        std     r5, VCPU_SIER(r9)
        stw     r6, VCPU_PMC + 24(r9)
        stw     r7, VCPU_PMC + 28(r9)
@@ -2227,6 +2264,7 @@ machine_check_realmode:
        beq     mc_cont
        /* If not, deliver a machine check.  SRR0/1 are already set */
        li      r10, BOOK3S_INTERRUPT_MACHINE_CHECK
+       ld      r11, VCPU_MSR(r9)
        bl      kvmppc_msr_interrupt
        b       fast_interrupt_c_return
 
@@ -2431,3 +2469,21 @@ kvmppc_msr_interrupt:
        li      r0, 1
 1:     rldimi  r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
        blr
+
+/*
+ * This works around a hardware bug on POWER8E processors, where
+ * writing a 1 to the MMCR0[PMAO] bit doesn't generate a
+ * performance monitor interrupt.  Instead, when we need to have
+ * an interrupt pending, we have to arrange for a counter to overflow.
+ */
+kvmppc_fix_pmao:
+       li      r3, 0
+       mtspr   SPRN_MMCR2, r3
+       lis     r3, (MMCR0_PMXE | MMCR0_FCECE)@h
+       ori     r3, r3, MMCR0_PMCjCE | MMCR0_C56RUN
+       mtspr   SPRN_MMCR0, r3
+       lis     r3, 0x7fff
+       ori     r3, r3, 0xffff
+       mtspr   SPRN_PMC6, r3
+       isync
+       blr
index 3533c999194a3db8d835d570bd7f4d86bb494bbb..e2c29e381dc7096d13af053526e0d98cf951d08e 100644 (file)
@@ -104,8 +104,27 @@ kvm_start_lightweight:
        stb     r3, HSTATE_RESTORE_HID5(r13)
 
        /* Load up guest SPRG3 value, since it's user readable */
-       ld      r3, VCPU_SHARED(r4)
-       ld      r3, VCPU_SHARED_SPRG3(r3)
+       lwz     r3, VCPU_SHAREDBE(r4)
+       cmpwi   r3, 0
+       ld      r5, VCPU_SHARED(r4)
+       beq     sprg3_little_endian
+sprg3_big_endian:
+#ifdef __BIG_ENDIAN__
+       ld      r3, VCPU_SHARED_SPRG3(r5)
+#else
+       addi    r5, r5, VCPU_SHARED_SPRG3
+       ldbrx   r3, 0, r5
+#endif
+       b       after_sprg3_load
+sprg3_little_endian:
+#ifdef __LITTLE_ENDIAN__
+       ld      r3, VCPU_SHARED_SPRG3(r5)
+#else
+       addi    r5, r5, VCPU_SHARED_SPRG3
+       ldbrx   r3, 0, r5
+#endif
+
+after_sprg3_load:
        mtspr   SPRN_SPRG3, r3
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
index c1abd95063f47d56a2092c57352bc0fff7ff9632..6c8011fd57e621fd702dfd5bb1259d81e9718ada 100644 (file)
@@ -165,16 +165,18 @@ static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
 
 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
 {
-       u64 dsisr;
-       struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared;
+       u32 dsisr;
+       u64 msr = kvmppc_get_msr(vcpu);
 
-       shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0);
-       shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0);
-       shared->dar = eaddr;
+       msr = kvmppc_set_field(msr, 33, 36, 0);
+       msr = kvmppc_set_field(msr, 42, 47, 0);
+       kvmppc_set_msr(vcpu, msr);
+       kvmppc_set_dar(vcpu, eaddr);
        /* Page Fault */
        dsisr = kvmppc_set_field(0, 33, 33, 1);
        if (is_store)
-               shared->dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
+               dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
+       kvmppc_set_dsisr(vcpu, dsisr);
        kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
 }
 
@@ -660,7 +662,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
        if (!kvmppc_inst_is_paired_single(vcpu, inst))
                return EMULATE_FAIL;
 
-       if (!(vcpu->arch.shared->msr & MSR_FP)) {
+       if (!(kvmppc_get_msr(vcpu) & MSR_FP)) {
                kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL);
                return EMULATE_AGAIN;
        }
index 02f1defd8bb9aa092dca510f4c80edeb45fafce6..8eef1e5190773c9c290bf46581e7fd025f7dfaeb 100644 (file)
@@ -53,6 +53,7 @@
 
 static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
                             ulong msr);
+static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 
 /* Some compatibility defines */
 #ifdef CONFIG_PPC_BOOK3S_32
@@ -89,6 +90,7 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
 #endif
 
        kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
+       kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
        vcpu->cpu = -1;
 }
 
@@ -115,6 +117,9 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
        svcpu->ctr = vcpu->arch.ctr;
        svcpu->lr  = vcpu->arch.lr;
        svcpu->pc  = vcpu->arch.pc;
+#ifdef CONFIG_PPC_BOOK3S_64
+       svcpu->shadow_fscr = vcpu->arch.shadow_fscr;
+#endif
        svcpu->in_use = true;
 }
 
@@ -158,6 +163,9 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
        vcpu->arch.fault_dar   = svcpu->fault_dar;
        vcpu->arch.fault_dsisr = svcpu->fault_dsisr;
        vcpu->arch.last_inst   = svcpu->last_inst;
+#ifdef CONFIG_PPC_BOOK3S_64
+       vcpu->arch.shadow_fscr = svcpu->shadow_fscr;
+#endif
        svcpu->in_use = false;
 
 out:
@@ -246,14 +254,15 @@ static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte)
 
 static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 {
-       ulong smsr = vcpu->arch.shared->msr;
+       ulong guest_msr = kvmppc_get_msr(vcpu);
+       ulong smsr = guest_msr;
 
        /* Guest MSR values */
-       smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE;
+       smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
        /* Process MSR values */
        smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
        /* External providers the guest reserved */
-       smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
+       smsr |= (guest_msr & vcpu->arch.guest_owned_ext);
        /* 64-bit Process MSR values */
 #ifdef CONFIG_PPC_BOOK3S_64
        smsr |= MSR_ISF | MSR_HV;
@@ -263,14 +272,14 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 
 static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 {
-       ulong old_msr = vcpu->arch.shared->msr;
+       ulong old_msr = kvmppc_get_msr(vcpu);
 
 #ifdef EXIT_DEBUG
        printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
 #endif
 
        msr &= to_book3s(vcpu)->msr_mask;
-       vcpu->arch.shared->msr = msr;
+       kvmppc_set_msr_fast(vcpu, msr);
        kvmppc_recalc_shadow_msr(vcpu);
 
        if (msr & MSR_POW) {
@@ -281,11 +290,11 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 
                        /* Unset POW bit after we woke up */
                        msr &= ~MSR_POW;
-                       vcpu->arch.shared->msr = msr;
+                       kvmppc_set_msr_fast(vcpu, msr);
                }
        }
 
-       if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
+       if ((kvmppc_get_msr(vcpu) & (MSR_PR|MSR_IR|MSR_DR)) !=
                   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
                kvmppc_mmu_flush_segments(vcpu);
                kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
@@ -317,7 +326,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
        }
 
        /* Preload FPU if it's enabled */
-       if (vcpu->arch.shared->msr & MSR_FP)
+       if (kvmppc_get_msr(vcpu) & MSR_FP)
                kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 }
 
@@ -427,8 +436,8 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
 
        /* patch dcbz into reserved instruction, so we trap */
        for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
-               if ((page[i] & 0xff0007ff) == INS_DCBZ)
-                       page[i] &= 0xfffffff7;
+               if ((be32_to_cpu(page[i]) & 0xff0007ff) == INS_DCBZ)
+                       page[i] &= cpu_to_be32(0xfffffff7);
 
        kunmap_atomic(page);
        put_page(hpage);
@@ -438,7 +447,7 @@ static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        ulong mp_pa = vcpu->arch.magic_page_pa;
 
-       if (!(vcpu->arch.shared->msr & MSR_SF))
+       if (!(kvmppc_get_msr(vcpu) & MSR_SF))
                mp_pa = (uint32_t)mp_pa;
 
        if (unlikely(mp_pa) &&
@@ -459,8 +468,8 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        int page_found = 0;
        struct kvmppc_pte pte;
        bool is_mmio = false;
-       bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
-       bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
+       bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false;
+       bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false;
        u64 vsid;
 
        relocated = data ? dr : ir;
@@ -480,7 +489,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                pte.page_size = MMU_PAGE_64K;
        }
 
-       switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
+       switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) {
        case 0:
                pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
                break;
@@ -488,7 +497,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        case MSR_IR:
                vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
 
-               if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
+               if ((kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) == MSR_DR)
                        pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
                else
                        pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
@@ -511,22 +520,25 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        if (page_found == -ENOENT) {
                /* Page not found in guest PTE entries */
-               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr;
-               vcpu->arch.shared->msr |=
-                       vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL;
+               u64 ssrr1 = vcpu->arch.shadow_srr1;
+               u64 msr = kvmppc_get_msr(vcpu);
+               kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
+               kvmppc_set_dsisr(vcpu, vcpu->arch.fault_dsisr);
+               kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL));
                kvmppc_book3s_queue_irqprio(vcpu, vec);
        } else if (page_found == -EPERM) {
                /* Storage protection */
-               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE;
-               vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
-               vcpu->arch.shared->msr |=
-                       vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL;
+               u32 dsisr = vcpu->arch.fault_dsisr;
+               u64 ssrr1 = vcpu->arch.shadow_srr1;
+               u64 msr = kvmppc_get_msr(vcpu);
+               kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
+               dsisr = (dsisr & ~DSISR_NOHPTE) | DSISR_PROTFAULT;
+               kvmppc_set_dsisr(vcpu, dsisr);
+               kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL));
                kvmppc_book3s_queue_irqprio(vcpu, vec);
        } else if (page_found == -EINVAL) {
                /* Page not found in guest SLB */
-               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+               kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
                kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
        } else if (!is_mmio &&
                   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
@@ -606,6 +618,25 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
        kvmppc_recalc_shadow_msr(vcpu);
 }
 
+/* Give up facility (TAR / EBB / DSCR) */
+static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+       if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) {
+               /* Facility not available to the guest, ignore giveup request */
+               return;
+       }
+
+       switch (fac) {
+       case FSCR_TAR_LG:
+               vcpu->arch.tar = mfspr(SPRN_TAR);
+               mtspr(SPRN_TAR, current->thread.tar);
+               vcpu->arch.shadow_fscr &= ~FSCR_TAR;
+               break;
+       }
+#endif
+}
+
 static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
 {
        ulong srr0 = kvmppc_get_pc(vcpu);
@@ -614,11 +645,12 @@ static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
 
        ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
        if (ret == -ENOENT) {
-               ulong msr = vcpu->arch.shared->msr;
+               ulong msr = kvmppc_get_msr(vcpu);
 
                msr = kvmppc_set_field(msr, 33, 33, 1);
                msr = kvmppc_set_field(msr, 34, 36, 0);
-               vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
+               msr = kvmppc_set_field(msr, 42, 47, 0);
+               kvmppc_set_msr_fast(vcpu, msr);
                kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
                return EMULATE_AGAIN;
        }
@@ -651,7 +683,7 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
        if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
                return RESUME_GUEST;
 
-       if (!(vcpu->arch.shared->msr & msr)) {
+       if (!(kvmppc_get_msr(vcpu) & msr)) {
                kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
                return RESUME_GUEST;
        }
@@ -683,16 +715,20 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 #endif
 
        if (msr & MSR_FP) {
+               preempt_disable();
                enable_kernel_fp();
                load_fp_state(&vcpu->arch.fp);
                t->fp_save_area = &vcpu->arch.fp;
+               preempt_enable();
        }
 
        if (msr & MSR_VEC) {
 #ifdef CONFIG_ALTIVEC
+               preempt_disable();
                enable_kernel_altivec();
                load_vr_state(&vcpu->arch.vr);
                t->vr_save_area = &vcpu->arch.vr;
+               preempt_enable();
 #endif
        }
 
@@ -716,18 +752,90 @@ static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
                return;
 
        if (lost_ext & MSR_FP) {
+               preempt_disable();
                enable_kernel_fp();
                load_fp_state(&vcpu->arch.fp);
+               preempt_enable();
        }
 #ifdef CONFIG_ALTIVEC
        if (lost_ext & MSR_VEC) {
+               preempt_disable();
                enable_kernel_altivec();
                load_vr_state(&vcpu->arch.vr);
+               preempt_enable();
        }
 #endif
        current->thread.regs->msr |= lost_ext;
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+
+static void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac)
+{
+       /*
+        * Inject the Interrupt Cause field and trigger a guest interrupt:
+        * bits 56-63 of vcpu->arch.fscr are cleared and then OR-ed with
+        * @fac shifted into that position, after which
+        * BOOK3S_INTERRUPT_FAC_UNAVAIL is queued for the vcpu.
+        */
+       vcpu->arch.fscr &= ~(0xffULL << 56);
+       vcpu->arch.fscr |= (fac << 56);
+       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL);
+}
+
+static void kvmppc_emulate_fac(struct kvm_vcpu *vcpu, ulong fac)
+{
+       enum emulation_result er = EMULATE_FAIL;
+
+       if (!(kvmppc_get_msr(vcpu) & MSR_PR))
+               er = kvmppc_emulate_instruction(vcpu->run, vcpu);
+
+       if ((er != EMULATE_DONE) && (er != EMULATE_AGAIN)) {
+               /*
+                * Couldn't emulate, trigger interrupt in guest. With MSR_PR
+                * set no emulation was attempted, so er still holds its
+                * initial EMULATE_FAIL and this path is taken as well.
+                */
+               kvmppc_trigger_fac_interrupt(vcpu, fac);
+       }
+}
+
+/*
+ * Enable facilities (TAR, EBB, DSCR) for the guest
+ *
+ * @fac is the facility number, compared against the FSCR_*_LG constants.
+ * BUGs if the CPU lacks CPU_FTR_ARCH_207S. When the guest has not enabled
+ * the facility, a facility-unavailable interrupt is queued for it instead;
+ * otherwise TAR is switched in directly and every other facility is
+ * passed to kvmppc_emulate_fac(). Returns RESUME_GUEST on every path.
+ */
+static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac)
+{
+       bool guest_fac_enabled;
+       BUG_ON(!cpu_has_feature(CPU_FTR_ARCH_207S));
+
+       /*
+        * Not every facility is enabled by FSCR bits, check whether the
+        * guest has this facility enabled at all.
+        */
+       switch (fac) {
+       case FSCR_TAR_LG:
+       case FSCR_EBB_LG:
+               guest_fac_enabled = (vcpu->arch.fscr & (1ULL << fac));
+               break;
+       case FSCR_TM_LG:
+               guest_fac_enabled = kvmppc_get_msr(vcpu) & MSR_TM;
+               break;
+       default:
+               guest_fac_enabled = false;
+               break;
+       }
+
+       if (!guest_fac_enabled) {
+               /* Facility not enabled by the guest */
+               kvmppc_trigger_fac_interrupt(vcpu, fac);
+               return RESUME_GUEST;
+       }
+
+       switch (fac) {
+       case FSCR_TAR_LG:
+               /* TAR switching isn't lazy in Linux yet */
+               current->thread.tar = mfspr(SPRN_TAR);
+               mtspr(SPRN_TAR, vcpu->arch.tar);
+               vcpu->arch.shadow_fscr |= FSCR_TAR;
+               break;
+       default:
+               kvmppc_emulate_fac(vcpu, fac);
+               break;
+       }
+
+       return RESUME_GUEST;
+}
+#endif
+
 int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                          unsigned int exit_nr)
 {
@@ -784,7 +892,9 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
                        r = RESUME_GUEST;
                } else {
-                       vcpu->arch.shared->msr |= shadow_srr1 & 0x58000000;
+                       u64 msr = kvmppc_get_msr(vcpu);
+                       msr |= shadow_srr1 & 0x58000000;
+                       kvmppc_set_msr_fast(vcpu, msr);
                        kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
                        r = RESUME_GUEST;
                }
@@ -824,8 +934,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
                        srcu_read_unlock(&vcpu->kvm->srcu, idx);
                } else {
-                       vcpu->arch.shared->dar = dar;
-                       vcpu->arch.shared->dsisr = fault_dsisr;
+                       kvmppc_set_dar(vcpu, dar);
+                       kvmppc_set_dsisr(vcpu, fault_dsisr);
                        kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
                        r = RESUME_GUEST;
                }
@@ -833,7 +943,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
        }
        case BOOK3S_INTERRUPT_DATA_SEGMENT:
                if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
-                       vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+                       kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
                        kvmppc_book3s_queue_irqprio(vcpu,
                                BOOK3S_INTERRUPT_DATA_SEGMENT);
                }
@@ -871,7 +981,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 program_interrupt:
                flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
 
-               if (vcpu->arch.shared->msr & MSR_PR) {
+               if (kvmppc_get_msr(vcpu) & MSR_PR) {
 #ifdef EXIT_DEBUG
                        printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
 #endif
@@ -913,7 +1023,7 @@ program_interrupt:
        case BOOK3S_INTERRUPT_SYSCALL:
                if (vcpu->arch.papr_enabled &&
                    (kvmppc_get_last_sc(vcpu) == 0x44000022) &&
-                   !(vcpu->arch.shared->msr & MSR_PR)) {
+                   !(kvmppc_get_msr(vcpu) & MSR_PR)) {
                        /* SC 1 papr hypercalls */
                        ulong cmd = kvmppc_get_gpr(vcpu, 3);
                        int i;
@@ -945,7 +1055,7 @@ program_interrupt:
                                gprs[i] = kvmppc_get_gpr(vcpu, i);
                        vcpu->arch.osi_needed = 1;
                        r = RESUME_HOST_NV;
-               } else if (!(vcpu->arch.shared->msr & MSR_PR) &&
+               } else if (!(kvmppc_get_msr(vcpu) & MSR_PR) &&
                    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
                        /* KVM PV hypercalls */
                        kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
@@ -986,14 +1096,26 @@ program_interrupt:
        }
        case BOOK3S_INTERRUPT_ALIGNMENT:
                if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
-                       vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
-                               kvmppc_get_last_inst(vcpu));
-                       vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
-                               kvmppc_get_last_inst(vcpu));
+                       u32 last_inst = kvmppc_get_last_inst(vcpu);
+                       u32 dsisr;
+                       u64 dar;
+
+                       dsisr = kvmppc_alignment_dsisr(vcpu, last_inst);
+                       dar = kvmppc_alignment_dar(vcpu, last_inst);
+
+                       kvmppc_set_dsisr(vcpu, dsisr);
+                       kvmppc_set_dar(vcpu, dar);
+
                        kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
                }
                r = RESUME_GUEST;
                break;
+#ifdef CONFIG_PPC_BOOK3S_64
+       case BOOK3S_INTERRUPT_FAC_UNAVAIL:
+               kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56);
+               r = RESUME_GUEST;
+               break;
+#endif
        case BOOK3S_INTERRUPT_MACHINE_CHECK:
        case BOOK3S_INTERRUPT_TRACE:
                kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
@@ -1054,7 +1176,7 @@ static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu,
                }
        } else {
                for (i = 0; i < 16; i++)
-                       sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
+                       sregs->u.s.ppc32.sr[i] = kvmppc_get_sr(vcpu, i);
 
                for (i = 0; i < 8; i++) {
                        sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
@@ -1110,6 +1232,15 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_HIOR:
                *val = get_reg_val(id, to_book3s(vcpu)->hior);
                break;
+       case KVM_REG_PPC_LPCR:
+               /*
+                * We are only interested in the LPCR_ILE bit
+                */
+               if (vcpu->arch.intr_msr & MSR_LE)
+                       *val = get_reg_val(id, LPCR_ILE);
+               else
+                       *val = get_reg_val(id, 0);
+               break;
        default:
                r = -EINVAL;
                break;
@@ -1118,6 +1249,14 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
        return r;
 }
 
+static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr)
+{
+       if (new_lpcr & LPCR_ILE) /* only the ILE bit of new_lpcr is examined */
+               vcpu->arch.intr_msr |= MSR_LE; /* ILE set: set MSR_LE */
+       else
+               vcpu->arch.intr_msr &= ~MSR_LE; /* ILE clear: clear MSR_LE */
+}
+
 static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
                                 union kvmppc_one_reg *val)
 {
@@ -1128,6 +1267,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
                to_book3s(vcpu)->hior = set_reg_val(id, *val);
                to_book3s(vcpu)->hior_explicit = true;
                break;
+       case KVM_REG_PPC_LPCR:
+               kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
+               break;
        default:
                r = -EINVAL;
                break;
@@ -1170,8 +1312,14 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm,
                goto uninit_vcpu;
        /* the real shared page fills the last 4k of our page */
        vcpu->arch.shared = (void *)(p + PAGE_SIZE - 4096);
-
 #ifdef CONFIG_PPC_BOOK3S_64
+       /* Always start the shared struct in native endian mode */
+#ifdef __BIG_ENDIAN__
+        vcpu->arch.shared_big_endian = true;
+#else
+        vcpu->arch.shared_big_endian = false;
+#endif
+
        /*
         * Default to the same as the host if we're on sufficiently
         * recent machine that we have 1TB segments;
@@ -1180,6 +1328,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm,
        vcpu->arch.pvr = 0x3C0301;
        if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
                vcpu->arch.pvr = mfspr(SPRN_PVR);
+       vcpu->arch.intr_msr = MSR_SF;
 #else
        /* default to book3s_32 (750) */
        vcpu->arch.pvr = 0x84202;
@@ -1187,7 +1336,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm,
        kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr);
        vcpu->arch.slb_nr = 64;
 
-       vcpu->arch.shadow_msr = MSR_USER64;
+       vcpu->arch.shadow_msr = MSR_USER64 & ~MSR_LE;
 
        err = kvmppc_mmu_init(vcpu);
        if (err < 0)
@@ -1264,7 +1413,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 #endif
 
        /* Preload FPU if it's enabled */
-       if (vcpu->arch.shared->msr & MSR_FP)
+       if (kvmppc_get_msr(vcpu) & MSR_FP)
                kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 
        kvmppc_fix_ee_before_entry();
@@ -1277,6 +1426,9 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        /* Make sure we save the guest FPU/Altivec/VSX state */
        kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
 
+       /* Make sure we save the guest TAR/EBB/DSCR state */
+       kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+
 out:
        vcpu->mode = OUTSIDE_GUEST_MODE;
        return ret;
index 5efa97b993d899faf4d16e65fc4541a241e1b5a9..52a63bfe3f071b60018e87f3e78b61895a0c5c6b 100644 (file)
@@ -57,7 +57,7 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
                for (i = 0; ; ++i) {
                        if (i == 8)
                                goto done;
-                       if ((*hpte & HPTE_V_VALID) == 0)
+                       if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0)
                                break;
                        hpte += 2;
                }
@@ -67,8 +67,8 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
                        goto done;
        }
 
-       hpte[0] = kvmppc_get_gpr(vcpu, 6);
-       hpte[1] = kvmppc_get_gpr(vcpu, 7);
+       hpte[0] = cpu_to_be64(kvmppc_get_gpr(vcpu, 6));
+       hpte[1] = cpu_to_be64(kvmppc_get_gpr(vcpu, 7));
        pteg_addr += i * HPTE_SIZE;
        copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE);
        kvmppc_set_gpr(vcpu, 4, pte_index | i);
@@ -93,6 +93,8 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu)
        pteg = get_pteg_addr(vcpu, pte_index);
        mutex_lock(&vcpu->kvm->arch.hpt_mutex);
        copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+       pte[0] = be64_to_cpu(pte[0]);
+       pte[1] = be64_to_cpu(pte[1]);
 
        ret = H_NOT_FOUND;
        if ((pte[0] & HPTE_V_VALID) == 0 ||
@@ -169,6 +171,8 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu)
 
                pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX);
                copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+               pte[0] = be64_to_cpu(pte[0]);
+               pte[1] = be64_to_cpu(pte[1]);
 
                /* tsl = AVPN */
                flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26;
@@ -207,6 +211,8 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
        pteg = get_pteg_addr(vcpu, pte_index);
        mutex_lock(&vcpu->kvm->arch.hpt_mutex);
        copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+       pte[0] = be64_to_cpu(pte[0]);
+       pte[1] = be64_to_cpu(pte[1]);
 
        ret = H_NOT_FOUND;
        if ((pte[0] & HPTE_V_VALID) == 0 ||
@@ -225,6 +231,8 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
 
        rb = compute_tlbie_rb(v, r, pte_index);
        vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+       pte[0] = cpu_to_be64(pte[0]);
+       pte[1] = cpu_to_be64(pte[1]);
        copy_to_user((void __user *)pteg, pte, sizeof(pte));
        ret = H_SUCCESS;
 
@@ -270,7 +278,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
        case H_PUT_TCE:
                return kvmppc_h_pr_put_tce(vcpu);
        case H_CEDE:
-               vcpu->arch.shared->msr |= MSR_EE;
+               kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
                kvm_vcpu_block(vcpu);
                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                vcpu->stat.halt_wakeup++;
index 7a053157483b64f9dfbc3a6216d486cc50d628fa..edb14ba992b34fa74f3e7cf11c1b9a8a1b01e921 100644 (file)
@@ -205,6 +205,32 @@ int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
        return rc;
 }
 
+static void kvmppc_rtas_swap_endian_in(struct rtas_args *args)
+{
+#ifdef __LITTLE_ENDIAN__ /* body is empty unless __LITTLE_ENDIAN__ is defined */
+       int i;
+
+       args->token = be32_to_cpu(args->token); /* fields converted in place */
+       args->nargs = be32_to_cpu(args->nargs);
+       args->nret = be32_to_cpu(args->nret);
+       for (i = 0; i < args->nargs; i++) /* uses the converted nargs;
+                                          * NOTE(review): nargs is not
+                                          * range-checked in this function */
+               args->args[i] = be32_to_cpu(args->args[i]);
+#endif
+}
+
+static void kvmppc_rtas_swap_endian_out(struct rtas_args *args)
+{
+#ifdef __LITTLE_ENDIAN__ /* body is empty unless __LITTLE_ENDIAN__ is defined */
+       int i;
+
+       for (i = 0; i < args->nret; i++) /* runs before nret itself is converted;
+                                         * NOTE(review): converts slots
+                                         * 0..nret-1 - confirm intended range */
+               args->args[i] = cpu_to_be32(args->args[i]);
+       args->token = cpu_to_be32(args->token);
+       args->nargs = cpu_to_be32(args->nargs);
+       args->nret = cpu_to_be32(args->nret); /* converted last, after the loop */
+#endif
+}
+
 int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
 {
        struct rtas_token_definition *d;
@@ -223,6 +249,8 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
        if (rc)
                goto fail;
 
+       kvmppc_rtas_swap_endian_in(&args);
+
        /*
         * args->rets is a pointer into args->args. Now that we've
         * copied args we need to fix it up to point into our copy,
@@ -247,6 +275,7 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
 
        if (rc == 0) {
                args.rets = orig_rets;
+               kvmppc_rtas_swap_endian_out(&args);
                rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args));
                if (rc)
                        goto fail;
index 1e0cc2adfd40d9aaee3ee93662d6f4ee48d8d659..acee37cde840a3b18a512a09bfc79634a8e26f07 100644 (file)
@@ -90,6 +90,15 @@ kvmppc_handler_trampoline_enter:
        LOAD_GUEST_SEGMENTS
 
 #ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FTR_SECTION
+       /* Save host FSCR */
+       mfspr   r8, SPRN_FSCR
+       std     r8, HSTATE_HOST_FSCR(r13)
+       /* Set FSCR during guest execution */
+       ld      r9, SVCPU_SHADOW_FSCR(r13)
+       mtspr   SPRN_FSCR, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
        /* Some guests may need to have dcbz set to 32 byte length.
         *
         * Usually we ensure that by patching the guest's instructions
@@ -255,6 +264,10 @@ BEGIN_FTR_SECTION
        cmpwi   r12, BOOK3S_INTERRUPT_H_EMUL_ASSIST
        beq-    ld_last_inst
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+BEGIN_FTR_SECTION
+       cmpwi   r12, BOOK3S_INTERRUPT_FAC_UNAVAIL
+       beq-    ld_last_inst
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #endif
 
        b       no_ld_last_inst
@@ -311,6 +324,18 @@ no_ld_last_inst:
 
 no_dcbz32_off:
 
+BEGIN_FTR_SECTION
+       /* Save guest FSCR on a FAC_UNAVAIL interrupt */
+       cmpwi   r12, BOOK3S_INTERRUPT_FAC_UNAVAIL
+       bne+    no_fscr_save
+       mfspr   r7, SPRN_FSCR
+       std     r7, SVCPU_SHADOW_FSCR(r13)
+no_fscr_save:
+       /* Restore host FSCR */
+       ld      r8, HSTATE_HOST_FSCR(r13)
+       mtspr   SPRN_FSCR, r8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
        /*
index 89b7f821f6c41d84acfd997215b907047b0cd0a1..002d51764143b7a8b0275ae0b3590a5d82932385 100644 (file)
@@ -19,6 +19,7 @@
 #include "booke.h"
 #include "e500.h"
 
+#define XOP_DCBTLS  166
 #define XOP_MSGSND  206
 #define XOP_MSGCLR  238
 #define XOP_TLBIVAX 786
@@ -103,6 +104,15 @@ static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return emulated;
 }
 
+static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+       /*
+        * Always fail to lock the cache: L1CSR0_CUL is set in the vcpu's
+        * l1csr0 value and EMULATE_DONE is returned unconditionally.
+        */
+       vcpu_e500->l1csr0 |= L1CSR0_CUL;
+       return EMULATE_DONE;
+}
+
 int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                unsigned int inst, int *advance)
 {
@@ -116,6 +126,10 @@ int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
        case 31:
                switch (get_xop(inst)) {
 
+               case XOP_DCBTLS:
+                       emulated = kvmppc_e500_emul_dcbtls(vcpu);
+                       break;
+
 #ifdef CONFIG_KVM_E500MC
                case XOP_MSGSND:
                        emulated = kvmppc_e500_emul_msgsnd(vcpu, rb);
@@ -222,6 +236,7 @@ int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_va
                break;
        case SPRN_L1CSR1:
                vcpu_e500->l1csr1 = spr_val;
+               vcpu_e500->l1csr1 &= ~(L1CSR1_ICFI | L1CSR1_ICLFR);
                break;
        case SPRN_HID0:
                vcpu_e500->hid0 = spr_val;
index c2b887be2c2922bebf772cedbba0ab14cc74c52f..da86d9ba34761d27af1714ecacbb5badc312696a 100644 (file)
@@ -97,10 +97,10 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 
        switch (sprn) {
        case SPRN_SRR0:
-               vcpu->arch.shared->srr0 = spr_val;
+               kvmppc_set_srr0(vcpu, spr_val);
                break;
        case SPRN_SRR1:
-               vcpu->arch.shared->srr1 = spr_val;
+               kvmppc_set_srr1(vcpu, spr_val);
                break;
 
        /* XXX We need to context-switch the timebase for
@@ -114,16 +114,16 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
                break;
 
        case SPRN_SPRG0:
-               vcpu->arch.shared->sprg0 = spr_val;
+               kvmppc_set_sprg0(vcpu, spr_val);
                break;
        case SPRN_SPRG1:
-               vcpu->arch.shared->sprg1 = spr_val;
+               kvmppc_set_sprg1(vcpu, spr_val);
                break;
        case SPRN_SPRG2:
-               vcpu->arch.shared->sprg2 = spr_val;
+               kvmppc_set_sprg2(vcpu, spr_val);
                break;
        case SPRN_SPRG3:
-               vcpu->arch.shared->sprg3 = spr_val;
+               kvmppc_set_sprg3(vcpu, spr_val);
                break;
 
        /* PIR can legally be written, but we ignore it */
@@ -150,10 +150,10 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 
        switch (sprn) {
        case SPRN_SRR0:
-               spr_val = vcpu->arch.shared->srr0;
+               spr_val = kvmppc_get_srr0(vcpu);
                break;
        case SPRN_SRR1:
-               spr_val = vcpu->arch.shared->srr1;
+               spr_val = kvmppc_get_srr1(vcpu);
                break;
        case SPRN_PVR:
                spr_val = vcpu->arch.pvr;
@@ -173,16 +173,16 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
                break;
 
        case SPRN_SPRG0:
-               spr_val = vcpu->arch.shared->sprg0;
+               spr_val = kvmppc_get_sprg0(vcpu);
                break;
        case SPRN_SPRG1:
-               spr_val = vcpu->arch.shared->sprg1;
+               spr_val = kvmppc_get_sprg1(vcpu);
                break;
        case SPRN_SPRG2:
-               spr_val = vcpu->arch.shared->sprg2;
+               spr_val = kvmppc_get_sprg2(vcpu);
                break;
        case SPRN_SPRG3:
-               spr_val = vcpu->arch.shared->sprg3;
+               spr_val = kvmppc_get_sprg3(vcpu);
                break;
        /* Note: SPRG4-7 are user-readable, so we don't get
         * a trap. */
index efbd9962a209c999ff9ab8ff4959d9e4522d296b..b68d0dc9479a820dd469a36c3c21bc5f6b7e8188 100644 (file)
@@ -126,6 +126,8 @@ static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
                                      u32 val, int idx);
 static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
                                     u32 *ptr, int idx);
+static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
+                                   uint32_t val);
 
 enum irq_type {
        IRQ_TYPE_NORMAL = 0,
@@ -528,7 +530,6 @@ static void openpic_reset(struct openpic *opp)
        /* Initialise IRQ sources */
        for (i = 0; i < opp->max_irq; i++) {
                opp->src[i].ivpr = opp->ivpr_reset;
-               opp->src[i].idr = opp->idr_reset;
 
                switch (opp->src[i].type) {
                case IRQ_TYPE_NORMAL:
@@ -543,6 +544,8 @@ static void openpic_reset(struct openpic *opp)
                case IRQ_TYPE_FSLSPECIAL:
                        break;
                }
+
+               write_IRQreg_idr(opp, i, opp->idr_reset);
        }
        /* Initialise IRQ destinations */
        for (i = 0; i < MAX_CPU; i++) {
index 3cf541a53e2aef14a00e467e0c4daec16c255e7a..bab20f4104430489674b5c05e4a1223036e2eac3 100644 (file)
@@ -125,6 +125,27 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter);
 
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+static void kvmppc_swab_shared(struct kvm_vcpu *vcpu)
+{
+       struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared;
+       int i;
+
+       shared->sprg0 = swab64(shared->sprg0); /* byte-swapped in place */
+       shared->sprg1 = swab64(shared->sprg1);
+       shared->sprg2 = swab64(shared->sprg2);
+       shared->sprg3 = swab64(shared->sprg3);
+       shared->srr0 = swab64(shared->srr0);
+       shared->srr1 = swab64(shared->srr1);
+       shared->dar = swab64(shared->dar);
+       shared->msr = swab64(shared->msr);
+       shared->dsisr = swab32(shared->dsisr); /* swab32 from here on */
+       shared->int_pending = swab32(shared->int_pending);
+       for (i = 0; i < ARRAY_SIZE(shared->sr); i++) /* every element of sr[] */
+               shared->sr[i] = swab32(shared->sr[i]);
+}
+#endif
+
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 {
        int nr = kvmppc_get_gpr(vcpu, 11);
@@ -135,7 +156,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
        unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);
        unsigned long r2 = 0;
 
-       if (!(vcpu->arch.shared->msr & MSR_SF)) {
+       if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
                /* 32 bit mode */
                param1 &= 0xffffffff;
                param2 &= 0xffffffff;
@@ -146,8 +167,28 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
        switch (nr) {
        case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE):
        {
-               vcpu->arch.magic_page_pa = param1;
-               vcpu->arch.magic_page_ea = param2;
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+               /* Book3S can be little endian, find it out here */
+               int shared_big_endian = true;
+               if (vcpu->arch.intr_msr & MSR_LE)
+                       shared_big_endian = false;
+               if (shared_big_endian != vcpu->arch.shared_big_endian)
+                       kvmppc_swab_shared(vcpu);
+               vcpu->arch.shared_big_endian = shared_big_endian;
+#endif
+
+               if (!(param2 & MAGIC_PAGE_FLAG_NOT_MAPPED_NX)) {
+                       /*
+                        * Older versions of the Linux magic page code had
+                        * a bug where they would map their trampoline code
+                        * NX. If that's the case, remove !PR NX capability.
+                        */
+                       vcpu->arch.disable_kernel_nx = true;
+                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+               }
+
+               vcpu->arch.magic_page_pa = param1 & ~0xfffULL;
+               vcpu->arch.magic_page_ea = param2 & ~0xfffULL;
 
                r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
 
@@ -375,6 +416,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_SPAPR_TCE:
        case KVM_CAP_PPC_ALLOC_HTAB:
        case KVM_CAP_PPC_RTAS:
+       case KVM_CAP_PPC_FIXUP_HCALL:
 #ifdef CONFIG_KVM_XICS
        case KVM_CAP_IRQ_XICS:
 #endif
@@ -1015,10 +1057,10 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
        u32 inst_nop = 0x60000000;
 #ifdef CONFIG_KVM_BOOKE_HV
        u32 inst_sc1 = 0x44000022;
-       pvinfo->hcall[0] = inst_sc1;
-       pvinfo->hcall[1] = inst_nop;
-       pvinfo->hcall[2] = inst_nop;
-       pvinfo->hcall[3] = inst_nop;
+       pvinfo->hcall[0] = cpu_to_be32(inst_sc1);
+       pvinfo->hcall[1] = cpu_to_be32(inst_nop);
+       pvinfo->hcall[2] = cpu_to_be32(inst_nop);
+       pvinfo->hcall[3] = cpu_to_be32(inst_nop);
 #else
        u32 inst_lis = 0x3c000000;
        u32 inst_ori = 0x60000000;
@@ -1034,10 +1076,10 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
         *    sc
         *    nop
         */
-       pvinfo->hcall[0] = inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask);
-       pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask);
-       pvinfo->hcall[2] = inst_sc;
-       pvinfo->hcall[3] = inst_nop;
+       pvinfo->hcall[0] = cpu_to_be32(inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask));
+       pvinfo->hcall[1] = cpu_to_be32(inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask));
+       pvinfo->hcall[2] = cpu_to_be32(inst_sc);
+       pvinfo->hcall[3] = cpu_to_be32(inst_nop);
 #endif
 
        pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE;
index 8b22e4748344c498d95415610c30b55a6af458fa..e1357cd8dc1f48a50eb69a83efd209ce586af002 100644 (file)
@@ -255,7 +255,7 @@ TRACE_EVENT(kvm_exit,
                __entry->exit_nr        = exit_nr;
                __entry->pc             = kvmppc_get_pc(vcpu);
                __entry->dar            = kvmppc_get_fault_dar(vcpu);
-               __entry->msr            = vcpu->arch.shared->msr;
+               __entry->msr            = kvmppc_get_msr(vcpu);
                __entry->srr1           = vcpu->arch.shadow_srr1;
                __entry->last_inst      = vcpu->arch.last_inst;
        ),
index 9d1d33cd2be528598eb9a3e0c436a0c041094656..964a5f61488a1a7e726e2e68e10cc8446e0fcb62 100644 (file)
@@ -97,7 +97,7 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
 static void __slb_flush_and_rebolt(void)
 {
        /* If you change this make sure you change SLB_NUM_BOLTED
-        * appropriately too. */
+        * and PR KVM appropriately too. */
        unsigned long linear_llp, vmalloc_llp, lflags, vflags;
        unsigned long ksp_esid_data, ksp_vsid_data;
 
index 4e63f1a13600a4e126d470138e074d2e4e98682d..31ab9f346d7e37f1c7ce8048db783ba4a15d128c 100644 (file)
@@ -57,6 +57,20 @@ static inline void __ctl_clear_bit(unsigned int cr, unsigned int bit)
 void smp_ctl_set_bit(int cr, int bit);
 void smp_ctl_clear_bit(int cr, int bit);
 
+union ctlreg0 {
+       unsigned long val; /* raw value overlaid by the bit-fields below */
+       struct {
+#ifdef CONFIG_64BIT
+               unsigned long      : 32; /* extra unnamed bits on CONFIG_64BIT */
+#endif
+               unsigned long      : 3;
+               unsigned long lap  : 1; /* Low-address-protection control */
+               unsigned long      : 4;
+               unsigned long edat : 1; /* Enhanced-DAT-enablement control */
+               unsigned long      : 23;
+       };
+};
+
 #ifdef CONFIG_SMP
 # define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit)
 # define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit)
index 154b60089be996de483f07844f9229c728918892..4181d7baabba99d810cbb53db383d65c12116409 100644 (file)
 #define KVM_NR_IRQCHIPS 1
 #define KVM_IRQCHIP_NUM_PINS 4096
 
+#define SIGP_CTRL_C    0x00800000
+
 struct sca_entry {
-       atomic_t scn;
+       atomic_t ctrl;
        __u32   reserved;
        __u64   sda;
        __u64   reserved2[2];
 } __attribute__((packed));
 
+union ipte_control {
+       unsigned long val; /* raw value overlaid by the bit-fields below */
+       struct {
+               unsigned long k  : 1;
+               unsigned long kh : 31;
+               unsigned long kg : 32; /* k + kh + kg widths total 64 bits */
+       };
+};
 
 struct sca_block {
-       __u64   ipte_control;
+       union ipte_control ipte_control;
        __u64   reserved[5];
        __u64   mcn;
        __u64   reserved2;
@@ -64,6 +74,7 @@ struct sca_block {
 #define CPUSTAT_ZARCH      0x00000800
 #define CPUSTAT_MCDS       0x00000100
 #define CPUSTAT_SM         0x00000080
+#define CPUSTAT_IBS        0x00000040
 #define CPUSTAT_G          0x00000008
 #define CPUSTAT_GED        0x00000004
 #define CPUSTAT_J          0x00000002
@@ -71,7 +82,9 @@ struct sca_block {
 
 struct kvm_s390_sie_block {
        atomic_t cpuflags;              /* 0x0000 */
-       __u32   prefix;                 /* 0x0004 */
+       __u32 : 1;                      /* 0x0004 */
+       __u32 prefix : 18;
+       __u32 : 13;
        __u8    reserved08[4];          /* 0x0008 */
 #define PROG_IN_SIE (1<<0)
        __u32   prog0c;                 /* 0x000c */
@@ -85,12 +98,27 @@ struct kvm_s390_sie_block {
        __u8    reserved40[4];          /* 0x0040 */
 #define LCTL_CR0       0x8000
 #define LCTL_CR6       0x0200
+#define LCTL_CR9       0x0040
+#define LCTL_CR10      0x0020
+#define LCTL_CR11      0x0010
 #define LCTL_CR14      0x0002
        __u16   lctl;                   /* 0x0044 */
        __s16   icpua;                  /* 0x0046 */
-#define ICTL_LPSW 0x00400000
+#define ICTL_PINT      0x20000000
+#define ICTL_LPSW      0x00400000
+#define ICTL_STCTL     0x00040000
+#define ICTL_ISKE      0x00004000
+#define ICTL_SSKE      0x00002000
+#define ICTL_RRBE      0x00001000
+#define ICTL_TPROT     0x00000200
        __u32   ictl;                   /* 0x0048 */
        __u32   eca;                    /* 0x004c */
+#define ICPT_INST      0x04
+#define ICPT_PROGI     0x08
+#define ICPT_INSTPROGI 0x0C
+#define ICPT_OPEREXC   0x2C
+#define ICPT_PARTEXEC  0x38
+#define ICPT_IOINST    0x40
        __u8    icptcode;               /* 0x0050 */
        __u8    reserved51;             /* 0x0051 */
        __u16   ihcpu;                  /* 0x0052 */
@@ -109,9 +137,24 @@ struct kvm_s390_sie_block {
        psw_t   gpsw;                   /* 0x0090 */
        __u64   gg14;                   /* 0x00a0 */
        __u64   gg15;                   /* 0x00a8 */
-       __u8    reservedb0[30];         /* 0x00b0 */
-       __u16   iprcc;                  /* 0x00ce */
-       __u8    reservedd0[48];         /* 0x00d0 */
+       __u8    reservedb0[20];         /* 0x00b0 */
+       __u16   extcpuaddr;             /* 0x00c4 */
+       __u16   eic;                    /* 0x00c6 */
+       __u32   reservedc8;             /* 0x00c8 */
+       __u16   pgmilc;                 /* 0x00cc */
+       __u16   iprcc;                  /* 0x00ce */
+       __u32   dxc;                    /* 0x00d0 */
+       __u16   mcn;                    /* 0x00d4 */
+       __u8    perc;                   /* 0x00d6 */
+       __u8    peratmid;               /* 0x00d7 */
+       __u64   peraddr;                /* 0x00d8 */
+       __u8    eai;                    /* 0x00e0 */
+       __u8    peraid;                 /* 0x00e1 */
+       __u8    oai;                    /* 0x00e2 */
+       __u8    armid;                  /* 0x00e3 */
+       __u8    reservede4[4];          /* 0x00e4 */
+       __u64   tecmc;                  /* 0x00e8 */
+       __u8    reservedf0[16];         /* 0x00f0 */
        __u64   gcr[16];                /* 0x0100 */
        __u64   gbea;                   /* 0x0180 */
        __u8    reserved188[24];        /* 0x0188 */
@@ -146,6 +189,8 @@ struct kvm_vcpu_stat {
        u32 exit_instruction;
        u32 instruction_lctl;
        u32 instruction_lctlg;
+       u32 instruction_stctl;
+       u32 instruction_stctg;
        u32 exit_program_interruption;
        u32 exit_instr_and_program;
        u32 deliver_external_call;
@@ -164,6 +209,7 @@ struct kvm_vcpu_stat {
        u32 instruction_stpx;
        u32 instruction_stap;
        u32 instruction_storage_key;
+       u32 instruction_ipte_interlock;
        u32 instruction_stsch;
        u32 instruction_chsc;
        u32 instruction_stsi;
@@ -183,13 +229,58 @@ struct kvm_vcpu_stat {
        u32 diagnose_9c;
 };
 
-#define PGM_OPERATION            0x01
-#define PGM_PRIVILEGED_OP       0x02
-#define PGM_EXECUTE              0x03
-#define PGM_PROTECTION           0x04
-#define PGM_ADDRESSING           0x05
-#define PGM_SPECIFICATION        0x06
-#define PGM_DATA                 0x07
+#define PGM_OPERATION                  0x01
+#define PGM_PRIVILEGED_OP              0x02
+#define PGM_EXECUTE                    0x03
+#define PGM_PROTECTION                 0x04
+#define PGM_ADDRESSING                 0x05
+#define PGM_SPECIFICATION              0x06
+#define PGM_DATA                       0x07
+#define PGM_FIXED_POINT_OVERFLOW       0x08
+#define PGM_FIXED_POINT_DIVIDE         0x09
+#define PGM_DECIMAL_OVERFLOW           0x0a
+#define PGM_DECIMAL_DIVIDE             0x0b
+#define PGM_HFP_EXPONENT_OVERFLOW      0x0c
+#define PGM_HFP_EXPONENT_UNDERFLOW     0x0d
+#define PGM_HFP_SIGNIFICANCE           0x0e
+#define PGM_HFP_DIVIDE                 0x0f
+#define PGM_SEGMENT_TRANSLATION                0x10
+#define PGM_PAGE_TRANSLATION           0x11
+#define PGM_TRANSLATION_SPEC           0x12
+#define PGM_SPECIAL_OPERATION          0x13
+#define PGM_OPERAND                    0x15
+#define PGM_TRACE_TABEL                        0x16
+#define PGM_SPACE_SWITCH               0x1c
+#define PGM_HFP_SQUARE_ROOT            0x1d
+#define PGM_PC_TRANSLATION_SPEC                0x1f
+#define PGM_AFX_TRANSLATION            0x20
+#define PGM_ASX_TRANSLATION            0x21
+#define PGM_LX_TRANSLATION             0x22
+#define PGM_EX_TRANSLATION             0x23
+#define PGM_PRIMARY_AUTHORITY          0x24
+#define PGM_SECONDARY_AUTHORITY                0x25
+#define PGM_LFX_TRANSLATION            0x26
+#define PGM_LSX_TRANSLATION            0x27
+#define PGM_ALET_SPECIFICATION         0x28
+#define PGM_ALEN_TRANSLATION           0x29
+#define PGM_ALE_SEQUENCE               0x2a
+#define PGM_ASTE_VALIDITY              0x2b
+#define PGM_ASTE_SEQUENCE              0x2c
+#define PGM_EXTENDED_AUTHORITY         0x2d
+#define PGM_LSTE_SEQUENCE              0x2e
+#define PGM_ASTE_INSTANCE              0x2f
+#define PGM_STACK_FULL                 0x30
+#define PGM_STACK_EMPTY                        0x31
+#define PGM_STACK_SPECIFICATION                0x32
+#define PGM_STACK_TYPE                 0x33
+#define PGM_STACK_OPERATION            0x34
+#define PGM_ASCE_TYPE                  0x38
+#define PGM_REGION_FIRST_TRANS         0x39
+#define PGM_REGION_SECOND_TRANS                0x3a
+#define PGM_REGION_THIRD_TRANS         0x3b
+#define PGM_MONITOR                    0x40
+#define PGM_PER                                0x80
+#define PGM_CRYPTO_OPERATION           0x119
 
 struct kvm_s390_interrupt_info {
        struct list_head list;
@@ -229,6 +320,45 @@ struct kvm_s390_float_interrupt {
        unsigned int irq_count;
 };
 
+struct kvm_hw_wp_info_arch {
+       unsigned long addr;
+       unsigned long phys_addr;
+       int len;
+       char *old_data;
+};
+
+struct kvm_hw_bp_info_arch {
+       unsigned long addr;
+       int len;
+};
+
+/*
+ * Only the upper 16 bits of kvm_guest_debug->control are arch specific.
+ * Further KVM_GUESTDBG flags which can be used from userspace can be found in
+ * arch/s390/include/uapi/asm/kvm.h
+ */
+#define KVM_GUESTDBG_EXIT_PENDING 0x10000000
+
+#define guestdbg_enabled(vcpu) \
+               (vcpu->guest_debug & KVM_GUESTDBG_ENABLE)
+#define guestdbg_sstep_enabled(vcpu) \
+               (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+#define guestdbg_hw_bp_enabled(vcpu) \
+               (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+#define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \
+               (vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING))
+
+struct kvm_guestdbg_info_arch {
+       unsigned long cr0;
+       unsigned long cr9;
+       unsigned long cr10;
+       unsigned long cr11;
+       struct kvm_hw_bp_info_arch *hw_bp_info;
+       struct kvm_hw_wp_info_arch *hw_wp_info;
+       int nr_hw_bp;
+       int nr_hw_wp;
+       unsigned long last_bp;
+};
 
 struct kvm_vcpu_arch {
        struct kvm_s390_sie_block *sie_block;
@@ -238,11 +368,13 @@ struct kvm_vcpu_arch {
        struct kvm_s390_local_interrupt local_int;
        struct hrtimer    ckc_timer;
        struct tasklet_struct tasklet;
+       struct kvm_s390_pgm_info pgm;
        union  {
                struct cpuid    cpu_id;
                u64             stidp_data;
        };
        struct gmap *gmap;
+       struct kvm_guestdbg_info_arch guestdbg;
 #define KVM_S390_PFAULT_TOKEN_INVALID  (-1UL)
        unsigned long pfault_token;
        unsigned long pfault_select;
@@ -285,7 +417,10 @@ struct kvm_arch{
        struct gmap *gmap;
        int css_support;
        int use_irqchip;
+       int use_cmma;
        struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
+       wait_queue_head_t ipte_wq;
+       spinlock_t start_stop_lock;
 };
 
 #define KVM_HVA_ERR_BAD                (-1UL)
index 2070cad80e9e4b10eec5218d7e5105801a94d499..4349197ab9df9019ea5a23938f4b473e8d95442f 100644 (file)
@@ -56,13 +56,14 @@ struct _lowcore {
        __u16   pgm_code;                       /* 0x008e */
        __u32   trans_exc_code;                 /* 0x0090 */
        __u16   mon_class_num;                  /* 0x0094 */
-       __u16   per_perc_atmid;                 /* 0x0096 */
+       __u8    per_code;                       /* 0x0096 */
+       __u8    per_atmid;                      /* 0x0097 */
        __u32   per_address;                    /* 0x0098 */
        __u32   monitor_code;                   /* 0x009c */
        __u8    exc_access_id;                  /* 0x00a0 */
        __u8    per_access_id;                  /* 0x00a1 */
        __u8    op_access_id;                   /* 0x00a2 */
-       __u8    ar_access_id;                   /* 0x00a3 */
+       __u8    ar_mode_id;                     /* 0x00a3 */
        __u8    pad_0x00a4[0x00b8-0x00a4];      /* 0x00a4 */
        __u16   subchannel_id;                  /* 0x00b8 */
        __u16   subchannel_nr;                  /* 0x00ba */
@@ -195,12 +196,13 @@ struct _lowcore {
        __u16   pgm_code;                       /* 0x008e */
        __u32   data_exc_code;                  /* 0x0090 */
        __u16   mon_class_num;                  /* 0x0094 */
-       __u16   per_perc_atmid;                 /* 0x0096 */
+       __u8    per_code;                       /* 0x0096 */
+       __u8    per_atmid;                      /* 0x0097 */
        __u64   per_address;                    /* 0x0098 */
        __u8    exc_access_id;                  /* 0x00a0 */
        __u8    per_access_id;                  /* 0x00a1 */
        __u8    op_access_id;                   /* 0x00a2 */
-       __u8    ar_access_id;                   /* 0x00a3 */
+       __u8    ar_mode_id;                     /* 0x00a3 */
        __u8    pad_0x00a4[0x00a8-0x00a4];      /* 0x00a4 */
        __u64   trans_exc_code;                 /* 0x00a8 */
        __u64   monitor_code;                   /* 0x00b0 */
index f77695a82f647dbad92539be85dbac0babc60e2d..a5e656260a70183dd4f3768c3be082fed5988603 100644 (file)
@@ -16,6 +16,8 @@ typedef struct {
        unsigned long vdso_base;
        /* The mmu context has extended page tables. */
        unsigned int has_pgste:1;
+       /* The mmu context uses storage keys. */
+       unsigned int use_skey:1;
 } mm_context_t;
 
 #define INIT_MM_CONTEXT(name)                                                \
index 056d7eff2a1655b72bb140b80c092926c8d9f181..c28f32a45af5d9003de5cb36f4bc0d5a35722c57 100644 (file)
@@ -23,6 +23,7 @@ static inline int init_new_context(struct task_struct *tsk,
        mm->context.asce_bits |= _ASCE_TYPE_REGION3;
 #endif
        mm->context.has_pgste = 0;
+       mm->context.use_skey = 0;
        mm->context.asce_limit = STACK_TOP_MAX;
        crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
        return 0;
index 884017cbfa9fade412372f7f781e503b3f39513b..9e18a61d3df39c0c96f81032ae5f67ad427fa32c 100644 (file)
@@ -22,7 +22,8 @@ unsigned long *page_table_alloc(struct mm_struct *, unsigned long);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *);
 
-void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long);
+void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long,
+                           bool init_skey);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
                          unsigned long key, bool nq);
 
index 12f75313e086d4695ee768bde41beac4d3418de9..fcba5e03839f560d0348a4df0d7493ea5151e90b 100644 (file)
@@ -309,7 +309,8 @@ extern unsigned long MODULES_END;
 #define PGSTE_HC_BIT   0x00200000UL
 #define PGSTE_GR_BIT   0x00040000UL
 #define PGSTE_GC_BIT   0x00020000UL
-#define PGSTE_IN_BIT   0x00008000UL    /* IPTE notify bit */
+#define PGSTE_UC_BIT   0x00008000UL    /* user dirty (migration) */
+#define PGSTE_IN_BIT   0x00004000UL    /* IPTE notify bit */
 
 #else /* CONFIG_64BIT */
 
@@ -391,7 +392,8 @@ extern unsigned long MODULES_END;
 #define PGSTE_HC_BIT   0x0020000000000000UL
 #define PGSTE_GR_BIT   0x0004000000000000UL
 #define PGSTE_GC_BIT   0x0002000000000000UL
-#define PGSTE_IN_BIT   0x0000800000000000UL    /* IPTE notify bit */
+#define PGSTE_UC_BIT   0x0000800000000000UL    /* user dirty (migration) */
+#define PGSTE_IN_BIT   0x0000400000000000UL    /* IPTE notify bit */
 
 #endif /* CONFIG_64BIT */
 
@@ -466,6 +468,16 @@ static inline int mm_has_pgste(struct mm_struct *mm)
 #endif
        return 0;
 }
+
+static inline int mm_use_skey(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+       if (mm->context.use_skey)
+               return 1;
+#endif
+       return 0;
+}
+
 /*
  * pgd/pmd/pte query functions
  */
@@ -699,26 +711,17 @@ static inline void pgste_set(pte_t *ptep, pgste_t pgste)
 #endif
 }
 
-static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
+static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste,
+                                      struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
        unsigned long address, bits, skey;
 
-       if (pte_val(*ptep) & _PAGE_INVALID)
+       if (!mm_use_skey(mm) || pte_val(*ptep) & _PAGE_INVALID)
                return pgste;
        address = pte_val(*ptep) & PAGE_MASK;
        skey = (unsigned long) page_get_storage_key(address);
        bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
-       if (!(pgste_val(pgste) & PGSTE_HC_BIT) && (bits & _PAGE_CHANGED)) {
-               /* Transfer dirty + referenced bit to host bits in pgste */
-               pgste_val(pgste) |= bits << 52;
-               page_set_storage_key(address, skey ^ bits, 0);
-       } else if (!(pgste_val(pgste) & PGSTE_HR_BIT) &&
-                  (bits & _PAGE_REFERENCED)) {
-               /* Transfer referenced bit to host bit in pgste */
-               pgste_val(pgste) |= PGSTE_HR_BIT;
-               page_reset_referenced(address);
-       }
        /* Transfer page changed & referenced bit to guest bits in pgste */
        pgste_val(pgste) |= bits << 48;         /* GR bit & GC bit */
        /* Copy page access key and fetch protection bit to pgste */
@@ -729,25 +732,14 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
 
 }
 
-static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste)
-{
-#ifdef CONFIG_PGSTE
-       if (pte_val(*ptep) & _PAGE_INVALID)
-               return pgste;
-       /* Get referenced bit from storage key */
-       if (page_reset_referenced(pte_val(*ptep) & PAGE_MASK))
-               pgste_val(pgste) |= PGSTE_HR_BIT | PGSTE_GR_BIT;
-#endif
-       return pgste;
-}
-
-static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry)
+static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
+                                struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
        unsigned long address;
        unsigned long nkey;
 
-       if (pte_val(entry) & _PAGE_INVALID)
+       if (!mm_use_skey(mm) || pte_val(entry) & _PAGE_INVALID)
                return;
        VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
        address = pte_val(entry) & PAGE_MASK;
@@ -757,23 +749,30 @@ static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry)
         * key C/R to 0.
         */
        nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+       nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
        page_set_storage_key(address, nkey, 0);
 #endif
 }
 
-static inline void pgste_set_pte(pte_t *ptep, pte_t entry)
+static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
 {
-       if (!MACHINE_HAS_ESOP &&
-           (pte_val(entry) & _PAGE_PRESENT) &&
-           (pte_val(entry) & _PAGE_WRITE)) {
-               /*
-                * Without enhanced suppression-on-protection force
-                * the dirty bit on for all writable ptes.
-                */
-               pte_val(entry) |= _PAGE_DIRTY;
-               pte_val(entry) &= ~_PAGE_PROTECT;
+       if ((pte_val(entry) & _PAGE_PRESENT) &&
+           (pte_val(entry) & _PAGE_WRITE) &&
+           !(pte_val(entry) & _PAGE_INVALID)) {
+               if (!MACHINE_HAS_ESOP) {
+                       /*
+                        * Without enhanced suppression-on-protection force
+                        * the dirty bit on for all writable ptes.
+                        */
+                       pte_val(entry) |= _PAGE_DIRTY;
+                       pte_val(entry) &= ~_PAGE_PROTECT;
+               }
+               if (!(pte_val(entry) & _PAGE_PROTECT))
+                       /* This pte allows write access, set user-dirty */
+                       pgste_val(pgste) |= PGSTE_UC_BIT;
        }
        *ptep = entry;
+       return pgste;
 }
 
 /**
@@ -839,6 +838,8 @@ unsigned long __gmap_fault(unsigned long address, struct gmap *);
 unsigned long gmap_fault(unsigned long address, struct gmap *);
 void gmap_discard(unsigned long from, unsigned long to, struct gmap *);
 void __gmap_zap(unsigned long address, struct gmap *);
+bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *);
+
 
 void gmap_register_ipte_notifier(struct gmap_notifier *);
 void gmap_unregister_ipte_notifier(struct gmap_notifier *);
@@ -870,8 +871,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
        if (mm_has_pgste(mm)) {
                pgste = pgste_get_lock(ptep);
                pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
-               pgste_set_key(ptep, pgste, entry);
-               pgste_set_pte(ptep, entry);
+               pgste_set_key(ptep, pgste, entry, mm);
+               pgste = pgste_set_pte(ptep, pgste, entry);
                pgste_set_unlock(ptep, pgste);
        } else {
                if (!(pte_val(entry) & _PAGE_INVALID) && MACHINE_HAS_EDAT1)
@@ -1017,45 +1018,6 @@ static inline pte_t pte_mkhuge(pte_t pte)
 }
 #endif
 
-/*
- * Get (and clear) the user dirty bit for a pte.
- */
-static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm,
-                                                pte_t *ptep)
-{
-       pgste_t pgste;
-       int dirty = 0;
-
-       if (mm_has_pgste(mm)) {
-               pgste = pgste_get_lock(ptep);
-               pgste = pgste_update_all(ptep, pgste);
-               dirty = !!(pgste_val(pgste) & PGSTE_HC_BIT);
-               pgste_val(pgste) &= ~PGSTE_HC_BIT;
-               pgste_set_unlock(ptep, pgste);
-               return dirty;
-       }
-       return dirty;
-}
-
-/*
- * Get (and clear) the user referenced bit for a pte.
- */
-static inline int ptep_test_and_clear_user_young(struct mm_struct *mm,
-                                                pte_t *ptep)
-{
-       pgste_t pgste;
-       int young = 0;
-
-       if (mm_has_pgste(mm)) {
-               pgste = pgste_get_lock(ptep);
-               pgste = pgste_update_young(ptep, pgste);
-               young = !!(pgste_val(pgste) & PGSTE_HR_BIT);
-               pgste_val(pgste) &= ~PGSTE_HR_BIT;
-               pgste_set_unlock(ptep, pgste);
-       }
-       return young;
-}
-
 static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
 {
        unsigned long pto = (unsigned long) ptep;
@@ -1118,6 +1080,36 @@ static inline void ptep_flush_lazy(struct mm_struct *mm,
        atomic_sub(0x10000, &mm->context.attach_count);
 }
 
+/*
+ * Get (and clear) the user dirty bit for a pte.
+ */
+static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm,
+                                                unsigned long addr,
+                                                pte_t *ptep)
+{
+       pgste_t pgste;
+       pte_t pte;
+       int dirty;
+
+       if (!mm_has_pgste(mm))
+               return 0;
+       pgste = pgste_get_lock(ptep);
+       dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
+       pgste_val(pgste) &= ~PGSTE_UC_BIT;
+       pte = *ptep;
+       if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
+               pgste = pgste_ipte_notify(mm, ptep, pgste);
+               __ptep_ipte(addr, ptep);
+               if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
+                       pte_val(pte) |= _PAGE_PROTECT;
+               else
+                       pte_val(pte) |= _PAGE_INVALID;
+               *ptep = pte;
+       }
+       pgste_set_unlock(ptep, pgste);
+       return dirty;
+}
+
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long addr, pte_t *ptep)
@@ -1137,7 +1129,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
        pte = pte_mkold(pte);
 
        if (mm_has_pgste(vma->vm_mm)) {
-               pgste_set_pte(ptep, pte);
+               pgste = pgste_set_pte(ptep, pgste, pte);
                pgste_set_unlock(ptep, pgste);
        } else
                *ptep = pte;
@@ -1182,7 +1174,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
        pte_val(*ptep) = _PAGE_INVALID;
 
        if (mm_has_pgste(mm)) {
-               pgste = pgste_update_all(&pte, pgste);
+               pgste = pgste_update_all(&pte, pgste, mm);
                pgste_set_unlock(ptep, pgste);
        }
        return pte;
@@ -1205,7 +1197,7 @@ static inline pte_t ptep_modify_prot_start(struct mm_struct *mm,
        ptep_flush_lazy(mm, address, ptep);
 
        if (mm_has_pgste(mm)) {
-               pgste = pgste_update_all(&pte, pgste);
+               pgste = pgste_update_all(&pte, pgste, mm);
                pgste_set(ptep, pgste);
        }
        return pte;
@@ -1219,8 +1211,8 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 
        if (mm_has_pgste(mm)) {
                pgste = pgste_get(ptep);
-               pgste_set_key(ptep, pgste, pte);
-               pgste_set_pte(ptep, pte);
+               pgste_set_key(ptep, pgste, pte, mm);
+               pgste = pgste_set_pte(ptep, pgste, pte);
                pgste_set_unlock(ptep, pgste);
        } else
                *ptep = pte;
@@ -1246,7 +1238,7 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
                if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
                    _PGSTE_GPS_USAGE_UNUSED)
                        pte_val(pte) |= _PAGE_UNUSED;
-               pgste = pgste_update_all(&pte, pgste);
+               pgste = pgste_update_all(&pte, pgste, vma->vm_mm);
                pgste_set_unlock(ptep, pgste);
        }
        return pte;
@@ -1278,7 +1270,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
        pte_val(*ptep) = _PAGE_INVALID;
 
        if (!full && mm_has_pgste(mm)) {
-               pgste = pgste_update_all(&pte, pgste);
+               pgste = pgste_update_all(&pte, pgste, mm);
                pgste_set_unlock(ptep, pgste);
        }
        return pte;
@@ -1301,7 +1293,7 @@ static inline pte_t ptep_set_wrprotect(struct mm_struct *mm,
                pte = pte_wrprotect(pte);
 
                if (mm_has_pgste(mm)) {
-                       pgste_set_pte(ptep, pte);
+                       pgste = pgste_set_pte(ptep, pgste, pte);
                        pgste_set_unlock(ptep, pgste);
                } else
                        *ptep = pte;
@@ -1326,7 +1318,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
        ptep_flush_direct(vma->vm_mm, address, ptep);
 
        if (mm_has_pgste(vma->vm_mm)) {
-               pgste_set_pte(ptep, entry);
+               pgste = pgste_set_pte(ptep, pgste, entry);
                pgste_set_unlock(ptep, pgste);
        } else
                *ptep = entry;
@@ -1734,6 +1726,7 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
+extern void s390_enable_skey(void);
 
 /*
  * No page table caches to initialise
index 1b5300cd6d22ce86e0c88e589e37e39a8ff19737..55d69dd7473c82db6f61855856fbd60e8c029272 100644 (file)
                         PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \
                         PSW_MASK_PSTATE | PSW_ASC_PRIMARY)
 
+struct psw_bits {
+       unsigned long long      : 1;
+       unsigned long long r    : 1; /* PER-Mask */
+       unsigned long long      : 3;
+       unsigned long long t    : 1; /* DAT Mode */
+       unsigned long long i    : 1; /* Input/Output Mask */
+       unsigned long long e    : 1; /* External Mask */
+       unsigned long long key  : 4; /* PSW Key */
+       unsigned long long      : 1;
+       unsigned long long m    : 1; /* Machine-Check Mask */
+       unsigned long long w    : 1; /* Wait State */
+       unsigned long long p    : 1; /* Problem State */
+       unsigned long long as   : 2; /* Address Space Control */
+       unsigned long long cc   : 2; /* Condition Code */
+       unsigned long long pm   : 4; /* Program Mask */
+       unsigned long long ri   : 1; /* Runtime Instrumentation */
+       unsigned long long      : 6;
+       unsigned long long eaba : 2; /* Addressing Mode */
+#ifdef CONFIG_64BIT
+       unsigned long long      : 31;
+       unsigned long long ia   : 64;/* Instruction Address */
+#else
+       unsigned long long ia   : 31;/* Instruction Address */
+#endif
+};
+
+enum {
+       PSW_AMODE_24BIT = 0,
+       PSW_AMODE_31BIT = 1,
+       PSW_AMODE_64BIT = 3
+};
+
+enum {
+       PSW_AS_PRIMARY   = 0,
+       PSW_AS_ACCREG    = 1,
+       PSW_AS_SECONDARY = 2,
+       PSW_AS_HOME      = 3
+};
+
+#define psw_bits(__psw) (*({                   \
+       typecheck(psw_t, __psw);                \
+       &(*(struct psw_bits *)(&(__psw)));      \
+}))
+
 /*
  * The pt_regs struct defines the way the registers are stored on
  * the stack during a system call.
index 2f5e9932b4defddda4587c6593492712f2fb85c1..1aba89b53cb9e1e8614524c8786a7dced2960278 100644 (file)
@@ -28,7 +28,11 @@ struct sclp_ipl_info {
 
 struct sclp_cpu_entry {
        u8 address;
-       u8 reserved0[13];
+       u8 reserved0[2];
+       u8 : 3;
+       u8 siif : 1;
+       u8 : 4;
+       u8 reserved2[10];
        u8 type;
        u8 reserved1;
 } __attribute__((packed));
@@ -61,5 +65,7 @@ int sclp_pci_deconfigure(u32 fid);
 int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode);
 unsigned long sclp_get_hsa_size(void);
 void sclp_early_detect(void);
+int sclp_has_siif(void);
+unsigned int sclp_get_ibc(void);
 
 #endif /* _ASM_S390_SCLP_H */
index c003c6a73b1e3e883b814aff03704a43c3c0d9a4..0fc26430a1e5e62cdedc0ce982e4e7981dcd910c 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/types.h>
 
 #define __KVM_S390
+#define __KVM_HAVE_GUEST_DEBUG
 
 /* Device control API: s390-specific devices */
 #define KVM_DEV_FLIC_GET_ALL_IRQS      1
@@ -54,6 +55,13 @@ struct kvm_s390_io_adapter_req {
        __u64 addr;
 };
 
+/* kvm attr_group  on vm fd */
+#define KVM_S390_VM_MEM_CTRL           0
+
+/* kvm attributes for mem_ctrl */
+#define KVM_S390_VM_MEM_ENABLE_CMMA    0
+#define KVM_S390_VM_MEM_CLR_CMMA       1
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
        /* general purpose regs for s390 */
@@ -72,11 +80,31 @@ struct kvm_fpu {
        __u64 fprs[16];
 };
 
+#define KVM_GUESTDBG_USE_HW_BP         0x00010000
+
+#define KVM_HW_BP                      1
+#define KVM_HW_WP_WRITE                        2
+#define KVM_SINGLESTEP                 4
+
 struct kvm_debug_exit_arch {
+       __u64 addr;
+       __u8 type;
+       __u8 pad[7]; /* Should be set to 0 */
+};
+
+struct kvm_hw_breakpoint {
+       __u64 addr;
+       __u64 phys_addr;
+       __u64 len;
+       __u8 type;
+       __u8 pad[7]; /* Should be set to 0 */
 };
 
 /* for KVM_SET_GUEST_DEBUG */
 struct kvm_guest_debug_arch {
+       __u32 nr_hw_bp;
+       __u32 pad; /* Should be set to 0 */
+       struct kvm_hw_breakpoint __user *hw_bp;
 };
 
 #define KVM_SYNC_PREFIX (1UL << 0)
diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h
new file mode 100644 (file)
index 0000000..3d97f61
--- /dev/null
@@ -0,0 +1,245 @@
+#ifndef _UAPI_ASM_S390_SIE_H
+#define _UAPI_ASM_S390_SIE_H
+
+#include <asm/sigp.h>
+
+/*
+ * List of { function code, description } pairs for the DIAG (diagnose)
+ * function codes named in the strings below.  The macro expands to a bare
+ * comma-separated list of initializers with no trailing comma.
+ */
+#define diagnose_codes                                         \
+       { 0x10, "DIAG (0x10) release pages" },                  \
+       { 0x44, "DIAG (0x44) time slice end" },                 \
+       { 0x9c, "DIAG (0x9c) time slice end directed" },        \
+       { 0x204, "DIAG (0x204) logical-cpu utilization" },      \
+       { 0x258, "DIAG (0x258) page-reference services" },      \
+       { 0x308, "DIAG (0x308) ipl functions" },                \
+       { 0x500, "DIAG (0x500) KVM virtio functions" },         \
+       { 0x501, "DIAG (0x501) KVM breakpoint" }
+
+/*
+ * List of { order code, description } pairs for SIGP orders.  The SIGP_*
+ * constants are not defined in this file; presumably they come from
+ * <asm/sigp.h>, which is included above.
+ */
+#define sigp_order_codes                                               \
+       { SIGP_SENSE, "SIGP sense" },                                   \
+       { SIGP_EXTERNAL_CALL, "SIGP external call" },                   \
+       { SIGP_EMERGENCY_SIGNAL, "SIGP emergency signal" },             \
+       { SIGP_STOP, "SIGP stop" },                                     \
+       { SIGP_STOP_AND_STORE_STATUS, "SIGP stop and store status" },   \
+       { SIGP_SET_ARCHITECTURE, "SIGP set architecture" },             \
+       { SIGP_SET_PREFIX, "SIGP set prefix" },                         \
+       { SIGP_SENSE_RUNNING, "SIGP sense running" },                   \
+       { SIGP_RESTART, "SIGP restart" },                               \
+       { SIGP_INITIAL_CPU_RESET, "SIGP initial cpu reset" },           \
+       { SIGP_STORE_STATUS_AT_ADDRESS, "SIGP store status at address" }
+
+#define icpt_prog_codes                                                \
+       { 0x0001, "Prog Operation" },                           \
+       { 0x0002, "Prog Privileged Operation" },                \
+       { 0x0003, "Prog Execute" },                             \
+       { 0x0004, "Prog Protection" },                          \
+       { 0x0005, "Prog Addressing" },                          \
+       { 0x0006, "Prog Specification" },                       \
+       { 0x0007, "Prog Data" },                                \
+       { 0x0008, "Prog Fixedpoint overflow" },                 \
+       { 0x0009, "Prog Fixedpoint divide" },                   \
+       { 0x000A, "Prog Decimal overflow" },                    \
+       { 0x000B, "Prog Decimal divide" },                      \
+       { 0x000C, "Prog HFP exponent overflow" },               \
+       { 0x000D, "Prog HFP exponent underflow" },              \
+       { 0x000E, "Prog HFP significance" },                    \
+       { 0x000F, "Prog HFP divide" },                          \
+       { 0x0010, "Prog Segment translation" },                 \
+       { 0x0011, "Prog Page translation" },                    \
+       { 0x0012, "Prog Translation specification" },           \
+       { 0x0013, "Prog Special operation" },                   \
+       { 0x0015, "Prog Operand" },                             \
+       { 0x0016, "Prog Trace table" },                         \
+       { 0x0017, "Prog ASNtranslation specification" },        \
+       { 0x001C, "Prog Spaceswitch event" },                   \
+       { 0x001D, "Prog HFP square root" },                     \
+       { 0x001F, "Prog PCtranslation specification" },         \
+       { 0x0020, "Prog AFX translation" },                     \
+       { 0x0021, "Prog ASX translation" },                     \
+       { 0x0022, "Prog LX translation" },                      \
+       { 0x0023, "Prog EX translation" },                      \
+       { 0x0024, "Prog Primary authority" },                   \
+       { 0x0025, "Prog Secondary authority" },                 \
+       { 0x0026, "Prog LFXtranslation exception" },            \
+       { 0x0027, "Prog LSXtranslation exception" },            \
+       { 0x0028, "Prog ALET specification" },                  \
+       { 0x0029, "Prog ALEN translation" },                    \
+       { 0x002A, "Prog ALE sequence" },                        \
+       { 0x002B, "Prog ASTE validity" },                       \
+       { 0x002C, "Prog ASTE sequence" },                       \
+       { 0x002D, "Prog Extended authority" },                  \
+       { 0x002E, "Prog LSTE sequence" },                       \
+       { 0x002F, "Prog ASTE instance" },                       \
+       { 0x0030, "Prog Stack full" },                          \
+       { 0x0031, "Prog Stack empty" },                         \
+       { 0x0032, "Prog Stack specification" },                 \
+       { 0x0033, "Prog Stack type" },                          \
+       { 0x0034, "Prog Stack operation" },                     \
+       { 0x0039, "Prog Region first translation" },            \
+       { 0x003A, "Prog Region second translation" },           \
+       { 0x003B, "Prog Region third translation" },            \
+       { 0x0040, "Prog Monitor event" },                       \
+       { 0x0080, "Prog PER event" },                           \
+       { 0x0119, "Prog Crypto operation" }
+
+/*
+ * Entry builders for the icpt_insn_codes table.
+ *
+ * exit_code_ipa0() yields a { key, name } pair: the key is ipa0 shifted
+ * left by 8 and ORed with opcode, the name is the ipa0 token exactly as
+ * written, a space, and the mnemonic (e.g. "0xB2 SCK").
+ * exit_code() yields { opcode, mnemonic } unchanged.
+ *
+ * Arguments are pasted into the expansion as is, so they should be plain
+ * constants and string literals.
+ */
+#define exit_code_ipa0(ipa0, opcode, mnemonic)         \
+       { (ipa0 << 8 | opcode), #ipa0 " " mnemonic }
+#define exit_code(opcode, mnemonic)                    \
+       { opcode, mnemonic }
+
+#define icpt_insn_codes                                \
+       exit_code_ipa0(0x01, 0x01, "PR"),       \
+       exit_code_ipa0(0x01, 0x04, "PTFF"),     \
+       exit_code_ipa0(0x01, 0x07, "SCKPF"),    \
+       exit_code_ipa0(0xAA, 0x00, "RINEXT"),   \
+       exit_code_ipa0(0xAA, 0x01, "RION"),     \
+       exit_code_ipa0(0xAA, 0x02, "TRIC"),     \
+       exit_code_ipa0(0xAA, 0x03, "RIOFF"),    \
+       exit_code_ipa0(0xAA, 0x04, "RIEMIT"),   \
+       exit_code_ipa0(0xB2, 0x02, "STIDP"),    \
+       exit_code_ipa0(0xB2, 0x04, "SCK"),      \
+       exit_code_ipa0(0xB2, 0x05, "STCK"),     \
+       exit_code_ipa0(0xB2, 0x06, "SCKC"),     \
+       exit_code_ipa0(0xB2, 0x07, "STCKC"),    \
+       exit_code_ipa0(0xB2, 0x08, "SPT"),      \
+       exit_code_ipa0(0xB2, 0x09, "STPT"),     \
+       exit_code_ipa0(0xB2, 0x0d, "PTLB"),     \
+       exit_code_ipa0(0xB2, 0x10, "SPX"),      \
+       exit_code_ipa0(0xB2, 0x11, "STPX"),     \
+       exit_code_ipa0(0xB2, 0x12, "STAP"),     \
+       exit_code_ipa0(0xB2, 0x14, "SIE"),      \
+       exit_code_ipa0(0xB2, 0x16, "SETR"),     \
+       exit_code_ipa0(0xB2, 0x17, "STETR"),    \
+       exit_code_ipa0(0xB2, 0x18, "PC"),       \
+       exit_code_ipa0(0xB2, 0x20, "SERVC"),    \
+       exit_code_ipa0(0xB2, 0x28, "PT"),       \
+       exit_code_ipa0(0xB2, 0x29, "ISKE"),     \
+       exit_code_ipa0(0xB2, 0x2a, "RRBE"),     \
+       exit_code_ipa0(0xB2, 0x2b, "SSKE"),     \
+       exit_code_ipa0(0xB2, 0x2c, "TB"),       \
+       exit_code_ipa0(0xB2, 0x2e, "PGIN"),     \
+       exit_code_ipa0(0xB2, 0x2f, "PGOUT"),    \
+       exit_code_ipa0(0xB2, 0x30, "CSCH"),     \
+       exit_code_ipa0(0xB2, 0x31, "HSCH"),     \
+       exit_code_ipa0(0xB2, 0x32, "MSCH"),     \
+       exit_code_ipa0(0xB2, 0x33, "SSCH"),     \
+       exit_code_ipa0(0xB2, 0x34, "STSCH"),    \
+       exit_code_ipa0(0xB2, 0x35, "TSCH"),     \
+       exit_code_ipa0(0xB2, 0x36, "TPI"),      \
+       exit_code_ipa0(0xB2, 0x37, "SAL"),      \
+       exit_code_ipa0(0xB2, 0x38, "RSCH"),     \
+       exit_code_ipa0(0xB2, 0x39, "STCRW"),    \
+       exit_code_ipa0(0xB2, 0x3a, "STCPS"),    \
+       exit_code_ipa0(0xB2, 0x3b, "RCHP"),     \
+       exit_code_ipa0(0xB2, 0x3c, "SCHM"),     \
+       exit_code_ipa0(0xB2, 0x40, "BAKR"),     \
+       exit_code_ipa0(0xB2, 0x48, "PALB"),     \
+       exit_code_ipa0(0xB2, 0x4c, "TAR"),      \
+       exit_code_ipa0(0xB2, 0x50, "CSP"),      \
+       exit_code_ipa0(0xB2, 0x54, "MVPG"),     \
+       exit_code_ipa0(0xB2, 0x58, "BSG"),      \
+       exit_code_ipa0(0xB2, 0x5a, "BSA"),      \
+       exit_code_ipa0(0xB2, 0x5f, "CHSC"),     \
+       exit_code_ipa0(0xB2, 0x74, "SIGA"),     \
+       exit_code_ipa0(0xB2, 0x76, "XSCH"),     \
+       exit_code_ipa0(0xB2, 0x78, "STCKE"),    \
+       exit_code_ipa0(0xB2, 0x7c, "STCKF"),    \
+       exit_code_ipa0(0xB2, 0x7d, "STSI"),     \
+       exit_code_ipa0(0xB2, 0xb0, "STFLE"),    \
+       exit_code_ipa0(0xB2, 0xb1, "STFL"),     \
+       exit_code_ipa0(0xB2, 0xb2, "LPSWE"),    \
+       exit_code_ipa0(0xB2, 0xf8, "TEND"),     \
+       exit_code_ipa0(0xB2, 0xfc, "TABORT"),   \
+       exit_code_ipa0(0xB9, 0x1e, "KMAC"),     \
+       exit_code_ipa0(0xB9, 0x28, "PCKMO"),    \
+       exit_code_ipa0(0xB9, 0x2a, "KMF"),      \
+       exit_code_ipa0(0xB9, 0x2b, "KMO"),      \
+       exit_code_ipa0(0xB9, 0x2d, "KMCTR"),    \
+       exit_code_ipa0(0xB9, 0x2e, "KM"),       \
+       exit_code_ipa0(0xB9, 0x2f, "KMC"),      \
+       exit_code_ipa0(0xB9, 0x3e, "KIMD"),     \
+       exit_code_ipa0(0xB9, 0x3f, "KLMD"),     \
+       exit_code_ipa0(0xB9, 0x8a, "CSPG"),     \
+       exit_code_ipa0(0xB9, 0x8d, "EPSW"),     \
+       exit_code_ipa0(0xB9, 0x8e, "IDTE"),     \
+       exit_code_ipa0(0xB9, 0x8f, "CRDTE"),    \
+       exit_code_ipa0(0xB9, 0x9c, "EQBS"),     \
+       exit_code_ipa0(0xB9, 0xa2, "PTF"),      \
+       exit_code_ipa0(0xB9, 0xab, "ESSA"),     \
+       exit_code_ipa0(0xB9, 0xae, "RRBM"),     \
+       exit_code_ipa0(0xB9, 0xaf, "PFMF"),     \
+       exit_code_ipa0(0xE3, 0x03, "LRAG"),     \
+       exit_code_ipa0(0xE3, 0x13, "LRAY"),     \
+       exit_code_ipa0(0xE3, 0x25, "NTSTG"),    \
+       exit_code_ipa0(0xE5, 0x00, "LASP"),     \
+       exit_code_ipa0(0xE5, 0x01, "TPROT"),    \
+       exit_code_ipa0(0xE5, 0x60, "TBEGIN"),   \
+       exit_code_ipa0(0xE5, 0x61, "TBEGINC"),  \
+       exit_code_ipa0(0xEB, 0x25, "STCTG"),    \
+       exit_code_ipa0(0xEB, 0x2f, "LCTLG"),    \
+       exit_code_ipa0(0xEB, 0x60, "LRIC"),     \
+       exit_code_ipa0(0xEB, 0x61, "STRIC"),    \
+       exit_code_ipa0(0xEB, 0x62, "MRIC"),     \
+       exit_code_ipa0(0xEB, 0x8a, "SQBS"),     \
+       exit_code_ipa0(0xC8, 0x01, "ECTG"),     \
+       exit_code(0x0a, "SVC"),                 \
+       exit_code(0x80, "SSM"),                 \
+       exit_code(0x82, "LPSW"),                \
+       exit_code(0x83, "DIAG"),                \
+       exit_code(0xae, "SIGP"),                \
+       exit_code(0xac, "STNSM"),               \
+       exit_code(0xad, "STOSM"),               \
+       exit_code(0xb1, "LRA"),                 \
+       exit_code(0xb6, "STCTL"),               \
+       exit_code(0xb7, "LCTL"),                \
+       exit_code(0xee, "PLO")
+
+/*
+ * List of { intercept code, description } pairs for SIE intercepts.  The
+ * macro expands to a bare comma-separated list of initializers with no
+ * trailing comma.
+ */
+#define sie_intercept_code                                     \
+       { 0x00, "Host interruption" },                          \
+       { 0x04, "Instruction" },                                \
+       { 0x08, "Program interruption" },                       \
+       { 0x0c, "Instruction and program interruption" },       \
+       { 0x10, "External request" },                           \
+       { 0x14, "External interruption" },                      \
+       { 0x18, "I/O request" },                                \
+       { 0x1c, "Wait state" },                                 \
+       { 0x20, "Validity" },                                   \
+       { 0x28, "Stop request" },                               \
+       { 0x2c, "Operation exception" },                        \
+       { 0x38, "Partial-execution" },                          \
+       { 0x3c, "I/O interruption" },                           \
+       { 0x40, "I/O instruction" },                            \
+       { 0x48, "Timing subset" }
+
+/*
+ * This is the simple interceptable instructions decoder.
+ *
+ * It will be used as a userspace interface, and it can be used in places
+ * that do not allow the use of general decoder functions,
+ * such as trace event declarations.
+ *
+ * Some userspace tools may want to parse this code
+ * and would be confused by switch(), if() and other statements,
+ * but they can understand conditional operator.
+ */
+/*
+ * INSN_DECODE_IPA0() expands to ONE arm of a conditional-operator chain:
+ * if the topmost byte of insn equals ipa0, the result is ipa0 shifted left
+ * by 8 ORed with ((insn >> rshift) & mask).  The expansion deliberately
+ * ends in ':' so that the next arm can follow; it is only meaningful
+ * inside icpt_insn_decoder().
+ */
+#define INSN_DECODE_IPA0(ipa0, insn, rshift, mask)             \
+       (insn >> 56) == (ipa0) ?                                \
+               ((ipa0 << 8) | ((insn >> rshift) & mask)) :
+
+/* Final arm of the chain: the key is just the topmost byte of insn. */
+#define INSN_DECODE(insn) (insn >> 56)
+
+/*
+ * The macro icpt_insn_decoder() takes an intercepted instruction
+ * and returns a key, which can be used to find a mnemonic name
+ * of the instruction in the icpt_insn_codes table.
+ *
+ * The chain is wrapped in parentheses so that the macro behaves as a
+ * single expression: without them an operator written next to the macro
+ * call would bind to the last arm only.  insn itself is pasted as is and
+ * should be a plain identifier, member access or constant.
+ */
+#define icpt_insn_decoder(insn) (                      \
+       INSN_DECODE_IPA0(0x01, insn, 48, 0xff)  \
+       INSN_DECODE_IPA0(0xaa, insn, 48, 0x0f)  \
+       INSN_DECODE_IPA0(0xb2, insn, 48, 0xff)  \
+       INSN_DECODE_IPA0(0xb9, insn, 48, 0xff)  \
+       INSN_DECODE_IPA0(0xe3, insn, 48, 0xff)  \
+       INSN_DECODE_IPA0(0xe5, insn, 48, 0xff)  \
+       INSN_DECODE_IPA0(0xeb, insn, 16, 0xff)  \
+       INSN_DECODE_IPA0(0xc8, insn, 48, 0x0f)  \
+       INSN_DECODE(insn))
+
+#endif /* _UAPI_ASM_S390_SIE_H */
index 0c070c44cde2d18ed27a03a72532a924886329b9..afe1715a4eb765cb06d1ab3277bce0b4ce8fc3c2 100644 (file)
@@ -90,16 +90,22 @@ int main(void)
        DEFINE(__LC_PGM_ILC, offsetof(struct _lowcore, pgm_ilc));
        DEFINE(__LC_PGM_INT_CODE, offsetof(struct _lowcore, pgm_code));
        DEFINE(__LC_TRANS_EXC_CODE, offsetof(struct _lowcore, trans_exc_code));
-       DEFINE(__LC_PER_CAUSE, offsetof(struct _lowcore, per_perc_atmid));
+       DEFINE(__LC_MON_CLASS_NR, offsetof(struct _lowcore, mon_class_num));
+       DEFINE(__LC_PER_CODE, offsetof(struct _lowcore, per_code));
+       DEFINE(__LC_PER_ATMID, offsetof(struct _lowcore, per_atmid));
        DEFINE(__LC_PER_ADDRESS, offsetof(struct _lowcore, per_address));
-       DEFINE(__LC_PER_PAID, offsetof(struct _lowcore, per_access_id));
-       DEFINE(__LC_AR_MODE_ID, offsetof(struct _lowcore, ar_access_id));
+       DEFINE(__LC_EXC_ACCESS_ID, offsetof(struct _lowcore, exc_access_id));
+       DEFINE(__LC_PER_ACCESS_ID, offsetof(struct _lowcore, per_access_id));
+       DEFINE(__LC_OP_ACCESS_ID, offsetof(struct _lowcore, op_access_id));
+       DEFINE(__LC_AR_MODE_ID, offsetof(struct _lowcore, ar_mode_id));
+       DEFINE(__LC_MON_CODE, offsetof(struct _lowcore, monitor_code));
        DEFINE(__LC_SUBCHANNEL_ID, offsetof(struct _lowcore, subchannel_id));
        DEFINE(__LC_SUBCHANNEL_NR, offsetof(struct _lowcore, subchannel_nr));
        DEFINE(__LC_IO_INT_PARM, offsetof(struct _lowcore, io_int_parm));
        DEFINE(__LC_IO_INT_WORD, offsetof(struct _lowcore, io_int_word));
        DEFINE(__LC_STFL_FAC_LIST, offsetof(struct _lowcore, stfl_fac_list));
        DEFINE(__LC_MCCK_CODE, offsetof(struct _lowcore, mcck_interruption_code));
+       DEFINE(__LC_MCCK_EXT_DAM_CODE, offsetof(struct _lowcore, external_damage_code));
        DEFINE(__LC_RST_OLD_PSW, offsetof(struct _lowcore, restart_old_psw));
        DEFINE(__LC_EXT_OLD_PSW, offsetof(struct _lowcore, external_old_psw));
        DEFINE(__LC_SVC_OLD_PSW, offsetof(struct _lowcore, svc_old_psw));
@@ -157,6 +163,8 @@ int main(void)
 #ifdef CONFIG_32BIT
        DEFINE(SAVE_AREA_BASE, offsetof(struct _lowcore, extended_save_area_addr));
 #else /* CONFIG_32BIT */
+       DEFINE(__LC_DATA_EXC_CODE, offsetof(struct _lowcore, data_exc_code));
+       DEFINE(__LC_MCCK_FAIL_STOR_ADDR, offsetof(struct _lowcore, failing_storage_address));
        DEFINE(__LC_EXT_PARAMS2, offsetof(struct _lowcore, ext_params2));
        DEFINE(SAVE_AREA_BASE, offsetof(struct _lowcore, floating_pt_save_area));
        DEFINE(__LC_PASTE, offsetof(struct _lowcore, paste));
index 18e5af848f9a67b7aa7374fe46da9a62d4f6ad95..70203265196fa704fcc14723ce1df68f53ead4d6 100644 (file)
@@ -389,8 +389,8 @@ ENTRY(pgm_check_handler)
        jz      pgm_kprobe
        oi      __PT_FLAGS+3(%r11),_PIF_PER_TRAP
        mvc     __THREAD_per_address(4,%r1),__LC_PER_ADDRESS
-       mvc     __THREAD_per_cause(2,%r1),__LC_PER_CAUSE
-       mvc     __THREAD_per_paid(1,%r1),__LC_PER_PAID
+       mvc     __THREAD_per_cause(2,%r1),__LC_PER_CODE
+       mvc     __THREAD_per_paid(1,%r1),__LC_PER_ACCESS_ID
 0:     REENABLE_IRQS
        xc      __SF_BACKCHAIN(4,%r15),__SF_BACKCHAIN(%r15)
        l       %r1,BASED(.Ljump_table)
index c41f3f9067200647441127790afa81ae070f22d7..f2e674c702e1f904de800b94c33bedfd5fd30b91 100644 (file)
@@ -420,8 +420,8 @@ ENTRY(pgm_check_handler)
        jz      pgm_kprobe
        oi      __PT_FLAGS+7(%r11),_PIF_PER_TRAP
        mvc     __THREAD_per_address(8,%r14),__LC_PER_ADDRESS
-       mvc     __THREAD_per_cause(2,%r14),__LC_PER_CAUSE
-       mvc     __THREAD_per_paid(1,%r14),__LC_PER_PAID
+       mvc     __THREAD_per_cause(2,%r14),__LC_PER_CODE
+       mvc     __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID
 0:     REENABLE_IRQS
        xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
        larl    %r1,pgm_check_table
index d3adb37e93a4c99a8d1debd8bf06069ad1d38a24..b3b553469650888fd31df5d975fe4b93b2399f70 100644 (file)
@@ -11,5 +11,7 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqch
 
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
-kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o diag.o
+kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
+kvm-objs += diag.o gaccess.o guestdbg.o
+
 obj-$(CONFIG_KVM) += kvm.o
index 08dfc839a6cfeeb3655f64d850ce1ed6e60d49cc..0161675878a2483c7a3114ef6a82afa624a4ebcc 100644 (file)
@@ -23,7 +23,7 @@
 static int diag_release_pages(struct kvm_vcpu *vcpu)
 {
        unsigned long start, end;
-       unsigned long prefix  = vcpu->arch.sie_block->prefix;
+       unsigned long prefix  = kvm_s390_get_prefix(vcpu);
 
        start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
        end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096;
@@ -64,12 +64,12 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
        int rc;
        u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4;
        u16 ry = (vcpu->arch.sie_block->ipa & 0x0f);
-       unsigned long hva_token = KVM_HVA_ERR_BAD;
 
        if (vcpu->run->s.regs.gprs[rx] & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-       if (copy_from_guest(vcpu, &parm, vcpu->run->s.regs.gprs[rx], sizeof(parm)))
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], &parm, sizeof(parm));
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
        if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
@@ -89,8 +89,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
                    parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL)
                        return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-               hva_token = gfn_to_hva(vcpu->kvm, gpa_to_gfn(parm.token_addr));
-               if (kvm_is_error_hva(hva_token))
+               if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr))
                        return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
                vcpu->arch.pfault_token = parm.token_addr;
@@ -167,23 +166,17 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
 
        VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode);
        switch (subcode) {
-       case 0:
-       case 1:
-               page_table_reset_pgste(current->mm, 0, TASK_SIZE);
-               return -EOPNOTSUPP;
        case 3:
                vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR;
-               page_table_reset_pgste(current->mm, 0, TASK_SIZE);
                break;
        case 4:
                vcpu->run->s390_reset_flags = 0;
-               page_table_reset_pgste(current->mm, 0, TASK_SIZE);
                break;
        default:
                return -EOPNOTSUPP;
        }
 
-       atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_vcpu_stop(vcpu);
        vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM;
        vcpu->run->s390_reset_flags |= KVM_S390_RESET_IPL;
        vcpu->run->s390_reset_flags |= KVM_S390_RESET_CPU_INIT;
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
new file mode 100644 (file)
index 0000000..4653ac6
--- /dev/null
@@ -0,0 +1,726 @@
+/*
+ * guest access functions
+ *
+ * Copyright IBM Corp. 2014
+ *
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/err.h>
+#include <asm/pgtable.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+union asce {
+       unsigned long val;
+       struct {
+               unsigned long origin : 52; /* Region- or Segment-Table Origin */
+               unsigned long    : 2;
+               unsigned long g  : 1; /* Subspace Group Control */
+               unsigned long p  : 1; /* Private Space Control */
+               unsigned long s  : 1; /* Storage-Alteration-Event Control */
+               unsigned long x  : 1; /* Space-Switch-Event Control */
+               unsigned long r  : 1; /* Real-Space Control */
+               unsigned long    : 1;
+               unsigned long dt : 2; /* Designation-Type Control */
+               unsigned long tl : 2; /* Region- or Segment-Table Length */
+       };
+};
+
+enum {
+       ASCE_TYPE_SEGMENT = 0,
+       ASCE_TYPE_REGION3 = 1,
+       ASCE_TYPE_REGION2 = 2,
+       ASCE_TYPE_REGION1 = 3
+};
+
+union region1_table_entry {
+       unsigned long val;
+       struct {
+               unsigned long rto: 52;/* Region-Table Origin */
+               unsigned long    : 2;
+               unsigned long p  : 1; /* DAT-Protection Bit */
+               unsigned long    : 1;
+               unsigned long tf : 2; /* Region-Second-Table Offset */
+               unsigned long i  : 1; /* Region-Invalid Bit */
+               unsigned long    : 1;
+               unsigned long tt : 2; /* Table-Type Bits */
+               unsigned long tl : 2; /* Region-Second-Table Length */
+       };
+};
+
+union region2_table_entry {
+       unsigned long val;
+       struct {
+               unsigned long rto: 52;/* Region-Table Origin */
+               unsigned long    : 2;
+               unsigned long p  : 1; /* DAT-Protection Bit */
+               unsigned long    : 1;
+               unsigned long tf : 2; /* Region-Third-Table Offset */
+               unsigned long i  : 1; /* Region-Invalid Bit */
+               unsigned long    : 1;
+               unsigned long tt : 2; /* Table-Type Bits */
+               unsigned long tl : 2; /* Region-Third-Table Length */
+       };
+};
+
+struct region3_table_entry_fc0 {
+       unsigned long sto: 52;/* Segment-Table Origin */
+       unsigned long    : 1;
+       unsigned long fc : 1; /* Format-Control */
+       unsigned long p  : 1; /* DAT-Protection Bit */
+       unsigned long    : 1;
+       unsigned long tf : 2; /* Segment-Table Offset */
+       unsigned long i  : 1; /* Region-Invalid Bit */
+       unsigned long cr : 1; /* Common-Region Bit */
+       unsigned long tt : 2; /* Table-Type Bits */
+       unsigned long tl : 2; /* Segment-Table Length */
+};
+
+struct region3_table_entry_fc1 {
+       unsigned long rfaa : 33; /* Region-Frame Absolute Address */
+       unsigned long    : 14;
+       unsigned long av : 1; /* ACCF-Validity Control */
+       unsigned long acc: 4; /* Access-Control Bits */
+       unsigned long f  : 1; /* Fetch-Protection Bit */
+       unsigned long fc : 1; /* Format-Control */
+       unsigned long p  : 1; /* DAT-Protection Bit */
+       unsigned long co : 1; /* Change-Recording Override */
+       unsigned long    : 2;
+       unsigned long i  : 1; /* Region-Invalid Bit */
+       unsigned long cr : 1; /* Common-Region Bit */
+       unsigned long tt : 2; /* Table-Type Bits */
+       unsigned long    : 2;
+};
+
+union region3_table_entry {
+       unsigned long val;
+       struct region3_table_entry_fc0 fc0;
+       struct region3_table_entry_fc1 fc1;
+       struct {
+               unsigned long    : 53;
+               unsigned long fc : 1; /* Format-Control */
+               unsigned long    : 4;
+               unsigned long i  : 1; /* Region-Invalid Bit */
+               unsigned long cr : 1; /* Common-Region Bit */
+               unsigned long tt : 2; /* Table-Type Bits */
+               unsigned long    : 2;
+       };
+};
+
+struct segment_entry_fc0 {
+       unsigned long pto: 53;/* Page-Table Origin */
+       unsigned long fc : 1; /* Format-Control */
+       unsigned long p  : 1; /* DAT-Protection Bit */
+       unsigned long    : 3;
+       unsigned long i  : 1; /* Segment-Invalid Bit */
+       unsigned long cs : 1; /* Common-Segment Bit */
+       unsigned long tt : 2; /* Table-Type Bits */
+       unsigned long    : 2;
+};
+
+struct segment_entry_fc1 {
+       unsigned long sfaa : 44; /* Segment-Frame Absolute Address */
+       unsigned long    : 3;
+       unsigned long av : 1; /* ACCF-Validity Control */
+       unsigned long acc: 4; /* Access-Control Bits */
+       unsigned long f  : 1; /* Fetch-Protection Bit */
+       unsigned long fc : 1; /* Format-Control */
+       unsigned long p  : 1; /* DAT-Protection Bit */
+       unsigned long co : 1; /* Change-Recording Override */
+       unsigned long    : 2;
+       unsigned long i  : 1; /* Segment-Invalid Bit */
+       unsigned long cs : 1; /* Common-Segment Bit */
+       unsigned long tt : 2; /* Table-Type Bits */
+       unsigned long    : 2;
+};
+
+union segment_table_entry {
+       unsigned long val;
+       struct segment_entry_fc0 fc0;
+       struct segment_entry_fc1 fc1;
+       struct {
+               unsigned long    : 53;
+               unsigned long fc : 1; /* Format-Control */
+               unsigned long    : 4;
+               unsigned long i  : 1; /* Segment-Invalid Bit */
+               unsigned long cs : 1; /* Common-Segment Bit */
+               unsigned long tt : 2; /* Table-Type Bits */
+               unsigned long    : 2;
+       };
+};
+
+enum {
+       TABLE_TYPE_SEGMENT = 0,
+       TABLE_TYPE_REGION3 = 1,
+       TABLE_TYPE_REGION2 = 2,
+       TABLE_TYPE_REGION1 = 3
+};
+
+union page_table_entry {
+       unsigned long val;
+       struct {
+               unsigned long pfra : 52; /* Page-Frame Real Address */
+               unsigned long z  : 1; /* Zero Bit */
+               unsigned long i  : 1; /* Page-Invalid Bit */
+               unsigned long p  : 1; /* DAT-Protection Bit */
+               unsigned long co : 1; /* Change-Recording Override */
+               unsigned long    : 8;
+       };
+};
+
+/*
+ * vaddress union in order to easily decode a virtual address into its
+ * region first index, region second index etc. parts.
+ */
+union vaddress {
+       unsigned long addr;
+       struct {
+               unsigned long rfx : 11;
+               unsigned long rsx : 11;
+               unsigned long rtx : 11;
+               unsigned long sx  : 11;
+               unsigned long px  : 8;
+               unsigned long bx  : 12;
+       };
+       struct {
+               unsigned long rfx01 : 2;
+               unsigned long       : 9;
+               unsigned long rsx01 : 2;
+               unsigned long       : 9;
+               unsigned long rtx01 : 2;
+               unsigned long       : 9;
+               unsigned long sx01  : 2;
+               unsigned long       : 29;
+       };
+};
+
+/*
+ * raddress union which will contain the result (real or absolute address)
+ * after a page table walk. The rfaa, sfaa and pfra members are used to
+ * simply assign them the value of a region, segment or page table entry.
+ *
+ * The bit-fields are direct members of the union (not of a nested struct),
+ * so each of them overlays the storage of addr instead of following the
+ * previous member; writing one of them replaces only that many bits of
+ * addr.
+ */
+union raddress {
+       unsigned long addr;
+       unsigned long rfaa : 33; /* Region-Frame Absolute Address */
+       unsigned long sfaa : 44; /* Segment-Frame Absolute Address */
+       unsigned long pfra : 52; /* Page-Frame Real Address */
+};
+
+/*
+ * Nesting count and serialization for the "simple" IPTE lock path; used by
+ * ipte_lock_simple(), ipte_unlock_simple() and ipte_lock_held().
+ *
+ * NOTE(review): both objects are file-scope and therefore shared by every
+ * VM, while the k bit they guard is reached through vcpu->kvm->arch.sca.
+ * Confirm that a single global count is intended when several VMs exist.
+ */
+static int ipte_lock_count;
+static DEFINE_MUTEX(ipte_mutex);
+
+/*
+ * Tell whether the IPTE lock is currently held.
+ *
+ * Uses the same test on bit 0 of eca that ipte_lock() uses to choose its
+ * implementation: when the bit is set the kh counter in the SCA's
+ * ipte_control decides, otherwise the file-local ipte_lock_count does.
+ *
+ * Returns 1 if the lock is held, 0 otherwise.
+ */
+int ipte_lock_held(struct kvm_vcpu *vcpu)
+{
+       int use_sca_counter = vcpu->arch.sie_block->eca & 1;
+
+       if (!use_sca_counter)
+               return ipte_lock_count != 0;
+       return vcpu->kvm->arch.sca->ipte_control.kh != 0;
+}
+
+/*
+ * Take the IPTE lock on the path ipte_lock() uses when bit 0 of eca is
+ * clear.
+ *
+ * ipte_mutex serializes callers and ipte_lock_count tracks nesting: only
+ * the caller that raises the count from 0 to 1 touches the k bit in the
+ * SCA's ipte_control; nested callers just bump the count.
+ */
+static void ipte_lock_simple(struct kvm_vcpu *vcpu)
+{
+       union ipte_control old, new, *ic;
+
+       mutex_lock(&ipte_mutex);
+       ipte_lock_count++;
+       if (ipte_lock_count > 1) /* nested call: k was set by the first one */
+               goto out;
+       ic = &vcpu->kvm->arch.sca->ipte_control;
+       do {
+               old = ACCESS_ONCE(*ic);
+               /* poll, calling cond_resched() in between, until k reads 0 */
+               while (old.k) {
+                       cond_resched();
+                       old = ACCESS_ONCE(*ic);
+               }
+               new = old;
+               new.k = 1;
+               /* retry if ipte_control changed since it was sampled */
+       } while (cmpxchg(&ic->val, old.val, new.val) != old.val);
+out:
+       mutex_unlock(&ipte_mutex);
+}
+
+/*
+ * Release the IPTE lock taken by ipte_lock_simple().
+ *
+ * Drops one nesting level under ipte_mutex.  Only when the count reaches
+ * zero is the k bit in the SCA's ipte_control cleared and ipte_wq woken.
+ */
+static void ipte_unlock_simple(struct kvm_vcpu *vcpu)
+{
+       union ipte_control old, new, *ic;
+
+       mutex_lock(&ipte_mutex);
+       ipte_lock_count--;
+       if (ipte_lock_count) /* still held by an outer caller */
+               goto out;
+       ic = &vcpu->kvm->arch.sca->ipte_control;
+       do {
+               new = old = ACCESS_ONCE(*ic);
+               new.k = 0;
+               /* retry if ipte_control changed since it was sampled */
+       } while (cmpxchg(&ic->val, old.val, new.val) != old.val);
+       /*
+        * The count is known to be zero here: the non-zero case left via
+        * "out" above and the count is only modified under ipte_mutex.
+        */
+       wake_up(&vcpu->kvm->arch.ipte_wq);
+out:
+       mutex_unlock(&ipte_mutex);
+}
+
+/*
+ * Take the IPTE lock on the path ipte_lock() uses when bit 0 of eca is
+ * set.
+ *
+ * Polls until the kg field of the SCA's ipte_control reads zero, then
+ * sets k and increments the kh counter in a single cmpxchg.  No mutex is
+ * taken here; nesting is tracked by kh itself.
+ */
+static void ipte_lock_siif(struct kvm_vcpu *vcpu)
+{
+       union ipte_control old, new, *ic;
+
+       ic = &vcpu->kvm->arch.sca->ipte_control;
+       do {
+               old = ACCESS_ONCE(*ic);
+               /* poll, calling cond_resched() in between, until kg reads 0 */
+               while (old.kg) {
+                       cond_resched();
+                       old = ACCESS_ONCE(*ic);
+               }
+               new = old;
+               new.k = 1;
+               new.kh++;
+               /* retry if ipte_control changed since it was sampled */
+       } while (cmpxchg(&ic->val, old.val, new.val) != old.val);
+}
+
+/*
+ * Drop one reference taken by ipte_lock_siif(): decrement kh and, when it
+ * reaches zero, clear k in the same cmpxchg and then wake ipte_wq.
+ */
+static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
+{
+       union ipte_control old, new, *ic;
+
+       ic = &vcpu->kvm->arch.sca->ipte_control;
+       do {
+               new = old = ACCESS_ONCE(*ic);
+               new.kh--;
+               if (!new.kh)
+                       new.k = 0;
+               /* retry if ipte_control changed since it was sampled */
+       } while (cmpxchg(&ic->val, old.val, new.val) != old.val);
+       /* new holds the value that was actually stored by the last cmpxchg */
+       if (!new.kh)
+               wake_up(&vcpu->kvm->arch.ipte_wq);
+}
+
+/*
+ * Take the IPTE lock for @vcpu, choosing the implementation by bit 0 of
+ * the SIE block's eca field: ipte_lock_siif() when it is set,
+ * ipte_lock_simple() when it is clear.
+ */
+void ipte_lock(struct kvm_vcpu *vcpu)
+{
+       if (!(vcpu->arch.sie_block->eca & 1)) {
+               ipte_lock_simple(vcpu);
+               return;
+       }
+       ipte_lock_siif(vcpu);
+}
+
+/*
+ * Release the IPTE lock for @vcpu, choosing the implementation by bit 0
+ * of the SIE block's eca field: ipte_unlock_siif() when it is set,
+ * ipte_unlock_simple() when it is clear.
+ */
+void ipte_unlock(struct kvm_vcpu *vcpu)
+{
+       if (!(vcpu->arch.sie_block->eca & 1)) {
+               ipte_unlock_simple(vcpu);
+               return;
+       }
+       ipte_unlock_siif(vcpu);
+}
+
+/*
+ * Return the guest control register that corresponds to the address-space
+ * mode in the guest PSW: gcr[1] for primary, gcr[7] for secondary and
+ * gcr[13] for home.  Any other mode yields 0.
+ */
+static unsigned long get_vcpu_asce(struct kvm_vcpu *vcpu)
+{
+       unsigned long asce = 0;
+
+       switch (psw_bits(vcpu->arch.sie_block->gpsw).as) {
+       case PSW_AS_PRIMARY:
+               asce = vcpu->arch.sie_block->gcr[1];
+               break;
+       case PSW_AS_SECONDARY:
+               asce = vcpu->arch.sie_block->gcr[7];
+               break;
+       case PSW_AS_HOME:
+               asce = vcpu->arch.sie_block->gcr[13];
+               break;
+       default:
+               break;
+       }
+       return asce;
+}
+
+/*
+ * Read one table entry (sizeof(unsigned long) bytes) from guest address
+ * @gpa into *@val.  The return value is passed through unchanged from
+ * kvm_read_guest().
+ */
+static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
+{
+       return kvm_read_guest(kvm, gpa, val, sizeof(*val));
+}
+
+/**
+ * guest_translate - translate a guest virtual into a guest absolute address
+ * @vcpu: virtual cpu
+ * @gva: guest virtual address
+ * @gpa: points to where guest physical (absolute) address should be stored
+ * @write: indicates if access is a write access
+ *
+ * Translate a guest virtual address into a guest absolute address by means
+ * of dynamic address translation as specified by the architecture.
+ * If the resulting absolute address is not available in the configuration
+ * an addressing exception is indicated and @gpa will not be changed.
+ *
+ * Returns: - zero on success; @gpa contains the resulting absolute address
+ *         - a negative value if guest access failed due to e.g. broken
+ *           guest mapping
+ *         - a positive value if an access exception happened. In this case
+ *           the returned value is the program interruption code as defined
+ *           by the architecture
+ */
+static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
+                                    unsigned long *gpa, int write)
+{
+       union vaddress vaddr = {.addr = gva};
+       union raddress raddr = {.addr = gva};
+       union page_table_entry pte;
+       int dat_protection = 0;
+       union ctlreg0 ctlreg0;
+       unsigned long ptr;
+       int edat1, edat2;
+       union asce asce;
+
+       ctlreg0.val = vcpu->arch.sie_block->gcr[0];
+       edat1 = ctlreg0.edat && test_vfacility(8);
+       edat2 = edat1 && test_vfacility(78);
+       asce.val = get_vcpu_asce(vcpu);
+       if (asce.r)
+               goto real_address;
+       ptr = asce.origin * 4096;
+       switch (asce.dt) {
+       case ASCE_TYPE_REGION1:
+               if (vaddr.rfx01 > asce.tl)
+                       return PGM_REGION_FIRST_TRANS;
+               ptr += vaddr.rfx * 8;
+               break;
+       case ASCE_TYPE_REGION2:
+               if (vaddr.rfx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.rsx01 > asce.tl)
+                       return PGM_REGION_SECOND_TRANS;
+               ptr += vaddr.rsx * 8;
+               break;
+       case ASCE_TYPE_REGION3:
+               if (vaddr.rfx || vaddr.rsx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.rtx01 > asce.tl)
+                       return PGM_REGION_THIRD_TRANS;
+               ptr += vaddr.rtx * 8;
+               break;
+       case ASCE_TYPE_SEGMENT:
+               if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.sx01 > asce.tl)
+                       return PGM_SEGMENT_TRANSLATION;
+               ptr += vaddr.sx * 8;
+               break;
+       }
+       switch (asce.dt) {
+       case ASCE_TYPE_REGION1: {
+               union region1_table_entry rfte;
+
+               if (kvm_is_error_gpa(vcpu->kvm, ptr))
+                       return PGM_ADDRESSING;
+               if (deref_table(vcpu->kvm, ptr, &rfte.val))
+                       return -EFAULT;
+               if (rfte.i)
+                       return PGM_REGION_FIRST_TRANS;
+               if (rfte.tt != TABLE_TYPE_REGION1)
+                       return PGM_TRANSLATION_SPEC;
+               if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+                       return PGM_REGION_SECOND_TRANS;
+               if (edat1)
+                       dat_protection |= rfte.p;
+               ptr = rfte.rto * 4096 + vaddr.rsx * 8;
+       }
+               /* fallthrough */
+       case ASCE_TYPE_REGION2: {
+               union region2_table_entry rste;
+
+               if (kvm_is_error_gpa(vcpu->kvm, ptr))
+                       return PGM_ADDRESSING;
+               if (deref_table(vcpu->kvm, ptr, &rste.val))
+                       return -EFAULT;
+               if (rste.i)
+                       return PGM_REGION_SECOND_TRANS;
+               if (rste.tt != TABLE_TYPE_REGION2)
+                       return PGM_TRANSLATION_SPEC;
+               if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+                       return PGM_REGION_THIRD_TRANS;
+               if (edat1)
+                       dat_protection |= rste.p;
+               ptr = rste.rto * 4096 + vaddr.rtx * 8;
+       }
+               /* fallthrough */
+       case ASCE_TYPE_REGION3: {
+               union region3_table_entry rtte;
+
+               if (kvm_is_error_gpa(vcpu->kvm, ptr))
+                       return PGM_ADDRESSING;
+               if (deref_table(vcpu->kvm, ptr, &rtte.val))
+                       return -EFAULT;
+               if (rtte.i)
+                       return PGM_REGION_THIRD_TRANS;
+               if (rtte.tt != TABLE_TYPE_REGION3)
+                       return PGM_TRANSLATION_SPEC;
+               if (rtte.cr && asce.p && edat2)
+                       return PGM_TRANSLATION_SPEC;
+               if (rtte.fc && edat2) {
+                       dat_protection |= rtte.fc1.p;
+                       raddr.rfaa = rtte.fc1.rfaa;
+                       goto absolute_address;
+               }
+               if (vaddr.sx01 < rtte.fc0.tf)
+                       return PGM_SEGMENT_TRANSLATION;
+               if (vaddr.sx01 > rtte.fc0.tl)
+                       return PGM_SEGMENT_TRANSLATION;
+               if (edat1)
+                       dat_protection |= rtte.fc0.p;
+               ptr = rtte.fc0.sto * 4096 + vaddr.sx * 8;
+       }
+               /* fallthrough */
+       case ASCE_TYPE_SEGMENT: {
+               union segment_table_entry ste;
+
+               if (kvm_is_error_gpa(vcpu->kvm, ptr))
+                       return PGM_ADDRESSING;
+               if (deref_table(vcpu->kvm, ptr, &ste.val))
+                       return -EFAULT;
+               if (ste.i)
+                       return PGM_SEGMENT_TRANSLATION;
+               if (ste.tt != TABLE_TYPE_SEGMENT)
+                       return PGM_TRANSLATION_SPEC;
+               if (ste.cs && asce.p)
+                       return PGM_TRANSLATION_SPEC;
+               if (ste.fc && edat1) {
+                       dat_protection |= ste.fc1.p;
+                       raddr.sfaa = ste.fc1.sfaa;
+                       goto absolute_address;
+               }
+               dat_protection |= ste.fc0.p;
+               ptr = ste.fc0.pto * 2048 + vaddr.px * 8;
+       }
+       }
+       if (kvm_is_error_gpa(vcpu->kvm, ptr))
+               return PGM_ADDRESSING;
+       if (deref_table(vcpu->kvm, ptr, &pte.val))
+               return -EFAULT;
+       if (pte.i)
+               return PGM_PAGE_TRANSLATION;
+       if (pte.z)
+               return PGM_TRANSLATION_SPEC;
+       if (pte.co && !edat1)
+               return PGM_TRANSLATION_SPEC;
+       dat_protection |= pte.p;
+       raddr.pfra = pte.pfra;
+real_address:
+       raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr);
+absolute_address:
+       if (write && dat_protection)
+               return PGM_PROTECTION;
+       if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
+               return PGM_ADDRESSING;
+       *gpa = raddr.addr;
+       return 0;
+}
+
+static inline int is_low_address(unsigned long ga)
+{
+       /* Check for address ranges 0..511 and 4096..4607 */
+       return (ga & ~0x11fful) == 0;
+}
+
+static int low_address_protection_enabled(struct kvm_vcpu *vcpu)
+{
+       union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       union asce asce;
+
+       if (!ctlreg0.lap)
+               return 0;
+       asce.val = get_vcpu_asce(vcpu);
+       if (psw_bits(*psw).t && asce.p)
+               return 0;
+       return 1;
+}
+
+struct trans_exc_code_bits {
+       unsigned long addr : 52; /* Translation-exception Address */
+       unsigned long fsi  : 2;  /* Access Exception Fetch/Store Indication */
+       unsigned long      : 7;
+       unsigned long b61  : 1;
+       unsigned long as   : 2;  /* ASCE Identifier */
+};
+
+enum {
+       FSI_UNKNOWN = 0, /* Unknown whether fetch or store */
+       FSI_STORE   = 1, /* Exception was due to store operation */
+       FSI_FETCH   = 2  /* Exception was due to fetch operation */
+};
+
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
+                           unsigned long *pages, unsigned long nr_pages,
+                           int write)
+{
+       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       struct trans_exc_code_bits *tec_bits;
+       int lap_enabled, rc;
+
+       memset(pgm, 0, sizeof(*pgm));
+       tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+       tec_bits->fsi = write ? FSI_STORE : FSI_FETCH;
+       tec_bits->as = psw_bits(*psw).as;
+       lap_enabled = low_address_protection_enabled(vcpu);
+       while (nr_pages) {
+               ga = kvm_s390_logical_to_effective(vcpu, ga);
+               tec_bits->addr = ga >> PAGE_SHIFT;
+               if (write && lap_enabled && is_low_address(ga)) {
+                       pgm->code = PGM_PROTECTION;
+                       return pgm->code;
+               }
+               ga &= PAGE_MASK;
+               if (psw_bits(*psw).t) {
+                       rc = guest_translate(vcpu, ga, pages, write);
+                       if (rc < 0)
+                               return rc;
+                       if (rc == PGM_PROTECTION)
+                               tec_bits->b61 = 1;
+                       if (rc)
+                               pgm->code = rc;
+               } else {
+                       *pages = kvm_s390_real_to_abs(vcpu, ga);
+                       if (kvm_is_error_gpa(vcpu->kvm, *pages))
+                               pgm->code = PGM_ADDRESSING;
+               }
+               if (pgm->code)
+                       return pgm->code;
+               ga += PAGE_SIZE;
+               pages++;
+               nr_pages--;
+       }
+       return 0;
+}
+
+int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+                unsigned long len, int write)
+{
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       unsigned long _len, nr_pages, gpa, idx;
+       unsigned long pages_array[2];
+       unsigned long *pages;
+       int need_ipte_lock;
+       union asce asce;
+       int rc;
+
+       if (!len)
+               return 0;
+       /* Access register mode is not supported yet. */
+       if (psw_bits(*psw).t && psw_bits(*psw).as == PSW_AS_ACCREG)
+               return -EOPNOTSUPP;
+       nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
+       pages = pages_array;
+       if (nr_pages > ARRAY_SIZE(pages_array))
+               pages = vmalloc(nr_pages * sizeof(unsigned long));
+       if (!pages)
+               return -ENOMEM;
+       asce.val = get_vcpu_asce(vcpu);
+       need_ipte_lock = psw_bits(*psw).t && !asce.r;
+       if (need_ipte_lock)
+               ipte_lock(vcpu);
+       rc = guest_page_range(vcpu, ga, pages, nr_pages, write);
+       for (idx = 0; idx < nr_pages && !rc; idx++) {
+               gpa = *(pages + idx) + (ga & ~PAGE_MASK);
+               _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
+               if (write)
+                       rc = kvm_write_guest(vcpu->kvm, gpa, data, _len);
+               else
+                       rc = kvm_read_guest(vcpu->kvm, gpa, data, _len);
+               len -= _len;
+               ga += _len;
+               data += _len;
+       }
+       if (need_ipte_lock)
+               ipte_unlock(vcpu);
+       if (nr_pages > ARRAY_SIZE(pages_array))
+               vfree(pages);
+       return rc;
+}
+
+int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
+                     void *data, unsigned long len, int write)
+{
+       unsigned long _len, gpa;
+       int rc = 0;
+
+       while (len && !rc) {
+               gpa = kvm_s390_real_to_abs(vcpu, gra);
+               _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
+               if (write)
+                       rc = write_guest_abs(vcpu, gpa, data, _len);
+               else
+                       rc = read_guest_abs(vcpu, gpa, data, _len);
+               len -= _len;
+               gra += _len;
+               data += _len;
+       }
+       return rc;
+}
+
+/**
+ * guest_translate_address - translate guest logical into guest absolute address
+ *
+ * Parameter semantics are the same as the ones from guest_translate.
+ * The memory contents at the guest address are not changed.
+ *
+ * Note: The IPTE lock is not taken during this function, so the caller
+ * has to take care of this.
+ */
+int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
+                           unsigned long *gpa, int write)
+{
+       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       struct trans_exc_code_bits *tec;
+       union asce asce;
+       int rc;
+
+       /* Access register mode is not supported yet. */
+       if (psw_bits(*psw).t && psw_bits(*psw).as == PSW_AS_ACCREG)
+               return -EOPNOTSUPP;
+
+       gva = kvm_s390_logical_to_effective(vcpu, gva);
+       memset(pgm, 0, sizeof(*pgm));
+       tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+       tec->as = psw_bits(*psw).as;
+       tec->fsi = write ? FSI_STORE : FSI_FETCH;
+       tec->addr = gva >> PAGE_SHIFT;
+       if (is_low_address(gva) && low_address_protection_enabled(vcpu)) {
+               if (write) {
+                       rc = pgm->code = PGM_PROTECTION;
+                       return rc;
+               }
+       }
+
+       asce.val = get_vcpu_asce(vcpu);
+       if (psw_bits(*psw).t && !asce.r) {      /* Use DAT? */
+               rc = guest_translate(vcpu, gva, gpa, write);
+               if (rc > 0) {
+                       if (rc == PGM_PROTECTION)
+                               tec->b61 = 1;
+                       pgm->code = rc;
+               }
+       } else {
+               rc = 0;
+               *gpa = kvm_s390_real_to_abs(vcpu, gva);
+               if (kvm_is_error_gpa(vcpu->kvm, *gpa))
+                       rc = pgm->code = PGM_ADDRESSING;
+       }
+
+       return rc;
+}
+
+/**
+ * kvm_s390_check_low_addr_protection - check for low-address protection
+ * @ga: Guest address
+ *
+ * Checks whether an address is subject to low-address protection and sets
+ * up vcpu->arch.pgm accordingly if necessary.
+ *
+ * Return: 0 if no protection exception, or PGM_PROTECTION if protected.
+ */
+int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga)
+{
+       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       struct trans_exc_code_bits *tec_bits;
+
+       if (!is_low_address(ga) || !low_address_protection_enabled(vcpu))
+               return 0;
+
+       memset(pgm, 0, sizeof(*pgm));
+       tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+       tec_bits->fsi = FSI_STORE;
+       tec_bits->as = psw_bits(*psw).as;
+       tec_bits->addr = ga >> PAGE_SHIFT;
+       pgm->code = PGM_PROTECTION;
+
+       return pgm->code;
+}
index 374a439ccc6080a004c7593f6227bc0c799ff7a6..0149cf15058ab9e8d12918192353884ac4ce8f4f 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * access guest memory
  *
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2014
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
 
 #include <linux/compiler.h>
 #include <linux/kvm_host.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
 #include "kvm-s390.h"
 
-/* Convert real to absolute address by applying the prefix of the CPU */
+/**
+ * kvm_s390_real_to_abs - convert guest real address to guest absolute address
+ * @vcpu: guest virtual cpu
+ * @gra: guest real address
+ *
+ * Returns the guest absolute address that corresponds to the passed guest real
+ * address @gra of a virtual guest cpu by applying its prefix.
+ */
 static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
-                                                unsigned long gaddr)
+                                                unsigned long gra)
 {
-       unsigned long prefix  = vcpu->arch.sie_block->prefix;
-       if (gaddr < 2 * PAGE_SIZE)
-               gaddr += prefix;
-       else if (gaddr >= prefix && gaddr < prefix + 2 * PAGE_SIZE)
-               gaddr -= prefix;
-       return gaddr;
+       unsigned long prefix  = kvm_s390_get_prefix(vcpu);
+
+       if (gra < 2 * PAGE_SIZE)
+               gra += prefix;
+       else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
+               gra -= prefix;
+       return gra;
 }
 
-static inline void __user *__gptr_to_uptr(struct kvm_vcpu *vcpu,
-                                         void __user *gptr,
-                                         int prefixing)
+/**
+ * kvm_s390_logical_to_effective - convert guest logical to effective address
+ * @vcpu: guest virtual cpu
+ * @ga: guest logical address
+ *
+ * Convert a guest vcpu logical address to a guest vcpu effective address by
+ * applying the rules of the vcpu's addressing mode defined by PSW bits 31
+ * and 32 (extended/basic addressing mode).
+ *
+ * Depending on the vcpu's addressing mode the upper 40 bits (24 bit addressing
+ * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing mode)
+ * of @ga will be zeroed and the remaining bits will be returned.
+ */
+static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
+                                                         unsigned long ga)
 {
-       unsigned long gaddr = (unsigned long) gptr;
-       unsigned long uaddr;
-
-       if (prefixing)
-               gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
-       uaddr = gmap_fault(gaddr, vcpu->arch.gmap);
-       if (IS_ERR_VALUE(uaddr))
-               uaddr = -EFAULT;
-       return (void __user *)uaddr;
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+
+       if (psw_bits(*psw).eaba == PSW_AMODE_64BIT)
+               return ga;
+       if (psw_bits(*psw).eaba == PSW_AMODE_31BIT)
+               return ga & ((1UL << 31) - 1);
+       return ga & ((1UL << 24) - 1);
 }
 
-#define get_guest(vcpu, x, gptr)                               \
-({                                                             \
-       __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\
-       int __mask = sizeof(__typeof__(*(gptr))) - 1;           \
-       int __ret;                                              \
-                                                               \
-       if (IS_ERR((void __force *)__uptr)) {                   \
-               __ret = PTR_ERR((void __force *)__uptr);        \
-       } else {                                                \
-               BUG_ON((unsigned long)__uptr & __mask);         \
-               __ret = get_user(x, __uptr);                    \
-       }                                                       \
-       __ret;                                                  \
-})
+/*
+ * put_guest_lc, read_guest_lc and write_guest_lc are guest access functions
+ * which shall only be used to access the lowcore of a vcpu.
+ * These functions should be used for e.g. interrupt handlers where no
+ * guest memory access protection facilities, like key or low address
+ * protection, are applicable.
+ * At a later point guest vcpu lowcore access should happen via pinned
+ * prefix pages, so that these pages can be accessed directly via the
+ * kernel mapping. All of these *_lc functions can be removed then.
+ */
 
-#define put_guest(vcpu, x, gptr)                               \
+/**
+ * put_guest_lc - write a simple variable to a guest vcpu's lowcore
+ * @vcpu: virtual cpu
+ * @x: value to copy to guest
+ * @gra: vcpu's destination guest real address
+ *
+ * Copies a simple value from kernel space to a guest vcpu's lowcore.
+ * The size of the variable may be 1, 2, 4 or 8 bytes. The destination
+ * must be located in the vcpu's lowcore. Otherwise the result is undefined.
+ *
+ * Returns zero on success or -EFAULT on error.
+ *
+ * Note: an error indicates that either the kernel is out of memory or
+ *      the guest memory mapping is broken. In any case the best solution
+ *      would be to terminate the guest.
+ *      It is wrong to inject a guest exception.
+ */
+#define put_guest_lc(vcpu, x, gra)                             \
 ({                                                             \
-       __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\
-       int __mask = sizeof(__typeof__(*(gptr))) - 1;           \
-       int __ret;                                              \
+       struct kvm_vcpu *__vcpu = (vcpu);                       \
+       __typeof__(*(gra)) __x = (x);                           \
+       unsigned long __gpa;                                    \
                                                                \
-       if (IS_ERR((void __force *)__uptr)) {                   \
-               __ret = PTR_ERR((void __force *)__uptr);        \
-       } else {                                                \
-               BUG_ON((unsigned long)__uptr & __mask);         \
-               __ret = put_user(x, __uptr);                    \
-       }                                                       \
-       __ret;                                                  \
+       __gpa = (unsigned long)(gra);                           \
+       __gpa += kvm_s390_get_prefix(__vcpu);                   \
+       kvm_write_guest(__vcpu->kvm, __gpa, &__x, sizeof(__x)); \
 })
 
-static inline int __copy_guest(struct kvm_vcpu *vcpu, unsigned long to,
-                              unsigned long from, unsigned long len,
-                              int to_guest, int prefixing)
+/**
+ * write_guest_lc - copy data from kernel space to guest vcpu's lowcore
+ * @vcpu: virtual cpu
+ * @gra: vcpu's destination guest real address
+ * @data: source address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy data from kernel space to guest vcpu's lowcore. The entire range must
+ * be located within the vcpu's lowcore, otherwise the result is undefined.
+ *
+ * Returns zero on success or -EFAULT on error.
+ *
+ * Note: an error indicates that either the kernel is out of memory or
+ *      the guest memory mapping is broken. In any case the best solution
+ *      would be to terminate the guest.
+ *      It is wrong to inject a guest exception.
+ */
+static inline __must_check
+int write_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
+                  unsigned long len)
+{
+       unsigned long gpa = gra + kvm_s390_get_prefix(vcpu);
+
+       return kvm_write_guest(vcpu->kvm, gpa, data, len);
+}
+
+/**
+ * read_guest_lc - copy data from guest vcpu's lowcore to kernel space
+ * @vcpu: virtual cpu
+ * @gra: vcpu's source guest real address
+ * @data: destination address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy data from guest vcpu's lowcore to kernel space. The entire range must
+ * be located within the vcpu's lowcore, otherwise the result is undefined.
+ *
+ * Returns zero on success or -EFAULT on error.
+ *
+ * Note: an error indicates that either the kernel is out of memory or
+ *      the guest memory mapping is broken. In any case the best solution
+ *      would be to terminate the guest.
+ *      It is wrong to inject a guest exception.
+ */
+static inline __must_check
+int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
+                 unsigned long len)
+{
+       unsigned long gpa = gra + kvm_s390_get_prefix(vcpu);
+
+       return kvm_read_guest(vcpu->kvm, gpa, data, len);
+}
+
+int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
+                           unsigned long *gpa, int write);
+
+int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+                unsigned long len, int write);
+
+int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
+                     void *data, unsigned long len, int write);
+
+/**
+ * write_guest - copy data from kernel space to guest space
+ * @vcpu: virtual cpu
+ * @ga: guest address
+ * @data: source address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @data (kernel space) to @ga (guest address).
+ * In order to copy data to guest space the PSW of the vcpu is inspected:
+ * If DAT is off data will be copied to guest real or absolute memory.
+ * If DAT is on data will be copied to the address space as specified by
+ * the address space bits of the PSW:
+ * Primary, secondary or home space (access register mode is currently not
+ * implemented).
+ * The addressing mode of the PSW is also inspected, so that address wrap
+ * around is taken into account for 24-, 31- and 64-bit addressing mode,
+ * if the to be copied data crosses page boundaries in guest address space.
+ * In addition also low address and DAT protection are inspected before
+ * copying any data (key protection is currently not implemented).
+ *
+ * This function modifies the 'struct kvm_s390_pgm_info pgm' member of @vcpu.
+ * In case of an access exception (e.g. protection exception) pgm will contain
+ * all data necessary so that a subsequent call to 'kvm_s390_inject_prog_vcpu()'
+ * will inject a correct exception into the guest.
+ * If no access exception happened, the contents of pgm are undefined when
+ * this function returns.
+ *
+ * Returns:  - zero on success
+ *          - a negative value if e.g. the guest mapping is broken or in
+ *            case of out-of-memory. In this case the contents of pgm are
+ *            undefined. Also parts of @data may have been copied to guest
+ *            space.
+ *          - a positive value if an access exception happened. In this case
+ *            the returned value is the program interruption code and the
+ *            contents of pgm may be used to inject an exception into the
+ *            guest. No data has been copied to guest space.
+ *
+ * Note: in case an access exception is recognized no data has been copied to
+ *      guest space (this is also true, if the to be copied data would cross
+ *      one or more page boundaries in guest space).
+ *      Therefore this function may be used for nullifying and suppressing
+ *      instruction emulation.
+ *      It may also be used for terminating instructions, if it is undefined
+ *      if data has been changed in guest space in case of an exception.
+ */
+static inline __must_check
+int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+               unsigned long len)
+{
+       return access_guest(vcpu, ga, data, len, 1);
+}
+
+/**
+ * read_guest - copy data from guest space to kernel space
+ * @vcpu: virtual cpu
+ * @ga: guest address
+ * @data: destination address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @ga (guest address) to @data (kernel space).
+ *
+ * The behaviour of read_guest is identical to write_guest, except that
+ * data will be copied from guest space to kernel space.
+ */
+static inline __must_check
+int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+              unsigned long len)
+{
+       return access_guest(vcpu, ga, data, len, 0);
+}
+
+/**
+ * write_guest_abs - copy data from kernel space to guest space absolute
+ * @vcpu: virtual cpu
+ * @gpa: guest physical (absolute) address
+ * @data: source address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @data (kernel space) to @gpa (guest absolute address).
+ * It is up to the caller to ensure that the entire guest memory range is
+ * valid memory before calling this function.
+ * Guest low address and key protection are not checked.
+ *
+ * Returns zero on success or -EFAULT on error.
+ *
+ * If an error occurs data may have been copied partially to guest memory.
+ */
+static inline __must_check
+int write_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
+                   unsigned long len)
+{
+       return kvm_write_guest(vcpu->kvm, gpa, data, len);
+}
+
+/**
+ * read_guest_abs - copy data from guest space absolute to kernel space
+ * @vcpu: virtual cpu
+ * @gpa: guest physical (absolute) address
+ * @data: destination address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @gpa (guest absolute address) to @data (kernel space).
+ * It is up to the caller to ensure that the entire guest memory range is
+ * valid memory before calling this function.
+ * Guest key protection is not checked.
+ *
+ * Returns zero on success or -EFAULT on error.
+ *
+ * If an error occurs data may have been copied partially to kernel space.
+ */
+static inline __must_check
+int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
+                  unsigned long len)
+{
+       return kvm_read_guest(vcpu->kvm, gpa, data, len);
+}
+
+/**
+ * write_guest_real - copy data from kernel space to guest space real
+ * @vcpu: virtual cpu
+ * @gra: guest real address
+ * @data: source address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @data (kernel space) to @gra (guest real address).
+ * It is up to the caller to ensure that the entire guest memory range is
+ * valid memory before calling this function.
+ * Guest low address and key protection are not checked.
+ *
+ * Returns zero on success or -EFAULT on error.
+ *
+ * If an error occurs data may have been copied partially to guest memory.
+ */
+static inline __must_check
+int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
+                    unsigned long len)
+{
+       return access_guest_real(vcpu, gra, data, len, 1);
+}
+
+/**
+ * read_guest_real - copy data from guest space real to kernel space
+ * @vcpu: virtual cpu
+ * @gra: guest real address
+ * @data: destination address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @gra (guest real address) to @data (kernel space).
+ * It is up to the caller to ensure that the entire guest memory range is
+ * valid memory before calling this function.
+ * Guest key protection is not checked.
+ *
+ * Returns zero on success or -EFAULT on error.
+ *
+ * If an error occurs data may have been copied partially to kernel space.
+ */
+static inline __must_check
+int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
+                   unsigned long len)
 {
-       unsigned long _len, rc;
-       void __user *uptr;
-
-       while (len) {
-               uptr = to_guest ? (void __user *)to : (void __user *)from;
-               uptr = __gptr_to_uptr(vcpu, uptr, prefixing);
-               if (IS_ERR((void __force *)uptr))
-                       return -EFAULT;
-               _len = PAGE_SIZE - ((unsigned long)uptr & (PAGE_SIZE - 1));
-               _len = min(_len, len);
-               if (to_guest)
-                       rc = copy_to_user((void __user *) uptr, (void *)from, _len);
-               else
-                       rc = copy_from_user((void *)to, (void __user *)uptr, _len);
-               if (rc)
-                       return -EFAULT;
-               len -= _len;
-               from += _len;
-               to += _len;
-       }
-       return 0;
+       return access_guest_real(vcpu, gra, data, len, 0);
 }
 
-#define copy_to_guest(vcpu, to, from, size) \
-       __copy_guest(vcpu, to, (unsigned long)from, size, 1, 1)
-#define copy_from_guest(vcpu, to, from, size) \
-       __copy_guest(vcpu, (unsigned long)to, from, size, 0, 1)
-#define copy_to_guest_absolute(vcpu, to, from, size) \
-       __copy_guest(vcpu, to, (unsigned long)from, size, 1, 0)
-#define copy_from_guest_absolute(vcpu, to, from, size) \
-       __copy_guest(vcpu, (unsigned long)to, from, size, 0, 0)
+void ipte_lock(struct kvm_vcpu *vcpu);
+void ipte_unlock(struct kvm_vcpu *vcpu);
+int ipte_lock_held(struct kvm_vcpu *vcpu);
+int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga);
 
 #endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
new file mode 100644 (file)
index 0000000..3e8d409
--- /dev/null
@@ -0,0 +1,482 @@
+/*
+ * kvm guest debug support
+ *
+ * Copyright IBM Corp. 2014
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+/*
+ * Extends the address range given by *start and *stop to include the address
+ * range starting with estart and the length len. Takes care of overflowing
+ * intervals and tries to minimize the overall interval size.
+ *
+ * Both bounds are inclusive: the end of the added range is estart + len - 1
+ * (a len <= 0 is handled like a range covering only estart). A range with
+ * *start > *stop is an "overflowing" one that wraps around the end of the
+ * address space.
+ */
+static void extend_address_range(u64 *start, u64 *stop, u64 estart, int len)
+{
+       u64 estop;
+
+       if (len > 0)
+               len--;
+       else
+               len = 0;
+
+       estop = estart + len;
+
+       /* 0-0 range represents "not set" */
+       if ((*start == 0) && (*stop == 0)) {
+               *start = estart;
+               *stop = estop;
+       } else if (*start <= *stop) {
+               /* increase the existing range */
+               if (estart < *start)
+                       *start = estart;
+               if (estop > *stop)
+                       *stop = estop;
+       } else {
+               /* "overflowing" interval, whereby *start > *stop */
+               if (estart <= *stop) {
+                       if (estop > *stop)
+                               *stop = estop;
+               } else if (estop > *start) {
+                       if (estart < *start)
+                               *start = estart;
+               }
+               /*
+                * minimize the range: the new range lies in the gap between
+                * *stop and *start, so grow whichever bound is closer
+                */
+               else if ((estop - *stop) < (*start - estart))
+                       *stop = estop;
+               else
+                       *start = estart;
+       }
+}
+
+#define MAX_INST_SIZE 6
+
+/*
+ * Set up the PER control registers (cr9-cr11) of the SIE block for all
+ * imported hardware breakpoints: instruction-fetching and branching events
+ * are enabled in cr9 and the cr10/cr11 address range is extended to cover
+ * every breakpoint. Does nothing if no hardware breakpoints are registered.
+ */
+static void enable_all_hw_bp(struct kvm_vcpu *vcpu)
+{
+       unsigned long start, len;
+       u64 *cr9 = &vcpu->arch.sie_block->gcr[9];
+       u64 *cr10 = &vcpu->arch.sie_block->gcr[10];
+       u64 *cr11 = &vcpu->arch.sie_block->gcr[11];
+       int i;
+
+       if (vcpu->arch.guestdbg.nr_hw_bp <= 0 ||
+           vcpu->arch.guestdbg.hw_bp_info == NULL)
+               return;
+
+       /*
+        * If the guest is not interested in branching events, we can safely
+        * limit them to the PER address range.
+        */
+       if (!(*cr9 & PER_EVENT_BRANCH))
+               *cr9 |= PER_CONTROL_BRANCH_ADDRESS;
+       *cr9 |= PER_EVENT_IFETCH | PER_EVENT_BRANCH;
+
+       for (i = 0; i < vcpu->arch.guestdbg.nr_hw_bp; i++) {
+               start = vcpu->arch.guestdbg.hw_bp_info[i].addr;
+               len = vcpu->arch.guestdbg.hw_bp_info[i].len;
+
+               /*
+                * The instruction in front of the desired bp has to
+                * report instruction-fetching events, so the range is
+                * widened by MAX_INST_SIZE bytes towards lower addresses
+                * (clamped at address 0).
+                */
+               if (start < MAX_INST_SIZE) {
+                       len += start;
+                       start = 0;
+               } else {
+                       start -= MAX_INST_SIZE;
+                       len += MAX_INST_SIZE;
+               }
+
+               extend_address_range(cr10, cr11, start, len);
+       }
+}
+
+/*
+ * Set up the PER control registers (cr9-cr11) of the SIE block for all
+ * imported hardware watchpoints. Does nothing if none are registered.
+ */
+static void enable_all_hw_wp(struct kvm_vcpu *vcpu)
+{
+       u64 *cr9 = &vcpu->arch.sie_block->gcr[9];
+       u64 *cr10 = &vcpu->arch.sie_block->gcr[10];
+       u64 *cr11 = &vcpu->arch.sie_block->gcr[11];
+       struct kvm_hw_wp_info_arch *wp;
+       unsigned long wp_start, wp_len;
+       int store_alteration, idx;
+
+       if (vcpu->arch.guestdbg.hw_wp_info == NULL ||
+           vcpu->arch.guestdbg.nr_hw_wp <= 0)
+               return;
+
+       /* sample the current setup before the alteration control is cleared */
+       store_alteration = (*cr9 & PER_EVENT_STORE) &&
+                          (*cr9 & PER_CONTROL_ALTERATION);
+       *cr9 &= ~PER_CONTROL_ALTERATION;
+
+       if (store_alteration) {
+               /*
+                * Store events limited by the storage-alteration control
+                * were already enabled: report events for the range
+                * 0 - PSW_ADDR_INSN and give all of them to the guest.
+                */
+               *cr10 = 0;
+               *cr11 = PSW_ADDR_INSN;
+               return;
+       }
+
+       *cr9 |= PER_EVENT_STORE;
+
+       /* make the PER address range cover every watchpoint */
+       for (idx = 0; idx < vcpu->arch.guestdbg.nr_hw_wp; idx++) {
+               wp = &vcpu->arch.guestdbg.hw_wp_info[idx];
+               wp_start = wp->addr;
+               wp_len = wp->len;
+
+               extend_address_range(cr10, cr11, wp_start, wp_len);
+       }
+}
+
+/*
+ * Save the guest's control registers 0, 9, 10 and 11 from the SIE block
+ * into vcpu->arch.guestdbg so that they can be restored later on.
+ */
+void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu)
+{
+       const u64 *gcr = &vcpu->arch.sie_block->gcr[0];
+
+       vcpu->arch.guestdbg.cr0 = gcr[0];
+       vcpu->arch.guestdbg.cr9 = gcr[9];
+       vcpu->arch.guestdbg.cr10 = gcr[10];
+       vcpu->arch.guestdbg.cr11 = gcr[11];
+}
+
+/*
+ * Write the control registers 0, 9, 10 and 11 saved in vcpu->arch.guestdbg
+ * back into the SIE block.
+ */
+void kvm_s390_restore_guest_per_regs(struct kvm_vcpu *vcpu)
+{
+       u64 *gcr = &vcpu->arch.sie_block->gcr[0];
+
+       gcr[0] = vcpu->arch.guestdbg.cr0;
+       gcr[9] = vcpu->arch.guestdbg.cr9;
+       gcr[10] = vcpu->arch.guestdbg.cr10;
+       gcr[11] = vcpu->arch.guestdbg.cr11;
+}
+
+/*
+ * Modify the control registers in the SIE block according to the active
+ * debugging features (single stepping, hardware breakpoints/watchpoints).
+ */
+void kvm_s390_patch_guest_per_regs(struct kvm_vcpu *vcpu)
+{
+       u64 *gcr = &vcpu->arch.sie_block->gcr[0];
+
+       /*
+        * TODO: only do this if the guest psw has per enabled, otherwise 0s!
+        * This reduces the amount of reported events.
+        * Need to intercept all psw changes!
+        */
+
+       if (guestdbg_sstep_enabled(vcpu)) {
+               /* disable timer (clock-comparator) interrupts */
+               gcr[0] &= ~0x800ul;
+               /* instruction-fetching events for the range 0 - PSW_ADDR_INSN */
+               gcr[9] |= PER_EVENT_IFETCH;
+               gcr[10] = 0;
+               gcr[11] = PSW_ADDR_INSN;
+       }
+
+       if (guestdbg_hw_bp_enabled(vcpu)) {
+               enable_all_hw_bp(vcpu);
+               enable_all_hw_wp(vcpu);
+       }
+
+       /* TODO: Instruction-fetching-nullification not allowed for now */
+       gcr[9] &= ~PER_EVENT_NULLIFICATION;
+}
+
+#define MAX_WP_SIZE 100
+
+/*
+ * Fill *wp_info from the breakpoint description *bp_data and take a snapshot
+ * of the watched guest memory into a newly allocated wp_info->old_data.
+ *
+ * Returns 0 on success, -EINVAL if the length is negative or larger than
+ * MAX_WP_SIZE, -ENOMEM if the snapshot buffer can't be allocated, or the
+ * error reported by read_guest(). On failure wp_info->old_data is NULL.
+ */
+static int __import_wp_info(struct kvm_vcpu *vcpu,
+                           struct kvm_hw_breakpoint *bp_data,
+                           struct kvm_hw_wp_info_arch *wp_info)
+{
+       int rc;
+
+       wp_info->old_data = NULL;
+       wp_info->phys_addr = bp_data->phys_addr;
+       wp_info->addr = bp_data->addr;
+       wp_info->len = bp_data->len;
+
+       if (wp_info->len < 0 || wp_info->len > MAX_WP_SIZE)
+               return -EINVAL;
+
+       wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL);
+       if (!wp_info->old_data)
+               return -ENOMEM;
+
+       /* try to backup the original value */
+       rc = read_guest(vcpu, wp_info->phys_addr, wp_info->old_data,
+                       wp_info->len);
+       if (!rc)
+               return 0;
+
+       /* no snapshot available, don't keep the buffer around */
+       kfree(wp_info->old_data);
+       wp_info->old_data = NULL;
+       return rc;
+}
+
+#define MAX_BP_COUNT 50
+
+/*
+ * Import the hardware breakpoints and watchpoints described by dbg->arch
+ * from user space and store them in vcpu->arch.guestdbg.
+ *
+ * Returns 0 on success (also if there is nothing to import), -EINVAL if more
+ * than MAX_BP_COUNT entries are given, -ENOMEM on allocation failure,
+ * -EFAULT if the user space array can't be read, or the error reported by
+ * __import_wp_info(). On failure vcpu->arch.guestdbg is left untouched and
+ * everything allocated here is released again.
+ *
+ * NOTE(review): the previous hw_bp_info/hw_wp_info pointers are overwritten
+ * without being freed; presumably callers run kvm_s390_clear_bp_data()
+ * first - confirm.
+ */
+int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
+                           struct kvm_guest_debug *dbg)
+{
+       int ret = 0, nr_wp = 0, nr_bp = 0, i, size;
+       /* number of watchpoints that own an old_data buffer already */
+       int imported_wp = 0;
+       struct kvm_hw_breakpoint *bp_data = NULL;
+       struct kvm_hw_wp_info_arch *wp_info = NULL;
+       struct kvm_hw_bp_info_arch *bp_info = NULL;
+
+       if (dbg->arch.nr_hw_bp <= 0 || !dbg->arch.hw_bp)
+               return 0;
+       else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT)
+               return -EINVAL;
+
+       size = dbg->arch.nr_hw_bp * sizeof(struct kvm_hw_breakpoint);
+       bp_data = kmalloc(size, GFP_KERNEL);
+       if (!bp_data) {
+               ret = -ENOMEM;
+               goto error;
+       }
+
+       if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) {
+               ret = -EFAULT;
+               goto error;
+       }
+
+       /* first pass: count the entries per type to size the arrays */
+       for (i = 0; i < dbg->arch.nr_hw_bp; i++) {
+               switch (bp_data[i].type) {
+               case KVM_HW_WP_WRITE:
+                       nr_wp++;
+                       break;
+               case KVM_HW_BP:
+                       nr_bp++;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       size = nr_wp * sizeof(struct kvm_hw_wp_info_arch);
+       if (size > 0) {
+               wp_info = kmalloc(size, GFP_KERNEL);
+               if (!wp_info) {
+                       ret = -ENOMEM;
+                       goto error;
+               }
+       }
+       size = nr_bp * sizeof(struct kvm_hw_bp_info_arch);
+       if (size > 0) {
+               bp_info = kmalloc(size, GFP_KERNEL);
+               if (!bp_info) {
+                       ret = -ENOMEM;
+                       goto error;
+               }
+       }
+
+       /* second pass: fill the arrays */
+       for (nr_bp = 0, i = 0; i < dbg->arch.nr_hw_bp; i++) {
+               switch (bp_data[i].type) {
+               case KVM_HW_WP_WRITE:
+                       ret = __import_wp_info(vcpu, &bp_data[i],
+                                              &wp_info[imported_wp]);
+                       if (ret)
+                               goto error;
+                       imported_wp++;
+                       break;
+               case KVM_HW_BP:
+                       bp_info[nr_bp].len = bp_data[i].len;
+                       bp_info[nr_bp].addr = bp_data[i].addr;
+                       nr_bp++;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       /* all values were copied, the temporary array is not needed anymore */
+       kfree(bp_data);
+
+       vcpu->arch.guestdbg.nr_hw_bp = nr_bp;
+       vcpu->arch.guestdbg.hw_bp_info = bp_info;
+       vcpu->arch.guestdbg.nr_hw_wp = imported_wp;
+       vcpu->arch.guestdbg.hw_wp_info = wp_info;
+       return 0;
+error:
+       /* release the snapshots of the watchpoints imported so far */
+       for (i = 0; i < imported_wp; i++)
+               kfree(wp_info[i].old_data);
+       kfree(bp_data);
+       kfree(wp_info);
+       kfree(bp_info);
+       return ret;
+}
+
+/*
+ * Free all hardware breakpoint and watchpoint data stored in
+ * vcpu->arch.guestdbg (including the watchpoint snapshots) and reset the
+ * pointers and counters.
+ */
+void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu)
+{
+       struct kvm_hw_wp_info_arch *wp;
+       int idx;
+
+       /* the snapshots have to go before the array holding them */
+       for (idx = 0; idx < vcpu->arch.guestdbg.nr_hw_wp; idx++) {
+               wp = &vcpu->arch.guestdbg.hw_wp_info[idx];
+               kfree(wp->old_data);
+               wp->old_data = NULL;
+       }
+
+       kfree(vcpu->arch.guestdbg.hw_wp_info);
+       kfree(vcpu->arch.guestdbg.hw_bp_info);
+
+       vcpu->arch.guestdbg.hw_wp_info = NULL;
+       vcpu->arch.guestdbg.nr_hw_wp = 0;
+       vcpu->arch.guestdbg.hw_bp_info = NULL;
+       vcpu->arch.guestdbg.nr_hw_bp = 0;
+}
+
+/*
+ * Returns non-zero if addr lies within the inclusive range a - b.
+ *
+ * If a > b the range is an "overflowing" one that wraps around the end of
+ * the address space, i.e. it consists of the addresses >= a and the
+ * addresses <= b.
+ */
+static inline int in_addr_range(u64 addr, u64 a, u64 b)
+{
+       if (a <= b)
+               return (addr >= a) && (addr <= b);
+       else
+               /* "overflowing" interval */
+               return (addr >= a) || (addr <= b);
+}
+
+/* last address covered by a breakpoint (argument is a pointer to its info) */
+#define end_of_range(bp_info) ((bp_info)->addr + (bp_info)->len - 1)
+
+/*
+ * Look for a hardware breakpoint that starts at addr or, for breakpoints
+ * with a length > 0, that contains addr in its range.
+ *
+ * Returns the first matching breakpoint or NULL if there is none.
+ */
+static struct kvm_hw_bp_info_arch *find_hw_bp(struct kvm_vcpu *vcpu,
+                                             unsigned long addr)
+{
+       struct kvm_hw_bp_info_arch *bp;
+       int idx;
+
+       for (idx = 0; idx < vcpu->arch.guestdbg.nr_hw_bp; idx++) {
+               bp = &vcpu->arch.guestdbg.hw_bp_info[idx];
+
+               /* addr is directly the start of the bp */
+               if (addr == bp->addr)
+                       return bp;
+               /* addr is in the range of the bp */
+               if (bp->len > 0 &&
+                   in_addr_range(addr, bp->addr, end_of_range(bp)))
+                       return bp;
+       }
+
+       return NULL;
+}
+
+/*
+ * Compare the current guest memory content of every hardware watchpoint
+ * with the snapshot stored in old_data.
+ *
+ * Returns the first watchpoint whose content differs or NULL if none was
+ * found. Watchpoints without a snapshot, or for which the temporary buffer
+ * can't be allocated or the guest memory can't be read, are skipped.
+ */
+static struct kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu)
+{
+       struct kvm_hw_wp_info_arch *wp;
+       void *current_data;
+       int changed, idx;
+
+       for (idx = 0; idx < vcpu->arch.guestdbg.nr_hw_wp; idx++) {
+               wp = &vcpu->arch.guestdbg.hw_wp_info[idx];
+               if (!wp || !wp->old_data || wp->len <= 0)
+                       continue;
+
+               current_data = kmalloc(wp->len, GFP_KERNEL);
+               if (!current_data)
+                       continue;
+
+               /* refetch the wp data and compare it to the old value */
+               changed = !read_guest(vcpu, wp->phys_addr, current_data,
+                                     wp->len) &&
+                         memcmp(current_data, wp->old_data, wp->len);
+               kfree(current_data);
+
+               if (changed)
+                       return wp;
+       }
+
+       return NULL;
+}
+
+/*
+ * Consume a pending debug exit: clear KVM_GUESTDBG_EXIT_PENDING and set
+ * KVM_EXIT_DEBUG as the exit reason in the run structure.
+ */
+void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu)
+{
+       vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING;
+       vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+}
+
+/* non-zero if the PER code contains an ifetch or branch event */
+#define per_bp_event(code) \
+                       ((code) & (PER_EVENT_IFETCH | PER_EVENT_BRANCH))
+/* non-zero if the PER code contains a store or store-real event */
+#define per_write_wp_event(code) \
+                       ((code) & (PER_EVENT_STORE | PER_EVENT_STORE_REAL))
+
+/*
+ * Check whether the PER event described in the SIE block has to be reported
+ * to user space as a debug event (hardware watchpoint, hardware breakpoint
+ * or single step). If so, the address and type of the event are stored in
+ * vcpu->run->debug.arch.
+ *
+ * Returns 1 if a debug exit is required, 0 otherwise.
+ */
+static int debug_exit_required(struct kvm_vcpu *vcpu)
+{
+       u32 perc = (vcpu->arch.sie_block->perc << 24);
+       struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
+       struct kvm_hw_wp_info_arch *wp_info = NULL;
+       struct kvm_hw_bp_info_arch *bp_info = NULL;
+       unsigned long addr = vcpu->arch.sie_block->gpsw.addr;
+       unsigned long peraddr = vcpu->arch.sie_block->peraddr;
+
+       if (guestdbg_hw_bp_enabled(vcpu)) {
+               /* watchpoints are checked first and win over breakpoints */
+               if (per_write_wp_event(perc) &&
+                   vcpu->arch.guestdbg.nr_hw_wp > 0) {
+                       wp_info = any_wp_changed(vcpu);
+                       if (wp_info) {
+                               debug_exit->addr = wp_info->addr;
+                               debug_exit->type = KVM_HW_WP_WRITE;
+                               goto exit_required;
+                       }
+               }
+               if (per_bp_event(perc) &&
+                        vcpu->arch.guestdbg.nr_hw_bp > 0) {
+                       bp_info = find_hw_bp(vcpu, addr);
+                       /* remove duplicate events if PC==PER address */
+                       if (bp_info && (addr != peraddr)) {
+                               debug_exit->addr = addr;
+                               debug_exit->type = KVM_HW_BP;
+                               vcpu->arch.guestdbg.last_bp = addr;
+                               goto exit_required;
+                       }
+                       /*
+                        * breakpoint missed: report a bp at the PER address
+                        * unless it is the one recorded in last_bp
+                        */
+                       bp_info = find_hw_bp(vcpu, peraddr);
+                       if (bp_info && vcpu->arch.guestdbg.last_bp != peraddr) {
+                               debug_exit->addr = peraddr;
+                               debug_exit->type = KVM_HW_BP;
+                               goto exit_required;
+                       }
+               }
+       }
+       /* no hw bp/wp matched: any ifetch/branch event is a single step */
+       if (guestdbg_sstep_enabled(vcpu) && per_bp_event(perc)) {
+               debug_exit->addr = addr;
+               debug_exit->type = KVM_SINGLESTEP;
+               goto exit_required;
+       }
+
+       return 0;
+exit_required:
+       return 1;
+}
+
+/* non-zero if the PER bit is set in the mask of the guest PSW */
+#define guest_per_enabled(vcpu) \
+                            ((vcpu)->arch.sie_block->gpsw.mask & PSW_MASK_PER)
+
+/*
+ * Reduce the PER code in the SIE block to the events the guest itself asked
+ * for (according to the control registers 9-11 currently in the SIE block
+ * and the PER bit of the guest PSW). If no event remains, the PER bit is
+ * removed from the program interruption code as well.
+ */
+static void filter_guest_per_event(struct kvm_vcpu *vcpu)
+{
+       u32 perc = vcpu->arch.sie_block->perc << 24;
+       u64 peraddr = vcpu->arch.sie_block->peraddr;
+       u64 addr = vcpu->arch.sie_block->gpsw.addr;
+       u64 cr9 = vcpu->arch.sie_block->gcr[9];
+       u64 cr10 = vcpu->arch.sie_block->gcr[10];
+       u64 cr11 = vcpu->arch.sie_block->gcr[11];
+       /* filter all events, demanded by the guest */
+       u32 guest_perc = perc & cr9 & PER_EVENT_MASK;
+
+       if (!guest_per_enabled(vcpu))
+               guest_perc = 0;
+
+       /* filter "successful-branching" events */
+       if (guest_perc & PER_EVENT_BRANCH &&
+           cr9 & PER_CONTROL_BRANCH_ADDRESS &&
+           !in_addr_range(addr, cr10, cr11))
+               guest_perc &= ~PER_EVENT_BRANCH;
+
+       /* filter "instruction-fetching" events */
+       if (guest_perc & PER_EVENT_IFETCH &&
+           !in_addr_range(peraddr, cr10, cr11))
+               guest_perc &= ~PER_EVENT_IFETCH;
+
+       /* All other PER events will be given to the guest */
+       /* TODO: Check altered address/address space */
+
+       vcpu->arch.sie_block->perc = guest_perc >> 24;
+
+       if (!guest_perc)
+               vcpu->arch.sie_block->iprcc &= ~PGM_PER;
+}
+
+/*
+ * Handle an intercepted PER event: mark a debug exit as pending if the event
+ * has to be reported to user space and then strip everything from the PER
+ * code that the guest did not ask for.
+ */
+void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu)
+{
+       /* has to run first, the filtering below modifies the PER code */
+       if (debug_exit_required(vcpu))
+               vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING;
+
+       filter_guest_per_event(vcpu);
+}
index eeb1ac7d8fa48798a79c18a5015aae5aa372c455..a0b586c1913c18827b4a00e1b21f025d21c2515a 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * in-kernel handling for sie intercepts
  *
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2014
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -16,6 +16,8 @@
 #include <linux/pagemap.h>
 
 #include <asm/kvm_host.h>
+#include <asm/asm-offsets.h>
+#include <asm/irq.h>
 
 #include "kvm-s390.h"
 #include "gaccess.h"
@@ -29,6 +31,7 @@ static const intercept_handler_t instruction_handlers[256] = {
        [0x83] = kvm_s390_handle_diag,
        [0xae] = kvm_s390_handle_sigp,
        [0xb2] = kvm_s390_handle_b2,
+       [0xb6] = kvm_s390_handle_stctl,
        [0xb7] = kvm_s390_handle_lctl,
        [0xb9] = kvm_s390_handle_b9,
        [0xe5] = kvm_s390_handle_e5,
@@ -44,9 +47,6 @@ static int handle_noop(struct kvm_vcpu *vcpu)
        case 0x10:
                vcpu->stat.exit_external_request++;
                break;
-       case 0x14:
-               vcpu->stat.exit_external_interrupt++;
-               break;
        default:
                break; /* nothing */
        }
@@ -63,8 +63,7 @@ static int handle_stop(struct kvm_vcpu *vcpu)
        trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
 
        if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) {
-               atomic_set_mask(CPUSTAT_STOPPED,
-                               &vcpu->arch.sie_block->cpuflags);
+               kvm_s390_vcpu_stop(vcpu);
                vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP;
                VCPU_EVENT(vcpu, 3, "%s", "cpu stopped");
                rc = -EOPNOTSUPP;
@@ -109,22 +108,120 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
        return -EOPNOTSUPP;
 }
 
+/*
+ * Collect the information belonging to the intercepted program interruption
+ * from the SIE block into *pgm_info.
+ *
+ * *pgm_info is zeroed first; which fields get filled afterwards depends on
+ * the interruption code (with the PER bit masked out). The PER related
+ * fields are filled in addition if the PGM_PER bit is set in the code.
+ */
+static void __extract_prog_irq(struct kvm_vcpu *vcpu,
+                              struct kvm_s390_pgm_info *pgm_info)
+{
+       memset(pgm_info, 0, sizeof(struct kvm_s390_pgm_info));
+       pgm_info->code = vcpu->arch.sie_block->iprcc;
+
+       switch (vcpu->arch.sie_block->iprcc & ~PGM_PER) {
+       case PGM_AFX_TRANSLATION:
+       case PGM_ASX_TRANSLATION:
+       case PGM_EX_TRANSLATION:
+       case PGM_LFX_TRANSLATION:
+       case PGM_LSTE_SEQUENCE:
+       case PGM_LSX_TRANSLATION:
+       case PGM_LX_TRANSLATION:
+       case PGM_PRIMARY_AUTHORITY:
+       case PGM_SECONDARY_AUTHORITY:
+       case PGM_SPACE_SWITCH:
+               pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
+               break;
+       case PGM_ALEN_TRANSLATION:
+       case PGM_ALE_SEQUENCE:
+       case PGM_ASTE_INSTANCE:
+       case PGM_ASTE_SEQUENCE:
+       case PGM_ASTE_VALIDITY:
+       case PGM_EXTENDED_AUTHORITY:
+               pgm_info->exc_access_id = vcpu->arch.sie_block->eai;
+               break;
+       case PGM_ASCE_TYPE:
+       case PGM_PAGE_TRANSLATION:
+       case PGM_REGION_FIRST_TRANS:
+       case PGM_REGION_SECOND_TRANS:
+       case PGM_REGION_THIRD_TRANS:
+       case PGM_SEGMENT_TRANSLATION:
+               pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
+               pgm_info->exc_access_id  = vcpu->arch.sie_block->eai;
+               pgm_info->op_access_id  = vcpu->arch.sie_block->oai;
+               break;
+       case PGM_MONITOR:
+               /* the monitor code is taken from the tecmc field as well */
+               pgm_info->mon_class_nr = vcpu->arch.sie_block->mcn;
+               pgm_info->mon_code = vcpu->arch.sie_block->tecmc;
+               break;
+       case PGM_DATA:
+               pgm_info->data_exc_code = vcpu->arch.sie_block->dxc;
+               break;
+       case PGM_PROTECTION:
+               pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
+               pgm_info->exc_access_id  = vcpu->arch.sie_block->eai;
+               break;
+       default:
+               /* all other codes carry no additional information */
+               break;
+       }
+
+       if (vcpu->arch.sie_block->iprcc & PGM_PER) {
+               pgm_info->per_code = vcpu->arch.sie_block->perc;
+               pgm_info->per_atmid = vcpu->arch.sie_block->peratmid;
+               pgm_info->per_address = vcpu->arch.sie_block->peraddr;
+               pgm_info->per_access_id = vcpu->arch.sie_block->peraid;
+       }
+}
+
+/*
+ * Restore the ITDB to the program-interruption TDB in the guest lowcore and
+ * clear the ITDB afterwards.
+ *
+ * Nothing is done if TE is not enabled for the vcpu, the ITDB is not valid
+ * or PER_FLAG_NO_TE is set for the current thread.
+ *
+ * Returns 0 on success or if there was nothing to do, otherwise the error
+ * reported by write_guest_lc().
+ */
+static int handle_itdb(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_itdb *tdb;
+       int rc;
+
+       if (!IS_TE_ENABLED(vcpu) || !IS_ITDB_VALID(vcpu) ||
+           (current->thread.per_flags & PER_FLAG_NO_TE))
+               return 0;
+
+       tdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba;
+       rc = write_guest_lc(vcpu, __LC_PGM_TDB, tdb, sizeof(*tdb));
+       /* only wipe the ITDB once its content made it to the guest */
+       if (!rc)
+               memset(tdb, 0, sizeof(*tdb));
+
+       return rc;
+}
+
+/* non-zero if the PER bit is set in the program interruption code */
+#define per_event(vcpu) ((vcpu)->arch.sie_block->iprcc & PGM_PER)
+
 static int handle_prog(struct kvm_vcpu *vcpu)
 {
+       struct kvm_s390_pgm_info pgm_info;
+       psw_t psw;
+       int rc;
+
        vcpu->stat.exit_program_interruption++;
 
-       /* Restore ITDB to Program-Interruption TDB in guest memory */
-       if (IS_TE_ENABLED(vcpu) &&
-           !(current->thread.per_flags & PER_FLAG_NO_TE) &&
-           IS_ITDB_VALID(vcpu)) {
-               copy_to_guest(vcpu, TDB_ADDR, vcpu->arch.sie_block->itdba,
-                             sizeof(struct kvm_s390_itdb));
-               memset((void *) vcpu->arch.sie_block->itdba, 0,
-                      sizeof(struct kvm_s390_itdb));
+       if (guestdbg_enabled(vcpu) && per_event(vcpu)) {
+               kvm_s390_handle_per_event(vcpu);
+               /* the interrupt might have been filtered out completely */
+               if (vcpu->arch.sie_block->iprcc == 0)
+                       return 0;
        }
 
        trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc);
-       return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc);
+       if (vcpu->arch.sie_block->iprcc == PGM_SPECIFICATION) {
+               rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &psw, sizeof(psw_t));
+               if (rc)
+                       return rc;
+               /* Avoid endless loops of specification exceptions */
+               if (!is_valid_psw(&psw))
+                       return -EOPNOTSUPP;
+       }
+       rc = handle_itdb(vcpu);
+       if (rc)
+               return rc;
+
+       __extract_prog_irq(vcpu, &pgm_info);
+       return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
 }
 
 static int handle_instruction_and_prog(struct kvm_vcpu *vcpu)
@@ -142,17 +239,110 @@ static int handle_instruction_and_prog(struct kvm_vcpu *vcpu)
        return rc2;
 }
 
+/**
+ * handle_external_interrupt - used for external interruption interceptions
+ *
+ * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if
+ * the new PSW does not have external interrupts disabled. In the first case,
+ * we've got to deliver the interrupt manually, and in the second case, we
+ * drop to userspace to handle the situation there.
+ *
+ * Returns the result of kvm_s390_inject_vcpu() for the interrupt that was
+ * built from the interception, 0 if kvm_s390_si_ext_call_pending() reports
+ * true for an external call, the error from reading the external new PSW,
+ * or -EOPNOTSUPP for cases that are not handled here.
+ */
+static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+{
+       u16 eic = vcpu->arch.sie_block->eic;
+       struct kvm_s390_interrupt irq;
+       psw_t newpsw;
+       int rc;
+
+       vcpu->stat.exit_external_interrupt++;
+
+       /*
+        * Only some interrupt types set more than irq.type below, so start
+        * with a zeroed structure instead of passing on stack garbage.
+        */
+       memset(&irq, 0, sizeof(irq));
+
+       rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
+       if (rc)
+               return rc;
+       /* We can not handle clock comparator or timer interrupt with bad PSW */
+       if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) &&
+           (newpsw.mask & PSW_MASK_EXT))
+               return -EOPNOTSUPP;
+
+       switch (eic) {
+       case EXT_IRQ_CLK_COMP:
+               irq.type = KVM_S390_INT_CLOCK_COMP;
+               break;
+       case EXT_IRQ_CPU_TIMER:
+               irq.type = KVM_S390_INT_CPU_TIMER;
+               break;
+       case EXT_IRQ_EXTERNAL_CALL:
+               if (kvm_s390_si_ext_call_pending(vcpu))
+                       return 0;
+               irq.type = KVM_S390_INT_EXTERNAL_CALL;
+               irq.parm = vcpu->arch.sie_block->extcpuaddr;
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       return kvm_s390_inject_vcpu(vcpu, &irq);
+}
+
+/**
+ * Handle MOVE PAGE partial execution interception.
+ *
+ * This interception can only happen for guests with DAT disabled and
+ * addresses that are currently not mapped in the host. Thus we try to
+ * set up the mappings for the corresponding user pages here (or throw
+ * addressing exceptions in case of illegal guest addresses).
+ *
+ * Returns 0 after both pages were faulted in and the guest PSW was rewound,
+ * the result of injecting the addressing exception, or the non-zero result
+ * of kvm_arch_fault_in_page().
+ */
+static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
+{
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       unsigned long srcaddr, dstaddr;
+       int reg1, reg2, rc;
+
+       kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+       /* Make sure that the source is paged-in */
+       srcaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg2]);
+       if (kvm_is_error_gpa(vcpu->kvm, srcaddr))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
+       if (rc != 0)
+               return rc;
+
+       /* Make sure that the destination is paged-in */
+       dstaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg1]);
+       if (kvm_is_error_gpa(vcpu->kvm, dstaddr))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
+       if (rc != 0)
+               return rc;
+
+       /*
+        * Rewind the PSW by 4 bytes - presumably the length of the MVPG
+        * instruction, so that the guest executes it again.
+        */
+       psw->addr = __rewind_psw(*psw, 4);
+
+       return 0;
+}
+
+/*
+ * Dispatch a partial execution interception to the handler of the
+ * intercepted instruction. Returns the result of that handler or
+ * -EOPNOTSUPP for instructions without one.
+ */
+static int handle_partial_execution(struct kvm_vcpu *vcpu)
+{
+       int rc = -EOPNOTSUPP;
+
+       if (vcpu->arch.sie_block->ipa == 0xb254)         /* MVPG */
+               rc = handle_mvpg_pei(vcpu);
+       else if (vcpu->arch.sie_block->ipa >> 8 == 0xae) /* SIGP */
+               rc = kvm_s390_handle_sigp_pei(vcpu);
+
+       return rc;
+}
+
 static const intercept_handler_t intercept_funcs[] = {
        [0x00 >> 2] = handle_noop,
        [0x04 >> 2] = handle_instruction,
        [0x08 >> 2] = handle_prog,
        [0x0C >> 2] = handle_instruction_and_prog,
        [0x10 >> 2] = handle_noop,
-       [0x14 >> 2] = handle_noop,
+       [0x14 >> 2] = handle_external_interrupt,
        [0x18 >> 2] = handle_noop,
        [0x1C >> 2] = kvm_s390_handle_wait,
        [0x20 >> 2] = handle_validity,
        [0x28 >> 2] = handle_stop,
+       [0x38 >> 2] = handle_partial_execution,
 };
 
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
index 200a8f9390b68cb68556f6a319dd19fe90fa0457..90c8de22a2a0252187d9f176b3ad5f14c5fd8635 100644 (file)
@@ -27,6 +27,8 @@
 #define IOINT_CSSID_MASK 0x03fc0000
 #define IOINT_AI_MASK 0x04000000
 
+static void deliver_ckc_interrupt(struct kvm_vcpu *vcpu);
+
 static int is_ioint(u64 type)
 {
        return ((type & 0xfffe0000u) != 0xfffe0000u);
@@ -56,6 +58,17 @@ static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+/*
+ * Returns 1 if clock comparator interrupts can be delivered to the vcpu,
+ * 0 otherwise.
+ */
+static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
+{
+       if (psw_extint_disabled(vcpu))
+               return 0;
+       /* the 0x800 bit in cr0 has to be set as well */
+       if ((vcpu->arch.sie_block->gcr[0] & 0x800ul) == 0)
+               return 0;
+       /* No timer interrupts when single stepping */
+       return !(guestdbg_enabled(vcpu) && guestdbg_sstep_enabled(vcpu));
+}
+
 static u64 int_word_to_isc_bits(u32 int_word)
 {
        u8 isc = (int_word & 0x38000000) >> 27;
@@ -78,6 +91,14 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
                if (vcpu->arch.sie_block->gcr[0] & 0x4000ul)
                        return 1;
                return 0;
+       case KVM_S390_INT_CLOCK_COMP:
+               return ckc_interrupts_enabled(vcpu);
+       case KVM_S390_INT_CPU_TIMER:
+               if (psw_extint_disabled(vcpu))
+                       return 0;
+               if (vcpu->arch.sie_block->gcr[0] & 0x400ul)
+                       return 1;
+               return 0;
        case KVM_S390_INT_SERVICE:
        case KVM_S390_INT_PFAULT_INIT:
        case KVM_S390_INT_PFAULT_DONE:
@@ -127,11 +148,16 @@ static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
 
 static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
 {
-       atomic_clear_mask(CPUSTAT_ECALL_PEND |
-               CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
-               &vcpu->arch.sie_block->cpuflags);
+       atomic_clear_mask(CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
+                         &vcpu->arch.sie_block->cpuflags);
        vcpu->arch.sie_block->lctl = 0x0000;
-       vcpu->arch.sie_block->ictl &= ~ICTL_LPSW;
+       vcpu->arch.sie_block->ictl &= ~(ICTL_LPSW | ICTL_STCTL | ICTL_PINT);
+
+       if (guestdbg_enabled(vcpu)) {
+               vcpu->arch.sie_block->lctl |= (LCTL_CR0 | LCTL_CR9 |
+                                              LCTL_CR10 | LCTL_CR11);
+               vcpu->arch.sie_block->ictl |= (ICTL_STCTL | ICTL_PINT);
+       }
 }
 
 static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
@@ -149,6 +175,8 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
        case KVM_S390_INT_PFAULT_INIT:
        case KVM_S390_INT_PFAULT_DONE:
        case KVM_S390_INT_VIRTIO:
+       case KVM_S390_INT_CLOCK_COMP:
+       case KVM_S390_INT_CPU_TIMER:
                if (psw_extint_disabled(vcpu))
                        __set_cpuflag(vcpu, CPUSTAT_EXT_INT);
                else
@@ -174,6 +202,106 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
        }
 }
 
+/* Fill the guest lowcore for a program interrupt and load the PGM new PSW. */
+static int __deliver_prog_irq(struct kvm_vcpu *vcpu,
+                             struct kvm_s390_pgm_info *pgm_info)
+{
+       static const unsigned short table[] = { 2, 4, 4, 6 };
+       int rc = 0;
+
+       switch (pgm_info->code & ~PGM_PER) {
+       case PGM_AFX_TRANSLATION:
+       case PGM_ASX_TRANSLATION:
+       case PGM_EX_TRANSLATION:
+       case PGM_LFX_TRANSLATION:
+       case PGM_LSTE_SEQUENCE:
+       case PGM_LSX_TRANSLATION:
+       case PGM_LX_TRANSLATION:
+       case PGM_PRIMARY_AUTHORITY:
+       case PGM_SECONDARY_AUTHORITY:
+       case PGM_SPACE_SWITCH:
+               rc = put_guest_lc(vcpu, pgm_info->trans_exc_code,
+                                 (u64 *)__LC_TRANS_EXC_CODE);
+               break;
+       case PGM_ALEN_TRANSLATION:
+       case PGM_ALE_SEQUENCE:
+       case PGM_ASTE_INSTANCE:
+       case PGM_ASTE_SEQUENCE:
+       case PGM_ASTE_VALIDITY:
+       case PGM_EXTENDED_AUTHORITY:
+               rc = put_guest_lc(vcpu, pgm_info->exc_access_id,
+                                 (u8 *)__LC_EXC_ACCESS_ID);
+               break;
+       case PGM_ASCE_TYPE:
+       case PGM_PAGE_TRANSLATION:
+       case PGM_REGION_FIRST_TRANS:
+       case PGM_REGION_SECOND_TRANS:
+       case PGM_REGION_THIRD_TRANS:
+       case PGM_SEGMENT_TRANSLATION:
+               rc = put_guest_lc(vcpu, pgm_info->trans_exc_code,
+                                 (u64 *)__LC_TRANS_EXC_CODE);
+               rc |= put_guest_lc(vcpu, pgm_info->exc_access_id,
+                                  (u8 *)__LC_EXC_ACCESS_ID);
+               rc |= put_guest_lc(vcpu, pgm_info->op_access_id,
+                                  (u8 *)__LC_OP_ACCESS_ID);
+               break;
+       case PGM_MONITOR:
+               /* the monitor-class number is a 16-bit lowcore field */
+               rc = put_guest_lc(vcpu, pgm_info->mon_class_nr,
+                                 (u16 *)__LC_MON_CLASS_NR);
+               rc |= put_guest_lc(vcpu, pgm_info->mon_code,
+                                  (u64 *)__LC_MON_CODE);
+               break;
+       case PGM_DATA:
+               rc = put_guest_lc(vcpu, pgm_info->data_exc_code,
+                                 (u32 *)__LC_DATA_EXC_CODE);
+               break;
+       case PGM_PROTECTION:
+               rc = put_guest_lc(vcpu, pgm_info->trans_exc_code,
+                                 (u64 *)__LC_TRANS_EXC_CODE);
+               rc |= put_guest_lc(vcpu, pgm_info->exc_access_id,
+                                  (u8 *)__LC_EXC_ACCESS_ID);
+               break;
+       }
+
+       if (pgm_info->code & PGM_PER) {
+               rc |= put_guest_lc(vcpu, pgm_info->per_code,
+                                  (u8 *) __LC_PER_CODE);
+               rc |= put_guest_lc(vcpu, pgm_info->per_atmid,
+                                  (u8 *)__LC_PER_ATMID);
+               rc |= put_guest_lc(vcpu, pgm_info->per_address,
+                                  (u64 *) __LC_PER_ADDRESS);
+               rc |= put_guest_lc(vcpu, pgm_info->per_access_id,
+                                  (u8 *) __LC_PER_ACCESS_ID);
+       }
+
+       switch (vcpu->arch.sie_block->icptcode) {
+       case ICPT_INST:
+       case ICPT_INSTPROGI:
+       case ICPT_OPEREXC:
+       case ICPT_PARTEXEC:
+       case ICPT_IOINST:
+               /* last instruction only stored for these icptcodes */
+               rc |= put_guest_lc(vcpu, table[vcpu->arch.sie_block->ipa >> 14],
+                                  (u16 *) __LC_PGM_ILC);
+               break;
+       case ICPT_PROGI:
+               rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->pgmilc,
+                                  (u16 *) __LC_PGM_ILC);
+               break;
+       default:
+               rc |= put_guest_lc(vcpu, 0, (u16 *) __LC_PGM_ILC);
+       }
+
+       rc |= put_guest_lc(vcpu, pgm_info->code, (u16 *)__LC_PGM_INT_CODE);
+       rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW,
+                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       rc |= read_guest_lc(vcpu, __LC_PGM_NEW_PSW,
+                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+
+       return rc;
+}
+
 static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                                   struct kvm_s390_interrupt_info *inti)
 {
@@ -186,26 +314,46 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                vcpu->stat.deliver_emergency_signal++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->emerg.code, 0);
-               rc  = put_guest(vcpu, 0x1201, (u16 __user *)__LC_EXT_INT_CODE);
-               rc |= put_guest(vcpu, inti->emerg.code,
-                               (u16 __user *)__LC_EXT_CPU_ADDR);
-               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+               rc  = put_guest_lc(vcpu, 0x1201, (u16 *)__LC_EXT_INT_CODE);
+               rc |= put_guest_lc(vcpu, inti->emerg.code,
+                                  (u16 *)__LC_EXT_CPU_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
                break;
        case KVM_S390_INT_EXTERNAL_CALL:
                VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
                vcpu->stat.deliver_external_call++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->extcall.code, 0);
-               rc  = put_guest(vcpu, 0x1202, (u16 __user *)__LC_EXT_INT_CODE);
-               rc |= put_guest(vcpu, inti->extcall.code,
-                               (u16 __user *)__LC_EXT_CPU_ADDR);
-               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+               rc  = put_guest_lc(vcpu, 0x1202, (u16 *)__LC_EXT_INT_CODE);
+               rc |= put_guest_lc(vcpu, inti->extcall.code,
+                                  (u16 *)__LC_EXT_CPU_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw,
+                                    sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+                                   &vcpu->arch.sie_block->gpsw,
+                                   sizeof(psw_t));
+               break;
+       case KVM_S390_INT_CLOCK_COMP:
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->ext.ext_params, 0);
+               deliver_ckc_interrupt(vcpu);
+               break;
+       case KVM_S390_INT_CPU_TIMER:
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->ext.ext_params, 0);
+               rc  = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER,
+                                  (u16 *)__LC_EXT_INT_CODE);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw,
+                                    sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
+               rc |= put_guest_lc(vcpu, inti->ext.ext_params,
+                                  (u32 *)__LC_EXT_PARAMS);
                break;
        case KVM_S390_INT_SERVICE:
                VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
@@ -213,37 +361,39 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                vcpu->stat.deliver_service_signal++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->ext.ext_params, 0);
-               rc  = put_guest(vcpu, 0x2401, (u16 __user *)__LC_EXT_INT_CODE);
-               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+               rc  = put_guest_lc(vcpu, 0x2401, (u16 *)__LC_EXT_INT_CODE);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw,
+                                    sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
-               rc |= put_guest(vcpu, inti->ext.ext_params,
-                               (u32 __user *)__LC_EXT_PARAMS);
+               rc |= put_guest_lc(vcpu, inti->ext.ext_params,
+                                  (u32 *)__LC_EXT_PARAMS);
                break;
        case KVM_S390_INT_PFAULT_INIT:
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
                                                 inti->ext.ext_params2);
-               rc  = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE);
-               rc |= put_guest(vcpu, 0x0600, (u16 __user *) __LC_EXT_CPU_ADDR);
-               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+               rc  = put_guest_lc(vcpu, 0x2603, (u16 *) __LC_EXT_INT_CODE);
+               rc |= put_guest_lc(vcpu, 0x0600, (u16 *) __LC_EXT_CPU_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
-               rc |= put_guest(vcpu, inti->ext.ext_params2,
-                               (u64 __user *) __LC_EXT_PARAMS2);
+               rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
+                                  (u64 *) __LC_EXT_PARAMS2);
                break;
        case KVM_S390_INT_PFAULT_DONE:
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
                                                 inti->ext.ext_params2);
-               rc  = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE);
-               rc |= put_guest(vcpu, 0x0680, (u16 __user *) __LC_EXT_CPU_ADDR);
-               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+               rc  = put_guest_lc(vcpu, 0x2603, (u16 *)__LC_EXT_INT_CODE);
+               rc |= put_guest_lc(vcpu, 0x0680, (u16 *)__LC_EXT_CPU_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw,
+                                    sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
-               rc |= put_guest(vcpu, inti->ext.ext_params2,
-                               (u64 __user *) __LC_EXT_PARAMS2);
+               rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
+                                  (u64 *)__LC_EXT_PARAMS2);
                break;
        case KVM_S390_INT_VIRTIO:
                VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
@@ -252,16 +402,17 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->ext.ext_params,
                                                 inti->ext.ext_params2);
-               rc  = put_guest(vcpu, 0x2603, (u16 __user *)__LC_EXT_INT_CODE);
-               rc |= put_guest(vcpu, 0x0d00, (u16 __user *)__LC_EXT_CPU_ADDR);
-               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+               rc  = put_guest_lc(vcpu, 0x2603, (u16 *)__LC_EXT_INT_CODE);
+               rc |= put_guest_lc(vcpu, 0x0d00, (u16 *)__LC_EXT_CPU_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw,
+                                    sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
-               rc |= put_guest(vcpu, inti->ext.ext_params,
-                               (u32 __user *)__LC_EXT_PARAMS);
-               rc |= put_guest(vcpu, inti->ext.ext_params2,
-                               (u64 __user *)__LC_EXT_PARAMS2);
+               rc |= put_guest_lc(vcpu, inti->ext.ext_params,
+                                  (u32 *)__LC_EXT_PARAMS);
+               rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
+                                  (u64 *)__LC_EXT_PARAMS2);
                break;
        case KVM_S390_SIGP_STOP:
                VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
@@ -285,13 +436,12 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                vcpu->stat.deliver_restart_signal++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 0, 0);
-               rc  = copy_to_guest(vcpu,
-                                   offsetof(struct _lowcore, restart_old_psw),
-                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     offsetof(struct _lowcore, restart_psw),
-                                     sizeof(psw_t));
-               atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+               rc  = write_guest_lc(vcpu,
+                                    offsetof(struct _lowcore, restart_old_psw),
+                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, offsetof(struct _lowcore, restart_psw),
+                                   &vcpu->arch.sie_block->gpsw,
+                                   sizeof(psw_t));
                break;
        case KVM_S390_PROGRAM_INT:
                VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x",
@@ -300,13 +450,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                vcpu->stat.deliver_program_int++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->pgm.code, 0);
-               rc  = put_guest(vcpu, inti->pgm.code, (u16 __user *)__LC_PGM_INT_CODE);
-               rc |= put_guest(vcpu, table[vcpu->arch.sie_block->ipa >> 14],
-                               (u16 __user *)__LC_PGM_ILC);
-               rc |= copy_to_guest(vcpu, __LC_PGM_OLD_PSW,
-                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     __LC_PGM_NEW_PSW, sizeof(psw_t));
+               rc = __deliver_prog_irq(vcpu, &inti->pgm);
                break;
 
        case KVM_S390_MCHK:
@@ -317,11 +461,12 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                                                 inti->mchk.mcic);
                rc  = kvm_s390_vcpu_store_status(vcpu,
                                                 KVM_S390_STORE_STATUS_PREFIXED);
-               rc |= put_guest(vcpu, inti->mchk.mcic, (u64 __user *) __LC_MCCK_CODE);
-               rc |= copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
+               rc |= put_guest_lc(vcpu, inti->mchk.mcic, (u64 *)__LC_MCCK_CODE);
+               rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw,
+                                    sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     __LC_MCK_NEW_PSW, sizeof(psw_t));
                break;
 
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
@@ -334,18 +479,20 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                vcpu->stat.deliver_io_int++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 param0, param1);
-               rc  = put_guest(vcpu, inti->io.subchannel_id,
-                               (u16 __user *) __LC_SUBCHANNEL_ID);
-               rc |= put_guest(vcpu, inti->io.subchannel_nr,
-                               (u16 __user *) __LC_SUBCHANNEL_NR);
-               rc |= put_guest(vcpu, inti->io.io_int_parm,
-                               (u32 __user *) __LC_IO_INT_PARM);
-               rc |= put_guest(vcpu, inti->io.io_int_word,
-                               (u32 __user *) __LC_IO_INT_WORD);
-               rc |= copy_to_guest(vcpu, __LC_IO_OLD_PSW,
-                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                     __LC_IO_NEW_PSW, sizeof(psw_t));
+               rc  = put_guest_lc(vcpu, inti->io.subchannel_id,
+                                  (u16 *)__LC_SUBCHANNEL_ID);
+               rc |= put_guest_lc(vcpu, inti->io.subchannel_nr,
+                                  (u16 *)__LC_SUBCHANNEL_NR);
+               rc |= put_guest_lc(vcpu, inti->io.io_int_parm,
+                                  (u32 *)__LC_IO_INT_PARM);
+               rc |= put_guest_lc(vcpu, inti->io.io_int_word,
+                                  (u32 *)__LC_IO_INT_WORD);
+               rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw,
+                                    sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW,
+                                   &vcpu->arch.sie_block->gpsw,
+                                   sizeof(psw_t));
                break;
        }
        default:
@@ -358,25 +505,35 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
        }
 }
 
-static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
+static void deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
 {
        int rc;
 
-       if (psw_extint_disabled(vcpu))
-               return 0;
-       if (!(vcpu->arch.sie_block->gcr[0] & 0x800ul))
-               return 0;
-       rc  = put_guest(vcpu, 0x1004, (u16 __user *)__LC_EXT_INT_CODE);
-       rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                             __LC_EXT_NEW_PSW, sizeof(psw_t));
+       rc  = put_guest_lc(vcpu, 0x1004, (u16 __user *)__LC_EXT_INT_CODE);
+       rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+                           &vcpu->arch.sie_block->gpsw,
+                           sizeof(psw_t));
        if (rc) {
                printk("kvm: The guest lowcore is not mapped during interrupt "
                        "delivery, killing userspace\n");
                do_exit(SIGKILL);
        }
-       return 1;
+}
+
+/* Check whether SIGP interpretation facility has an external call pending:
+ * external interrupts not PSW-disabled, gcr[0] bit 0x2000 set, and both
+ * SIGP_CTRL_C (SCA) and CPUSTAT_ECALL_PEND (cpuflags) set. */
+int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu)
+{
+       atomic_t *ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl;
+       atomic_t *cpuflags = &vcpu->arch.sie_block->cpuflags;
+
+       return !psw_extint_disabled(vcpu) &&
+              (vcpu->arch.sie_block->gcr[0] & 0x2000ul) &&
+              (atomic_read(ctrl) & SIGP_CTRL_C) &&
+              (atomic_read(cpuflags) & CPUSTAT_ECALL_PEND);
 }
 
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
@@ -406,19 +563,23 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
                spin_unlock(&fi->lock);
        }
 
-       if ((!rc) && (vcpu->arch.sie_block->ckc <
-               get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) {
-               if ((!psw_extint_disabled(vcpu)) &&
-                       (vcpu->arch.sie_block->gcr[0] & 0x800ul))
-                       rc = 1;
-       }
+       if (!rc && kvm_cpu_has_pending_timer(vcpu))
+               rc = 1;
+
+       if (!rc && kvm_s390_si_ext_call_pending(vcpu))
+               rc = 1;
 
        return rc;
 }
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-       return 0;
+       if (!(vcpu->arch.sie_block->ckc <
+             get_tod_clock_fast() + vcpu->arch.sie_block->epoch))
+               return 0;
+       if (!ckc_interrupts_enabled(vcpu))
+               return 0;
+       return 1;
 }
 
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
@@ -441,8 +602,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
                return -EOPNOTSUPP; /* disabled wait */
        }
 
-       if (psw_extint_disabled(vcpu) ||
-           (!(vcpu->arch.sie_block->gcr[0] & 0x800ul))) {
+       if (!ckc_interrupts_enabled(vcpu)) {
                VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer");
                goto no_timer;
        }
@@ -465,7 +625,8 @@ no_timer:
        while (list_empty(&vcpu->arch.local_int.list) &&
                list_empty(&vcpu->arch.local_int.float_int->list) &&
                (!vcpu->arch.local_int.timer_due) &&
-               !signal_pending(current)) {
+               !signal_pending(current) &&
+               !kvm_s390_si_ext_call_pending(vcpu)) {
                set_current_state(TASK_INTERRUPTIBLE);
                spin_unlock_bh(&vcpu->arch.local_int.lock);
                spin_unlock(&vcpu->arch.local_int.float_int->lock);
@@ -522,6 +683,11 @@ void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu)
        }
        atomic_set(&li->active, 0);
        spin_unlock_bh(&li->lock);
+
+       /* clear pending external calls set by sigp interpretation facility */
+       atomic_clear_mask(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags);
+       atomic_clear_mask(SIGP_CTRL_C,
+                         &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl);
 }
 
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
@@ -554,9 +720,8 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
                } while (deliver);
        }
 
-       if ((vcpu->arch.sie_block->ckc <
-               get_tod_clock_fast() + vcpu->arch.sie_block->epoch))
-               __try_deliver_ckc_interrupt(vcpu);
+       if (kvm_cpu_has_pending_timer(vcpu))
+               deliver_ckc_interrupt(vcpu);
 
        if (atomic_read(&fi->active)) {
                do {
@@ -660,6 +825,31 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
        return 0;
 }
 
+/* Queue a program interrupt carrying @pgm_info; returns 0 or -ENOMEM. */
+int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
+                            struct kvm_s390_pgm_info *pgm_info)
+{
+       struct kvm_s390_interrupt_info *irq;
+       struct kvm_s390_local_interrupt *local = &vcpu->arch.local_int;
+
+       irq = kzalloc(sizeof(*irq), GFP_KERNEL);
+       if (irq == NULL)
+               return -ENOMEM;
+       irq->type = KVM_S390_PROGRAM_INT;
+       memcpy(&irq->pgm, pgm_info, sizeof(irq->pgm));
+       VCPU_EVENT(vcpu, 3, "inject: prog irq %d (from kernel)",
+                  pgm_info->code);
+       trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
+                                  pgm_info->code, 0, 1);
+
+       spin_lock_bh(&local->lock);
+       list_add(&irq->list, &local->list);
+       atomic_set(&local->active, 1);
+       BUG_ON(waitqueue_active(local->wq));
+       spin_unlock_bh(&local->lock);
+       return 0;
+}
+
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
                                                    u64 cr6, u64 schid)
 {
@@ -810,6 +1000,12 @@ int kvm_s390_inject_vm(struct kvm *kvm,
        return __inject_vm(kvm, inti);
 }
 
+void kvm_s390_reinject_io_int(struct kvm *kvm,
+                             struct kvm_s390_interrupt_info *inti)
+{
+       __inject_vm(kvm, inti); /* the result of __inject_vm() is dropped */
+}
+
 int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
                         struct kvm_s390_interrupt *s390int)
 {
@@ -839,6 +1035,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
                break;
        case KVM_S390_SIGP_STOP:
        case KVM_S390_RESTART:
+       case KVM_S390_INT_CLOCK_COMP:
+       case KVM_S390_INT_CPU_TIMER:
                VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
                inti->type = s390int->type;
                break;
@@ -900,7 +1098,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-static void clear_floating_interrupts(struct kvm *kvm)
+void kvm_s390_clear_float_irqs(struct kvm *kvm)
 {
        struct kvm_s390_float_interrupt *fi;
        struct kvm_s390_interrupt_info  *n, *inti = NULL;
@@ -1246,7 +1444,7 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
                break;
        case KVM_DEV_FLIC_CLEAR_IRQS:
                r = 0;
-               clear_floating_interrupts(dev->kvm);
+               kvm_s390_clear_float_irqs(dev->kvm);
                break;
        case KVM_DEV_FLIC_APF_ENABLE:
                dev->kvm->arch.gmap->pfault_enabled = 1;
index 825fe7bf95a62e2c6a6c99c1c7d37d7f1f531f66..2f3e14fe91a4882d0c5ed5599e1d6f0da2f750f0 100644 (file)
@@ -11,6 +11,7 @@
  *               Christian Borntraeger <borntraeger@de.ibm.com>
  *               Heiko Carstens <heiko.carstens@de.ibm.com>
  *               Christian Ehrhardt <ehrhardt@de.ibm.com>
+ *               Jason J. Herne <jjherne@us.ibm.com>
  */
 
 #include <linux/compiler.h>
@@ -51,6 +52,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
        { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
        { "instruction_lctl", VCPU_STAT(instruction_lctl) },
+       { "instruction_stctl", VCPU_STAT(instruction_stctl) },
+       { "instruction_stctg", VCPU_STAT(instruction_stctg) },
        { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
        { "deliver_external_call", VCPU_STAT(deliver_external_call) },
        { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
@@ -66,6 +69,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "instruction_stpx", VCPU_STAT(instruction_stpx) },
        { "instruction_stap", VCPU_STAT(instruction_stap) },
        { "instruction_storage_key", VCPU_STAT(instruction_storage_key) },
+       { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) },
        { "instruction_stsch", VCPU_STAT(instruction_stsch) },
        { "instruction_chsc", VCPU_STAT(instruction_chsc) },
        { "instruction_essa", VCPU_STAT(instruction_essa) },
@@ -90,7 +94,7 @@ unsigned long *vfacilities;
 static struct gmap_notifier gmap_notifier;
 
 /* test availability of vfacility */
-static inline int test_vfacility(unsigned long nr)
+int test_vfacility(unsigned long nr)
 {
        return __test_facility(nr, (void *) vfacilities);
 }
@@ -162,6 +166,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_IOEVENTFD:
        case KVM_CAP_DEVICE_CTRL:
        case KVM_CAP_ENABLE_CAP_VM:
+       case KVM_CAP_VM_ATTRIBUTES:
                r = 1;
                break;
        case KVM_CAP_NR_VCPUS:
@@ -180,6 +185,25 @@ int kvm_dev_ioctl_check_extension(long ext)
        return r;
 }
 
+/* Transfer the gmap dirty state of every page in @memslot to KVM's dirty log */
+static void kvm_s390_sync_dirty_log(struct kvm *kvm,
+                                       struct kvm_memory_slot *memslot)
+{
+       gfn_t cur_gfn, last_gfn;
+       unsigned long address;
+       struct gmap *gmap = kvm->arch.gmap;
+
+       down_read(&gmap->mm->mmap_sem);
+       /* The slot spans [base_gfn, base_gfn + npages): last_gfn is exclusive */
+       last_gfn = memslot->base_gfn + memslot->npages;
+       for (cur_gfn = memslot->base_gfn; cur_gfn < last_gfn; cur_gfn++) {
+               address = gfn_to_hva_memslot(memslot, cur_gfn);
+               if (gmap_test_and_clear_dirty(address, gmap))
+                       mark_page_dirty(kvm, cur_gfn);
+       }
+       up_read(&gmap->mm->mmap_sem);
+}
+
 /* Section: vm related */
 /*
  * Get (and clear) the dirty memory log for a memory slot.
@@ -187,7 +211,36 @@ int kvm_dev_ioctl_check_extension(long ext)
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                               struct kvm_dirty_log *log)
 {
-       return 0;
+       int r;
+       unsigned long n;
+       struct kvm_memory_slot *memslot;
+       int is_dirty = 0;
+
+       mutex_lock(&kvm->slots_lock);
+
+       r = -EINVAL;
+       if (log->slot >= KVM_USER_MEM_SLOTS)
+               goto out;
+
+       memslot = id_to_memslot(kvm->memslots, log->slot);
+       r = -ENOENT;
+       if (!memslot->dirty_bitmap)
+               goto out;
+
+       kvm_s390_sync_dirty_log(kvm, memslot);
+       r = kvm_get_dirty_log(kvm, log, &is_dirty);
+       if (r)
+               goto out;
+
+       /* Clear the dirty log */
+       if (is_dirty) {
+               n = kvm_dirty_bitmap_bytes(memslot);
+               memset(memslot->dirty_bitmap, 0, n);
+       }
+       r = 0;
+out:
+       mutex_unlock(&kvm->slots_lock);
+       return r;
 }
 
 static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
@@ -209,11 +262,86 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
        return r;
 }
 
+/* Serve the memory-control attributes KVM_S390_VM_MEM_{ENABLE,CLR}_CMMA. */
+static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+       unsigned int srcu_idx;
+       int rc;
+
+       if (attr->attr == KVM_S390_VM_MEM_ENABLE_CMMA) {
+               /* use_cmma is only set while no vcpu is online, else -EBUSY */
+               mutex_lock(&kvm->lock);
+               if (atomic_read(&kvm->online_vcpus) == 0) {
+                       kvm->arch.use_cmma = 1;
+                       rc = 0;
+               } else {
+                       rc = -EBUSY;
+               }
+               mutex_unlock(&kvm->lock);
+       } else if (attr->attr == KVM_S390_VM_MEM_CLR_CMMA) {
+               mutex_lock(&kvm->lock);
+               srcu_idx = srcu_read_lock(&kvm->srcu);
+               page_table_reset_pgste(kvm->arch.gmap->mm, 0, TASK_SIZE, false);
+               srcu_read_unlock(&kvm->srcu, srcu_idx);
+               mutex_unlock(&kvm->lock);
+               rc = 0;
+       } else {
+               rc = -ENXIO;
+       }
+       return rc;
+}
+
+/*
+ * Set a VM attribute on behalf of the KVM_SET_DEVICE_ATTR vm ioctl.
+ *
+ * KVM_S390_VM_MEM_CTRL is the only group handled; it is forwarded to
+ * kvm_s390_mem_control(). Every other group is rejected with -ENXIO.
+ */
+static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+       int rc = -ENXIO;
+
+       if (attr->group == KVM_S390_VM_MEM_CTRL)
+               rc = kvm_s390_mem_control(kvm, attr);
+
+       return rc;
+}
+
+static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+       return -ENXIO;  /* unconditional: no attribute is readable here */
+}
+
+/*
+ * Report whether a VM attribute is supported: 0 for the two CMMA attributes
+ * of the KVM_S390_VM_MEM_CTRL group, -ENXIO for everything else.
+ */
+static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+       int rc;
+
+       if (attr->group != KVM_S390_VM_MEM_CTRL)
+               return -ENXIO;
+
+       switch (attr->attr) {
+       case KVM_S390_VM_MEM_ENABLE_CMMA:
+       case KVM_S390_VM_MEM_CLR_CMMA:
+               rc = 0;
+               break;
+       default:
+               rc = -ENXIO;
+               break;
+       }
+
+       return rc;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
+       struct kvm_device_attr attr;
        int r;
 
        switch (ioctl) {
@@ -246,6 +374,27 @@ long kvm_arch_vm_ioctl(struct file *filp,
                }
                break;
        }
+       case KVM_SET_DEVICE_ATTR: {
+               r = -EFAULT;
+               if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+                       break;
+               r = kvm_s390_vm_set_attr(kvm, &attr);
+               break;
+       }
+       case KVM_GET_DEVICE_ATTR: {
+               r = -EFAULT;
+               if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+                       break;
+               r = kvm_s390_vm_get_attr(kvm, &attr);
+               break;
+       }
+       case KVM_HAS_DEVICE_ATTR: {
+               r = -EFAULT;
+               if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+                       break;
+               r = kvm_s390_vm_has_attr(kvm, &attr);
+               break;
+       }
        default:
                r = -ENOTTY;
        }
@@ -292,6 +441,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        spin_lock_init(&kvm->arch.float_int.lock);
        INIT_LIST_HEAD(&kvm->arch.float_int.list);
+       init_waitqueue_head(&kvm->arch.ipte_wq);
 
        debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
        VM_EVENT(kvm, 3, "%s", "vm created");
@@ -309,6 +459,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.css_support = 0;
        kvm->arch.use_irqchip = 0;
 
+       spin_lock_init(&kvm->arch.start_stop_lock);
+
        return 0;
 out_nogmap:
        debug_unregister(kvm->arch.dbf);
@@ -322,6 +474,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        VCPU_EVENT(vcpu, 3, "%s", "free cpu");
        trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
+       kvm_s390_clear_local_irqs(vcpu);
        kvm_clear_async_pf_completion_queue(vcpu);
        if (!kvm_is_ucontrol(vcpu->kvm)) {
                clear_bit(63 - vcpu->vcpu_id,
@@ -335,9 +488,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        if (kvm_is_ucontrol(vcpu->kvm))
                gmap_free(vcpu->arch.gmap);
 
-       if (vcpu->arch.sie_block->cbrlo)
-               __free_page(__pfn_to_page(
-                               vcpu->arch.sie_block->cbrlo >> PAGE_SHIFT));
+       if (kvm_s390_cmma_enabled(vcpu->kvm))
+               kvm_s390_vcpu_unsetup_cmma(vcpu);
        free_page((unsigned long)(vcpu->arch.sie_block));
 
        kvm_vcpu_uninit(vcpu);
@@ -372,6 +524,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        if (!kvm_is_ucontrol(kvm))
                gmap_free(kvm->arch.gmap);
        kvm_s390_destroy_adapters(kvm);
+       kvm_s390_clear_float_irqs(kvm);
 }
 
 /* Section: vcpu related */
@@ -442,7 +595,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
        vcpu->arch.sie_block->pp = 0;
        vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
        kvm_clear_async_pf_completion_queue(vcpu);
-       atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_vcpu_stop(vcpu);
        kvm_s390_clear_local_irqs(vcpu);
 }
 
@@ -451,9 +604,26 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
+{
+       free_page(vcpu->arch.sie_block->cbrlo);
+       vcpu->arch.sie_block->cbrlo = 0;
+}
+
+int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL);
+       if (!vcpu->arch.sie_block->cbrlo)
+               return -ENOMEM;
+
+       vcpu->arch.sie_block->ecb2 |= 0x80;
+       vcpu->arch.sie_block->ecb2 &= ~0x08;
+       return 0;
+}
+
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-       struct page *cbrl;
+       int rc = 0;
 
        atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
                                                    CPUSTAT_SM |
@@ -464,15 +634,17 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
                vcpu->arch.sie_block->ecb |= 0x10;
 
        vcpu->arch.sie_block->ecb2  = 8;
-       vcpu->arch.sie_block->eca   = 0xC1002001U;
+       vcpu->arch.sie_block->eca   = 0xD1002000U;
+       if (sclp_has_siif())
+               vcpu->arch.sie_block->eca |= 1;
        vcpu->arch.sie_block->fac   = (int) (long) vfacilities;
-       if (kvm_enabled_cmma()) {
-               cbrl = alloc_page(GFP_KERNEL | __GFP_ZERO);
-               if (cbrl) {
-                       vcpu->arch.sie_block->ecb2 |= 0x80;
-                       vcpu->arch.sie_block->ecb2 &= ~0x08;
-                       vcpu->arch.sie_block->cbrlo = page_to_phys(cbrl);
-               }
+       vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE |
+                                     ICTL_TPROT;
+
+       if (kvm_s390_cmma_enabled(vcpu->kvm)) {
+               rc = kvm_s390_vcpu_setup_cmma(vcpu);
+               if (rc)
+                       return rc;
        }
        hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
        tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
@@ -480,7 +652,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
        get_cpu_id(&vcpu->arch.cpu_id);
        vcpu->arch.cpu_id.version = 0xff;
-       return 0;
+       return rc;
 }
 
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
@@ -584,7 +756,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
                /* match against both prefix pages */
-               if (vcpu->arch.sie_block->prefix == (address & ~0x1000UL)) {
+               if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
                        VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
                        kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
                        exit_sie_sync(vcpu);
@@ -769,10 +941,40 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        return -EINVAL; /* not implemented yet */
 }
 
+#define VALID_GUESTDBG_FLAGS (KVM_GUESTDBG_SINGLESTEP | \
+                             KVM_GUESTDBG_USE_HW_BP | \
+                             KVM_GUESTDBG_ENABLE)
+
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                                        struct kvm_guest_debug *dbg)
 {
-       return -EINVAL; /* not implemented yet */
+       int rc = 0;
+
+       vcpu->guest_debug = 0;
+       kvm_s390_clear_bp_data(vcpu);
+
+       if (dbg->control & ~VALID_GUESTDBG_FLAGS)
+               return -EINVAL;
+
+       if (dbg->control & KVM_GUESTDBG_ENABLE) {
+               vcpu->guest_debug = dbg->control;
+               /* enforce guest PER */
+               atomic_set_mask(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
+
+               if (dbg->control & KVM_GUESTDBG_USE_HW_BP)
+                       rc = kvm_s390_import_bp_data(vcpu, dbg);
+       } else {
+               atomic_clear_mask(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
+               vcpu->arch.guestdbg.last_bp = 0;
+       }
+
+       if (rc) {
+               vcpu->guest_debug = 0;
+               kvm_s390_clear_bp_data(vcpu);
+               atomic_clear_mask(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
+       }
+
+       return rc;
 }
 
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
@@ -787,8 +989,27 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
        return -EINVAL; /* not implemented yet */
 }
 
+bool kvm_s390_cmma_enabled(struct kvm *kvm)
+{
+       if (!MACHINE_IS_LPAR)
+               return false;
+       /* only enable for z10 and later */
+       if (!MACHINE_HAS_EDAT1)
+               return false;
+       if (!kvm->arch.use_cmma)
+               return false;
+       return true;
+}
+
+static bool ibs_enabled(struct kvm_vcpu *vcpu)
+{
+       return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_IBS;
+}
+
 static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 {
+retry:
+       s390_vcpu_unblock(vcpu);
        /*
         * We use MMU_RELOAD just to re-arm the ipte notifier for the
         * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
@@ -796,27 +1017,61 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
         * already finished. We might race against a second unmapper that
         * wants to set the blocking bit. Lets just retry the request loop.
         */
-       while (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
+       if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
                int rc;
                rc = gmap_ipte_notify(vcpu->arch.gmap,
-                                     vcpu->arch.sie_block->prefix,
+                                     kvm_s390_get_prefix(vcpu),
                                      PAGE_SIZE * 2);
                if (rc)
                        return rc;
-               s390_vcpu_unblock(vcpu);
+               goto retry;
+       }
+
+       if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) {
+               if (!ibs_enabled(vcpu)) {
+                       trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1);
+                       atomic_set_mask(CPUSTAT_IBS,
+                                       &vcpu->arch.sie_block->cpuflags);
+               }
+               goto retry;
        }
+
+       if (kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu)) {
+               if (ibs_enabled(vcpu)) {
+                       trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 0);
+                       atomic_clear_mask(CPUSTAT_IBS,
+                                         &vcpu->arch.sie_block->cpuflags);
+               }
+               goto retry;
+       }
+
        return 0;
 }
 
-static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu)
+/**
+ * kvm_arch_fault_in_page - fault-in guest page if necessary
+ * @vcpu: The corresponding virtual cpu
+ * @gpa: Guest physical address
+ * @writable: Whether the page should be writable or not
+ *
+ * Make sure that a guest page has been faulted-in on the host.
+ *
+ * Return: Zero on success, negative error code otherwise.
+ */
+long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable)
 {
-       long rc;
-       hva_t fault = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap);
        struct mm_struct *mm = current->mm;
+       hva_t hva;
+       long rc;
+
+       hva = gmap_fault(gpa, vcpu->arch.gmap);
+       if (IS_ERR_VALUE(hva))
+               return (long)hva;
        down_read(&mm->mmap_sem);
-       rc = get_user_pages(current, mm, fault, 1, 1, 0, NULL, NULL);
+       rc = get_user_pages(current, mm, hva, 1, writable, 0, NULL, NULL);
        up_read(&mm->mmap_sem);
-       return rc;
+
+       return rc < 0 ? rc : 0;
 }
 
 static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
@@ -883,8 +1138,9 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
        if (!vcpu->arch.gmap->pfault_enabled)
                return 0;
 
-       hva = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap);
-       if (copy_from_guest(vcpu, &arch.pfault_token, vcpu->arch.pfault_token, 8))
+       hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr));
+       hva += current->thread.gmap_addr & ~PAGE_MASK;
+       if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8))
                return 0;
 
        rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
@@ -917,6 +1173,11 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
        if (rc)
                return rc;
 
+       if (guestdbg_enabled(vcpu)) {
+               kvm_s390_backup_guest_per_regs(vcpu);
+               kvm_s390_patch_guest_per_regs(vcpu);
+       }
+
        vcpu->arch.sie_block->icptcode = 0;
        cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
        VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags);
@@ -933,6 +1194,9 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
                   vcpu->arch.sie_block->icptcode);
        trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
 
+       if (guestdbg_enabled(vcpu))
+               kvm_s390_restore_guest_per_regs(vcpu);
+
        if (exit_reason >= 0) {
                rc = 0;
        } else if (kvm_is_ucontrol(vcpu->kvm)) {
@@ -945,9 +1209,12 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
        } else if (current->thread.gmap_pfault) {
                trace_kvm_s390_major_guest_pfault(vcpu);
                current->thread.gmap_pfault = 0;
-               if (kvm_arch_setup_async_pf(vcpu) ||
-                   (kvm_arch_fault_in_sync(vcpu) >= 0))
+               if (kvm_arch_setup_async_pf(vcpu)) {
                        rc = 0;
+               } else {
+                       gpa_t gpa = current->thread.gmap_addr;
+                       rc = kvm_arch_fault_in_page(vcpu, gpa, 1);
+               }
        }
 
        if (rc == -1) {
@@ -969,16 +1236,6 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
        return rc;
 }
 
-bool kvm_enabled_cmma(void)
-{
-       if (!MACHINE_IS_LPAR)
-               return false;
-       /* only enable for z10 and later */
-       if (!MACHINE_HAS_EDAT1)
-               return false;
-       return true;
-}
-
 static int __vcpu_run(struct kvm_vcpu *vcpu)
 {
        int rc, exit_reason;
@@ -1008,7 +1265,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
                rc = vcpu_post_run(vcpu, exit_reason);
-       } while (!signal_pending(current) && !rc);
+       } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc);
 
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        return rc;
@@ -1019,10 +1276,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int rc;
        sigset_t sigsaved;
 
+       if (guestdbg_exit_pending(vcpu)) {
+               kvm_s390_prepare_debug_exit(vcpu);
+               return 0;
+       }
+
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
-       atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_vcpu_start(vcpu);
 
        switch (kvm_run->exit_reason) {
        case KVM_EXIT_S390_SIEIC:
@@ -1031,6 +1293,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        case KVM_EXIT_S390_RESET:
        case KVM_EXIT_S390_UCONTROL:
        case KVM_EXIT_S390_TSCH:
+       case KVM_EXIT_DEBUG:
                break;
        default:
                BUG();
@@ -1056,6 +1319,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                rc = -EINTR;
        }
 
+       if (guestdbg_exit_pending(vcpu) && !rc)  {
+               kvm_s390_prepare_debug_exit(vcpu);
+               rc = 0;
+       }
+
        if (rc == -EOPNOTSUPP) {
                /* intercept cannot be handled in-kernel, prepare kvm-run */
                kvm_run->exit_reason         = KVM_EXIT_S390_SIEIC;
@@ -1073,7 +1341,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        kvm_run->psw_mask     = vcpu->arch.sie_block->gpsw.mask;
        kvm_run->psw_addr     = vcpu->arch.sie_block->gpsw.addr;
-       kvm_run->s.regs.prefix = vcpu->arch.sie_block->prefix;
+       kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu);
        memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
 
        if (vcpu->sigset_active)
@@ -1083,83 +1351,52 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return rc;
 }
 
-static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, void *from,
-                      unsigned long n, int prefix)
-{
-       if (prefix)
-               return copy_to_guest(vcpu, guestdest, from, n);
-       else
-               return copy_to_guest_absolute(vcpu, guestdest, from, n);
-}
-
 /*
  * store status at address
  * we use have two special cases:
  * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
  * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
  */
-int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr)
+int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
 {
        unsigned char archmode = 1;
-       int prefix;
+       unsigned int px;
        u64 clkcomp;
+       int rc;
 
-       if (addr == KVM_S390_STORE_STATUS_NOADDR) {
-               if (copy_to_guest_absolute(vcpu, 163ul, &archmode, 1))
+       if (gpa == KVM_S390_STORE_STATUS_NOADDR) {
+               if (write_guest_abs(vcpu, 163, &archmode, 1))
                        return -EFAULT;
-               addr = SAVE_AREA_BASE;
-               prefix = 0;
-       } else if (addr == KVM_S390_STORE_STATUS_PREFIXED) {
-               if (copy_to_guest(vcpu, 163ul, &archmode, 1))
+               gpa = SAVE_AREA_BASE;
+       } else if (gpa == KVM_S390_STORE_STATUS_PREFIXED) {
+               if (write_guest_real(vcpu, 163, &archmode, 1))
                        return -EFAULT;
-               addr = SAVE_AREA_BASE;
-               prefix = 1;
-       } else
-               prefix = 0;
-
-       if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs),
-                       vcpu->arch.guest_fpregs.fprs, 128, prefix))
-               return -EFAULT;
-
-       if (__guestcopy(vcpu, addr + offsetof(struct save_area, gp_regs),
-                       vcpu->run->s.regs.gprs, 128, prefix))
-               return -EFAULT;
-
-       if (__guestcopy(vcpu, addr + offsetof(struct save_area, psw),
-                       &vcpu->arch.sie_block->gpsw, 16, prefix))
-               return -EFAULT;
-
-       if (__guestcopy(vcpu, addr + offsetof(struct save_area, pref_reg),
-                       &vcpu->arch.sie_block->prefix, 4, prefix))
-               return -EFAULT;
-
-       if (__guestcopy(vcpu,
-                       addr + offsetof(struct save_area, fp_ctrl_reg),
-                       &vcpu->arch.guest_fpregs.fpc, 4, prefix))
-               return -EFAULT;
-
-       if (__guestcopy(vcpu, addr + offsetof(struct save_area, tod_reg),
-                       &vcpu->arch.sie_block->todpr, 4, prefix))
-               return -EFAULT;
-
-       if (__guestcopy(vcpu, addr + offsetof(struct save_area, timer),
-                       &vcpu->arch.sie_block->cputm, 8, prefix))
-               return -EFAULT;
-
+               gpa = kvm_s390_real_to_abs(vcpu, SAVE_AREA_BASE);
+       }
+       rc = write_guest_abs(vcpu, gpa + offsetof(struct save_area, fp_regs),
+                            vcpu->arch.guest_fpregs.fprs, 128);
+       rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, gp_regs),
+                             vcpu->run->s.regs.gprs, 128);
+       rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, psw),
+                             &vcpu->arch.sie_block->gpsw, 16);
+       px = kvm_s390_get_prefix(vcpu);
+       rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, pref_reg),
+                             &px, 4);
+       rc |= write_guest_abs(vcpu,
+                             gpa + offsetof(struct save_area, fp_ctrl_reg),
+                             &vcpu->arch.guest_fpregs.fpc, 4);
+       rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, tod_reg),
+                             &vcpu->arch.sie_block->todpr, 4);
+       rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, timer),
+                             &vcpu->arch.sie_block->cputm, 8);
        clkcomp = vcpu->arch.sie_block->ckc >> 8;
-       if (__guestcopy(vcpu, addr + offsetof(struct save_area, clk_cmp),
-                       &clkcomp, 8, prefix))
-               return -EFAULT;
-
-       if (__guestcopy(vcpu, addr + offsetof(struct save_area, acc_regs),
-                       &vcpu->run->s.regs.acrs, 64, prefix))
-               return -EFAULT;
-
-       if (__guestcopy(vcpu,
-                       addr + offsetof(struct save_area, ctrl_regs),
-                       &vcpu->arch.sie_block->gcr, 128, prefix))
-               return -EFAULT;
-       return 0;
+       rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, clk_cmp),
+                             &clkcomp, 8);
+       rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, acc_regs),
+                             &vcpu->run->s.regs.acrs, 64);
+       rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, ctrl_regs),
+                             &vcpu->arch.sie_block->gcr, 128);
+       return rc ? -EFAULT : 0;
 }
 
 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
@@ -1176,6 +1413,109 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
        return kvm_s390_store_status_unloaded(vcpu, addr);
 }
 
+static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
+{
+       return atomic_read(&(vcpu)->arch.sie_block->cpuflags) & CPUSTAT_STOPPED;
+}
+
+static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
+{
+       kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
+       kvm_make_request(KVM_REQ_DISABLE_IBS, vcpu);
+       exit_sie_sync(vcpu);
+}
+
+static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
+{
+       unsigned int i;
+       struct kvm_vcpu *vcpu;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               __disable_ibs_on_vcpu(vcpu);
+       }
+}
+
+static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
+{
+       kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
+       kvm_make_request(KVM_REQ_ENABLE_IBS, vcpu);
+       exit_sie_sync(vcpu);
+}
+
+void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
+{
+       int i, online_vcpus, started_vcpus = 0;
+
+       if (!is_vcpu_stopped(vcpu))
+               return;
+
+       trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1);
+       /* Only one cpu at a time may enter/leave the STOPPED state. */
+       spin_lock_bh(&vcpu->kvm->arch.start_stop_lock);
+       online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
+
+       for (i = 0; i < online_vcpus; i++) {
+               if (!is_vcpu_stopped(vcpu->kvm->vcpus[i]))
+                       started_vcpus++;
+       }
+
+       if (started_vcpus == 0) {
+               /* we're the only active VCPU -> speed it up */
+               __enable_ibs_on_vcpu(vcpu);
+       } else if (started_vcpus == 1) {
+               /*
+                * As we are starting a second VCPU, we have to disable
+                * the IBS facility on all VCPUs to remove potentially
+                * outstanding ENABLE requests.
+                */
+               __disable_ibs_on_all_vcpus(vcpu->kvm);
+       }
+
+       atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+       /*
+        * Another VCPU might have used IBS while we were offline.
+        * Let's play safe and flush the VCPU at startup.
+        */
+       vcpu->arch.sie_block->ihcpu  = 0xffff;
+       spin_unlock_bh(&vcpu->kvm->arch.start_stop_lock);
+       return;
+}
+
+void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
+{
+       int i, online_vcpus, started_vcpus = 0;
+       struct kvm_vcpu *started_vcpu = NULL;
+
+       if (is_vcpu_stopped(vcpu))
+               return;
+
+       trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0);
+       /* Only one cpu at a time may enter/leave the STOPPED state. */
+       spin_lock_bh(&vcpu->kvm->arch.start_stop_lock);
+       online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
+
+       atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+       __disable_ibs_on_vcpu(vcpu);
+
+       for (i = 0; i < online_vcpus; i++) {
+               if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) {
+                       started_vcpus++;
+                       started_vcpu = vcpu->kvm->vcpus[i];
+               }
+       }
+
+       if (started_vcpus == 1) {
+               /*
+                * As we only have one VCPU left, we want to enable the
+                * IBS facility for that VCPU to speed it up.
+                */
+               __enable_ibs_on_vcpu(started_vcpu);
+       }
+
+       spin_unlock_bh(&vcpu->kvm->arch.start_stop_lock);
+       return;
+}
+
 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                                     struct kvm_enable_cap *cap)
 {
index 3c1e2274d9eae858fce363cd5f89ddb699e1fa05..a8655ed31616746adc2fb2e7e454b22451e2f60e 100644 (file)
@@ -28,7 +28,6 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
 
 /* Transactional Memory Execution related macros */
 #define IS_TE_ENABLED(vcpu)    ((vcpu->arch.sie_block->ecb & 0x10))
-#define TDB_ADDR               0x1800UL
 #define TDB_FORMAT1            1
 #define IS_ITDB_VALID(vcpu)    ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1))
 
@@ -62,9 +61,15 @@ static inline int kvm_is_ucontrol(struct kvm *kvm)
 #endif
 }
 
+#define GUEST_PREFIX_SHIFT 13
+static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.sie_block->prefix << GUEST_PREFIX_SHIFT;
+}
+
 static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
 {
-       vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u;
+       vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT;
        vcpu->arch.sie_block->ihcpu  = 0xffff;
        kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
 }
@@ -130,6 +135,7 @@ void kvm_s390_tasklet(unsigned long parm);
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
 void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu);
 void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu);
+void kvm_s390_clear_float_irqs(struct kvm *kvm);
 int __must_check kvm_s390_inject_vm(struct kvm *kvm,
                                    struct kvm_s390_interrupt *s390int);
 int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
@@ -137,35 +143,94 @@ int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
                                                    u64 cr6, u64 schid);
+void kvm_s390_reinject_io_int(struct kvm *kvm,
+                             struct kvm_s390_interrupt_info *inti);
 int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked);
 
 /* implemented in priv.c */
+int is_valid_psw(psw_t *psw);
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_b9(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
 
 /* implemented in sigp.c */
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
 
 /* implemented in kvm-s390.c */
+long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
+void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
+void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
 void s390_vcpu_block(struct kvm_vcpu *vcpu);
 void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
 void exit_sie(struct kvm_vcpu *vcpu);
 void exit_sie_sync(struct kvm_vcpu *vcpu);
-/* are we going to support cmma? */
-bool kvm_enabled_cmma(void);
+int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
+void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
+/* is cmma enabled */
+bool kvm_s390_cmma_enabled(struct kvm *kvm);
+int test_vfacility(unsigned long nr);
+
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
+/* implemented in interrupt.c */
+int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
+                            struct kvm_s390_pgm_info *pgm_info);
+
+/**
+ * kvm_s390_inject_prog_cond - conditionally inject a program check
+ * @vcpu: virtual cpu
+ * @rc: original return/error code
+ *
+ * This function is supposed to be used after regular guest access functions
+ * failed, to conditionally inject a program check to a vcpu. The typical
+ * pattern would look like
+ *
+ * rc = write_guest(vcpu, addr, data, len);
+ * if (rc)
+ *     return kvm_s390_inject_prog_cond(vcpu, rc);
+ *
+ * A negative return code from guest access functions implies an internal error
+ * like e.g. out of memory. In these cases no program check should be injected
+ * to the guest.
+ * A positive value implies that an exception happened while accessing a guest's
+ * memory. In this case all data belonging to the corresponding program check
+ * has been stored in vcpu->arch.pgm and can be injected with
+ * kvm_s390_inject_prog_irq().
+ *
+ * Returns: - the original @rc value if @rc was negative (internal error)
+ *         - zero if @rc was already zero
+ *         - zero or error code from injecting if @rc was positive
+ *           (program check injected to @vcpu)
+ */
+static inline int kvm_s390_inject_prog_cond(struct kvm_vcpu *vcpu, int rc)
+{
+       if (rc <= 0)
+               return rc;
+       return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+}
 
 /* implemented in interrupt.c */
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int psw_extint_disabled(struct kvm_vcpu *vcpu);
 void kvm_s390_destroy_adapters(struct kvm *kvm);
+int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu);
+
+/* implemented in guestdbg.c */
+void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu);
+void kvm_s390_restore_guest_per_regs(struct kvm_vcpu *vcpu);
+void kvm_s390_patch_guest_per_regs(struct kvm_vcpu *vcpu);
+int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
+                           struct kvm_guest_debug *dbg);
+void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
+void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
+void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
 
 #endif
index 476e9e218f43ee5cfa2842951c845a0499c4834e..f89c1cd677519b205ff6f130f43453a5d5dc236a 100644 (file)
@@ -35,8 +35,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
 {
        struct kvm_vcpu *cpup;
        s64 hostclk, val;
+       int i, rc;
        u64 op2;
-       int i;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -44,8 +44,9 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
        op2 = kvm_s390_get_base_disp_s(vcpu);
        if (op2 & 7)    /* Operand must be on a doubleword boundary */
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-       if (get_guest(vcpu, val, (u64 __user *) op2))
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = read_guest(vcpu, op2, &val, sizeof(val));
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
 
        if (store_tod_clock(&hostclk)) {
                kvm_s390_set_psw_cc(vcpu, 3);
@@ -65,8 +66,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
 static int handle_set_prefix(struct kvm_vcpu *vcpu)
 {
        u64 operand2;
-       u32 address = 0;
-       u8 tmp;
+       u32 address;
+       int rc;
 
        vcpu->stat.instruction_spx++;
 
@@ -80,14 +81,18 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
        /* get the value */
-       if (get_guest(vcpu, address, (u32 __user *) operand2))
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = read_guest(vcpu, operand2, &address, sizeof(address));
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
 
-       address = address & 0x7fffe000u;
+       address &= 0x7fffe000u;
 
-       /* make sure that the new value is valid memory */
-       if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
-          (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)))
+       /*
+        * Make sure the new value is valid memory. We only need to check the
+        * first page, since address is 8k aligned and memory pieces are always
+        * at least 1MB aligned and have at least a size of 1MB.
+        */
+       if (kvm_is_error_gpa(vcpu->kvm, address))
                return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
        kvm_s390_set_prefix(vcpu, address);
@@ -101,6 +106,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
 {
        u64 operand2;
        u32 address;
+       int rc;
 
        vcpu->stat.instruction_stpx++;
 
@@ -113,12 +119,12 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
        if (operand2 & 3)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       address = vcpu->arch.sie_block->prefix;
-       address = address & 0x7fffe000u;
+       address = kvm_s390_get_prefix(vcpu);
 
        /* get the value */
-       if (put_guest(vcpu, address, (u32 __user *)operand2))
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = write_guest(vcpu, operand2, &address, sizeof(address));
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
 
        VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
        trace_kvm_s390_handle_prefix(vcpu, 0, address);
@@ -127,28 +133,44 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
 
 static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 {
-       u64 useraddr;
+       u16 vcpu_id = vcpu->vcpu_id;
+       u64 ga;
+       int rc;
 
        vcpu->stat.instruction_stap++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       useraddr = kvm_s390_get_base_disp_s(vcpu);
+       ga = kvm_s390_get_base_disp_s(vcpu);
 
-       if (useraddr & 1)
+       if (ga & 1)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       if (put_guest(vcpu, vcpu->vcpu_id, (u16 __user *)useraddr))
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = write_guest(vcpu, ga, &vcpu_id, sizeof(vcpu_id));
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
 
-       VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
-       trace_kvm_s390_handle_stap(vcpu, useraddr);
+       VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", ga);
+       trace_kvm_s390_handle_stap(vcpu, ga);
        return 0;
 }
 
+static void __skey_check_enable(struct kvm_vcpu *vcpu)
+{
+       if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
+               return;
+
+       s390_enable_skey();
+       trace_kvm_s390_skey_related_inst(vcpu);
+       vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+}
+
+
 static int handle_skey(struct kvm_vcpu *vcpu)
 {
+       __skey_check_enable(vcpu);
+
        vcpu->stat.instruction_storage_key++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -160,9 +182,21 @@ static int handle_skey(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int handle_ipte_interlock(struct kvm_vcpu *vcpu)
+{
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+
+       vcpu->stat.instruction_ipte_interlock++;
+       if (psw_bits(*psw).p)
+               return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+       wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu));
+       psw->addr = __rewind_psw(*psw, 4);
+       VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation");
+       return 0;
+}
+
 static int handle_test_block(struct kvm_vcpu *vcpu)
 {
-       unsigned long hva;
        gpa_t addr;
        int reg2;
 
@@ -171,16 +205,18 @@ static int handle_test_block(struct kvm_vcpu *vcpu)
 
        kvm_s390_get_regs_rre(vcpu, NULL, &reg2);
        addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+       addr = kvm_s390_logical_to_effective(vcpu, addr);
+       if (kvm_s390_check_low_addr_protection(vcpu, addr))
+               return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
        addr = kvm_s390_real_to_abs(vcpu, addr);
 
-       hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
-       if (kvm_is_error_hva(hva))
+       if (kvm_is_error_gpa(vcpu->kvm, addr))
                return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
        /*
         * We don't expect errors on modern systems, and do not care
         * about storage keys (yet), so let's just clear the page.
         */
-       if (clear_user((void __user *)hva, PAGE_SIZE) != 0)
+       if (kvm_clear_guest(vcpu->kvm, addr, PAGE_SIZE))
                return -EFAULT;
        kvm_s390_set_psw_cc(vcpu, 0);
        vcpu->run->s.regs.gprs[0] = 0;
@@ -190,9 +226,12 @@ static int handle_test_block(struct kvm_vcpu *vcpu)
 static int handle_tpi(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_interrupt_info *inti;
+       unsigned long len;
+       u32 tpi_data[3];
+       int cc, rc;
        u64 addr;
-       int cc;
 
+       rc = 0;
        addr = kvm_s390_get_base_disp_s(vcpu);
        if (addr & 3)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -201,30 +240,41 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
        if (!inti)
                goto no_interrupt;
        cc = 1;
+       tpi_data[0] = inti->io.subchannel_id << 16 | inti->io.subchannel_nr;
+       tpi_data[1] = inti->io.io_int_parm;
+       tpi_data[2] = inti->io.io_int_word;
        if (addr) {
                /*
                 * Store the two-word I/O interruption code into the
                 * provided area.
                 */
-               if (put_guest(vcpu, inti->io.subchannel_id, (u16 __user *)addr)
-                   || put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *)(addr + 2))
-                   || put_guest(vcpu, inti->io.io_int_parm, (u32 __user *)(addr + 4)))
-                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+               len = sizeof(tpi_data) - 4;
+               rc = write_guest(vcpu, addr, &tpi_data, len);
+               if (rc)
+                       return kvm_s390_inject_prog_cond(vcpu, rc);
        } else {
                /*
                 * Store the three-word I/O interruption code into
                 * the appropriate lowcore area.
                 */
-               put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) __LC_SUBCHANNEL_ID);
-               put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) __LC_SUBCHANNEL_NR);
-               put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) __LC_IO_INT_PARM);
-               put_guest(vcpu, inti->io.io_int_word, (u32 __user *) __LC_IO_INT_WORD);
+               len = sizeof(tpi_data);
+               if (write_guest_lc(vcpu, __LC_SUBCHANNEL_ID, &tpi_data, len))
+                       rc = -EFAULT;
        }
-       kfree(inti);
+       /*
+        * If we encounter a problem storing the interruption code, the
+        * instruction is suppressed from the guest's view: reinject the
+        * interrupt.
+        */
+       if (!rc)
+               kfree(inti);
+       else
+               kvm_s390_reinject_io_int(vcpu->kvm, inti);
 no_interrupt:
        /* Set condition code and we're done. */
-       kvm_s390_set_psw_cc(vcpu, cc);
-       return 0;
+       if (!rc)
+               kvm_s390_set_psw_cc(vcpu, cc);
+       return rc ? -EFAULT : 0;
 }
 
 static int handle_tsch(struct kvm_vcpu *vcpu)
@@ -292,10 +342,10 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
-                          vfacilities, 4);
+       rc = write_guest_lc(vcpu, offsetof(struct _lowcore, stfl_fac_list),
+                           vfacilities, 4);
        if (rc)
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+               return rc;
        VCPU_EVENT(vcpu, 5, "store facility list value %x",
                   *(unsigned int *) vfacilities);
        trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities);
@@ -314,7 +364,8 @@ static void handle_new_psw(struct kvm_vcpu *vcpu)
 #define PSW_ADDR_24 0x0000000000ffffffUL
 #define PSW_ADDR_31 0x000000007fffffffUL
 
-static int is_valid_psw(psw_t *psw) {
+int is_valid_psw(psw_t *psw)
+{
        if (psw->mask & PSW_MASK_UNASSIGNED)
                return 0;
        if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) {
@@ -325,6 +376,8 @@ static int is_valid_psw(psw_t *psw) {
                return 0;
        if ((psw->mask & PSW_MASK_ADDR_MODE) ==  PSW_MASK_EA)
                return 0;
+       if (psw->addr & 1)
+               return 0;
        return 1;
 }
 
@@ -333,6 +386,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
        psw_t *gpsw = &vcpu->arch.sie_block->gpsw;
        psw_compat_t new_psw;
        u64 addr;
+       int rc;
 
        if (gpsw->mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -340,8 +394,10 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
        addr = kvm_s390_get_base_disp_s(vcpu);
        if (addr & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-       if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw)))
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+       rc = read_guest(vcpu, addr, &new_psw, sizeof(new_psw));
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
        if (!(new_psw.mask & PSW32_MASK_BASE))
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
        gpsw->mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32;
@@ -357,6 +413,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
 {
        psw_t new_psw;
        u64 addr;
+       int rc;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -364,8 +421,9 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
        addr = kvm_s390_get_base_disp_s(vcpu);
        if (addr & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-       if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw)))
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = read_guest(vcpu, addr, &new_psw, sizeof(new_psw));
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
        vcpu->arch.sie_block->gpsw = new_psw;
        if (!is_valid_psw(&vcpu->arch.sie_block->gpsw))
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -375,7 +433,9 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
 
 static int handle_stidp(struct kvm_vcpu *vcpu)
 {
+       u64 stidp_data = vcpu->arch.stidp_data;
        u64 operand2;
+       int rc;
 
        vcpu->stat.instruction_stidp++;
 
@@ -387,8 +447,9 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
        if (operand2 & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       if (put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2))
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = write_guest(vcpu, operand2, &stidp_data, sizeof(stidp_data));
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
 
        VCPU_EVENT(vcpu, 5, "%s", "store cpu id");
        return 0;
@@ -474,9 +535,10 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
                break;
        }
 
-       if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) {
-               rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               goto out_exception;
+       rc = write_guest(vcpu, operand2, (void *)mem, PAGE_SIZE);
+       if (rc) {
+               rc = kvm_s390_inject_prog_cond(vcpu, rc);
+               goto out;
        }
        trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
        free_page(mem);
@@ -485,7 +547,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
        return 0;
 out_no_data:
        kvm_s390_set_psw_cc(vcpu, 3);
-out_exception:
+out:
        free_page(mem);
        return rc;
 }
@@ -496,6 +558,7 @@ static const intercept_handler_t b2_handlers[256] = {
        [0x10] = handle_set_prefix,
        [0x11] = handle_store_prefix,
        [0x12] = handle_store_cpu_address,
+       [0x21] = handle_ipte_interlock,
        [0x29] = handle_skey,
        [0x2a] = handle_skey,
        [0x2b] = handle_skey,
@@ -513,6 +576,7 @@ static const intercept_handler_t b2_handlers[256] = {
        [0x3a] = handle_io_inst,
        [0x3b] = handle_io_inst,
        [0x3c] = handle_io_inst,
+       [0x50] = handle_ipte_interlock,
        [0x5f] = handle_io_inst,
        [0x74] = handle_io_inst,
        [0x76] = handle_io_inst,
@@ -591,6 +655,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
        start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+       if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+               if (kvm_s390_check_low_addr_protection(vcpu, start))
+                       return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+       }
+
        switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
        case 0x00000000:
                end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
@@ -606,10 +675,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
        }
        while (start < end) {
-               unsigned long useraddr;
-
-               useraddr = gmap_translate(start, vcpu->arch.gmap);
-               if (IS_ERR((void *)useraddr))
+               unsigned long useraddr, abs_addr;
+
+               /* Translate guest address to host address */
+               if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0)
+                       abs_addr = kvm_s390_real_to_abs(vcpu, start);
+               else
+                       abs_addr = start;
+               useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr));
+               if (kvm_is_error_hva(useraddr))
                        return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
                if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
@@ -618,6 +692,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
                }
 
                if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
+                       __skey_check_enable(vcpu);
                        if (set_guest_storage_key(current->mm, useraddr,
                                        vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
                                        vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
@@ -642,7 +717,7 @@ static int handle_essa(struct kvm_vcpu *vcpu)
        VCPU_EVENT(vcpu, 5, "cmma release %d pages", entries);
        gmap = vcpu->arch.gmap;
        vcpu->stat.instruction_essa++;
-       if (!kvm_enabled_cmma() || !vcpu->arch.sie_block->cbrlo)
+       if (!kvm_s390_cmma_enabled(vcpu->kvm))
                return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -672,7 +747,10 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 }
 
 static const intercept_handler_t b9_handlers[256] = {
+       [0x8a] = handle_ipte_interlock,
        [0x8d] = handle_epsw,
+       [0x8e] = handle_ipte_interlock,
+       [0x8f] = handle_ipte_interlock,
        [0xab] = handle_essa,
        [0xaf] = handle_pfmf,
 };
@@ -693,32 +771,67 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
 {
        int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
        int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
-       u64 useraddr;
        u32 val = 0;
        int reg, rc;
+       u64 ga;
 
        vcpu->stat.instruction_lctl++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       useraddr = kvm_s390_get_base_disp_rs(vcpu);
+       ga = kvm_s390_get_base_disp_rs(vcpu);
 
-       if (useraddr & 3)
+       if (ga & 3)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
-                  useraddr);
-       trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
+       VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3, ga);
+       trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, ga);
 
        reg = reg1;
        do {
-               rc = get_guest(vcpu, val, (u32 __user *) useraddr);
+               rc = read_guest(vcpu, ga, &val, sizeof(val));
                if (rc)
-                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                       return kvm_s390_inject_prog_cond(vcpu, rc);
                vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
                vcpu->arch.sie_block->gcr[reg] |= val;
-               useraddr += 4;
+               ga += 4;
+               if (reg == reg3)
+                       break;
+               reg = (reg + 1) % 16;
+       } while (1);
+
+       return 0;
+}
+
+int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu)
+{
+       int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
+       int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
+       u64 ga;
+       u32 val;
+       int reg, rc;
+
+       vcpu->stat.instruction_stctl++;
+
+       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+               return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+       ga = kvm_s390_get_base_disp_rs(vcpu);
+
+       if (ga & 3)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       VCPU_EVENT(vcpu, 5, "stctl r1:%x, r3:%x, addr:%llx", reg1, reg3, ga);
+       trace_kvm_s390_handle_stctl(vcpu, 0, reg1, reg3, ga);
+
+       reg = reg1;
+       do {
+               val = vcpu->arch.sie_block->gcr[reg] &  0x00000000fffffffful;
+               rc = write_guest(vcpu, ga, &val, sizeof(val));
+               if (rc)
+                       return kvm_s390_inject_prog_cond(vcpu, rc);
+               ga += 4;
                if (reg == reg3)
                        break;
                reg = (reg + 1) % 16;
@@ -731,7 +844,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
 {
        int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
        int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
-       u64 useraddr;
+       u64 ga, val;
        int reg, rc;
 
        vcpu->stat.instruction_lctlg++;
@@ -739,23 +852,58 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       useraddr = kvm_s390_get_base_disp_rsy(vcpu);
+       ga = kvm_s390_get_base_disp_rsy(vcpu);
 
-       if (useraddr & 7)
+       if (ga & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
        reg = reg1;
 
-       VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
-                  useraddr);
-       trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
+       VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3, ga);
+       trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, ga);
 
        do {
-               rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg],
-                              (u64 __user *) useraddr);
+               rc = read_guest(vcpu, ga, &val, sizeof(val));
                if (rc)
-                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               useraddr += 8;
+                       return kvm_s390_inject_prog_cond(vcpu, rc);
+               vcpu->arch.sie_block->gcr[reg] = val;
+               ga += 8;
+               if (reg == reg3)
+                       break;
+               reg = (reg + 1) % 16;
+       } while (1);
+
+       return 0;
+}
+
+static int handle_stctg(struct kvm_vcpu *vcpu)
+{
+       int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
+       int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
+       u64 ga, val;
+       int reg, rc;
+
+       vcpu->stat.instruction_stctg++;
+
+       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+               return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+       ga = kvm_s390_get_base_disp_rsy(vcpu);
+
+       if (ga & 7)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       reg = reg1;
+
+       VCPU_EVENT(vcpu, 5, "stctg r1:%x, r3:%x, addr:%llx", reg1, reg3, ga);
+       trace_kvm_s390_handle_stctl(vcpu, 1, reg1, reg3, ga);
+
+       do {
+               val = vcpu->arch.sie_block->gcr[reg];
+               rc = write_guest(vcpu, ga, &val, sizeof(val));
+               if (rc)
+                       return kvm_s390_inject_prog_cond(vcpu, rc);
+               ga += 8;
                if (reg == reg3)
                        break;
                reg = (reg + 1) % 16;
@@ -766,6 +914,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
 
 static const intercept_handler_t eb_handlers[256] = {
        [0x2f] = handle_lctlg,
+       [0x25] = handle_stctg,
 };
 
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
@@ -781,8 +930,9 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
 static int handle_tprot(struct kvm_vcpu *vcpu)
 {
        u64 address1, address2;
-       struct vm_area_struct *vma;
-       unsigned long user_address;
+       unsigned long hva, gpa;
+       int ret = 0, cc = 0;
+       bool writable;
 
        vcpu->stat.instruction_tprot++;
 
@@ -793,32 +943,41 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
 
        /* we only handle the Linux memory detection case:
         * access key == 0
-        * guest DAT == off
         * everything else goes to userspace. */
        if (address2 & 0xf0)
                return -EOPNOTSUPP;
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
-               return -EOPNOTSUPP;
-
-       down_read(&current->mm->mmap_sem);
-       user_address = __gmap_translate(address1, vcpu->arch.gmap);
-       if (IS_ERR_VALUE(user_address))
-               goto out_inject;
-       vma = find_vma(current->mm, user_address);
-       if (!vma)
-               goto out_inject;
-       vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
-       if (!(vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_READ))
-               vcpu->arch.sie_block->gpsw.mask |= (1ul << 44);
-       if (!(vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_READ))
-               vcpu->arch.sie_block->gpsw.mask |= (2ul << 44);
-
-       up_read(&current->mm->mmap_sem);
-       return 0;
+               ipte_lock(vcpu);
+       ret = guest_translate_address(vcpu, address1, &gpa, 1);
+       if (ret == PGM_PROTECTION) {
+               /* Write protected? Try again with read-only... */
+               cc = 1;
+               ret = guest_translate_address(vcpu, address1, &gpa, 0);
+       }
+       if (ret) {
+               if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) {
+                       ret = kvm_s390_inject_program_int(vcpu, ret);
+               } else if (ret > 0) {
+                       /* Translation not available */
+                       kvm_s390_set_psw_cc(vcpu, 3);
+                       ret = 0;
+               }
+               goto out_unlock;
+       }
 
-out_inject:
-       up_read(&current->mm->mmap_sem);
-       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       hva = gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable);
+       if (kvm_is_error_hva(hva)) {
+               ret = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       } else {
+               if (!writable)
+                       cc = 1;         /* Write not permitted ==> read-only */
+               kvm_s390_set_psw_cc(vcpu, cc);
+               /* Note: CC2 only occurs for storage keys (not supported yet) */
+       }
+out_unlock:
+       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
+               ipte_unlock(vcpu);
+       return ret;
 }
 
 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu)
index 26caeb530a7829f6688a31e8ecaa84203b2496bb..43079a48cc98d7858960e71f4d2f0b0b28ca8708 100644 (file)
@@ -54,33 +54,23 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 
 static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 {
-       struct kvm_s390_local_interrupt *li;
-       struct kvm_s390_interrupt_info *inti;
+       struct kvm_s390_interrupt s390int = {
+               .type = KVM_S390_INT_EMERGENCY,
+               .parm = vcpu->vcpu_id,
+       };
        struct kvm_vcpu *dst_vcpu = NULL;
+       int rc = 0;
 
        if (cpu_addr < KVM_MAX_VCPUS)
                dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
        if (!dst_vcpu)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       inti = kzalloc(sizeof(*inti), GFP_KERNEL);
-       if (!inti)
-               return -ENOMEM;
-
-       inti->type = KVM_S390_INT_EMERGENCY;
-       inti->emerg.code = vcpu->vcpu_id;
-
-       li = &dst_vcpu->arch.local_int;
-       spin_lock_bh(&li->lock);
-       list_add_tail(&inti->list, &li->list);
-       atomic_set(&li->active, 1);
-       atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-       if (waitqueue_active(li->wq))
-               wake_up_interruptible(li->wq);
-       spin_unlock_bh(&li->lock);
-       VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
+       rc = kvm_s390_inject_vcpu(dst_vcpu, &s390int);
+       if (!rc)
+               VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
 
-       return SIGP_CC_ORDER_CODE_ACCEPTED;
+       return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED;
 }
 
 static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr,
@@ -116,33 +106,23 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr,
 
 static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
 {
-       struct kvm_s390_local_interrupt *li;
-       struct kvm_s390_interrupt_info *inti;
+       struct kvm_s390_interrupt s390int = {
+               .type = KVM_S390_INT_EXTERNAL_CALL,
+               .parm = vcpu->vcpu_id,
+       };
        struct kvm_vcpu *dst_vcpu = NULL;
+       int rc;
 
        if (cpu_addr < KVM_MAX_VCPUS)
                dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
        if (!dst_vcpu)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       inti = kzalloc(sizeof(*inti), GFP_KERNEL);
-       if (!inti)
-               return -ENOMEM;
+       rc = kvm_s390_inject_vcpu(dst_vcpu, &s390int);
+       if (!rc)
+               VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
 
-       inti->type = KVM_S390_INT_EXTERNAL_CALL;
-       inti->extcall.code = vcpu->vcpu_id;
-
-       li = &dst_vcpu->arch.local_int;
-       spin_lock_bh(&li->lock);
-       list_add_tail(&inti->list, &li->list);
-       atomic_set(&li->active, 1);
-       atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-       if (waitqueue_active(li->wq))
-               wake_up_interruptible(li->wq);
-       spin_unlock_bh(&li->lock);
-       VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
-
-       return SIGP_CC_ORDER_CODE_ACCEPTED;
+       return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED;
 }
 
 static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
@@ -235,7 +215,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
        struct kvm_vcpu *dst_vcpu = NULL;
        struct kvm_s390_interrupt_info *inti;
        int rc;
-       u8 tmp;
 
        if (cpu_addr < KVM_MAX_VCPUS)
                dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
@@ -243,10 +222,13 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
                return SIGP_CC_NOT_OPERATIONAL;
        li = &dst_vcpu->arch.local_int;
 
-       /* make sure that the new value is valid memory */
-       address = address & 0x7fffe000u;
-       if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
-          copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) {
+       /*
+        * Make sure the new value is valid memory. We only need to check the
+        * first page, since address is 8k aligned and memory pieces are always
+        * at least 1MB aligned and have at least a size of 1MB.
+        */
+       address &= 0x7fffe000u;
+       if (kvm_is_error_gpa(vcpu->kvm, address)) {
                *reg &= 0xffffffff00000000UL;
                *reg |= SIGP_STATUS_INVALID_PARAMETER;
                return SIGP_CC_STATUS_STORED;
@@ -456,3 +438,38 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
        kvm_s390_set_psw_cc(vcpu, rc);
        return 0;
 }
+
+/*
+ * Handle SIGP partial execution interception.
+ *
+ * This interception will occur at the source cpu when a source cpu sends an
+ * external call to a target cpu and the target cpu has the WAIT bit set in
+ * its cpuflags. Interception will occur after the interrupt indicator bits at
+ * the target cpu have been set. All error cases will lead to instruction
+ * interception, therefore nothing is to be checked or prepared.
+ */
+int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu)
+{
+       int r3 = vcpu->arch.sie_block->ipa & 0x000f;
+       u16 cpu_addr = vcpu->run->s.regs.gprs[r3];
+       struct kvm_vcpu *dest_vcpu;
+       u8 order_code = kvm_s390_get_base_disp_rs(vcpu);
+
+       trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
+
+       if (order_code == SIGP_EXTERNAL_CALL) {
+               dest_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+               BUG_ON(dest_vcpu == NULL);
+
+               spin_lock_bh(&dest_vcpu->arch.local_int.lock);
+               if (waitqueue_active(&dest_vcpu->wq))
+                       wake_up_interruptible(&dest_vcpu->wq);
+               dest_vcpu->preempted = true;
+               spin_unlock_bh(&dest_vcpu->arch.local_int.lock);
+
+               kvm_s390_set_psw_cc(vcpu, SIGP_CC_ORDER_CODE_ACCEPTED);
+               return 0;
+       }
+
+       return -EOPNOTSUPP;
+}
index 13f30f58a2df7d94ba8f91d8113f49c8d7938d27..647e9d6a4818ecb20ecf7e53e5199069e751bf24 100644 (file)
@@ -67,6 +67,27 @@ TRACE_EVENT(kvm_s390_destroy_vcpu,
            TP_printk("destroy cpu %d", __entry->id)
        );
 
+/*
+ * Trace point for start and stop of vcpus.
+ */
+TRACE_EVENT(kvm_s390_vcpu_start_stop,
+           TP_PROTO(unsigned int id, int state),
+           TP_ARGS(id, state),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int, id)
+                   __field(int, state)
+                   ),
+
+           TP_fast_assign(
+                   __entry->id = id;
+                   __entry->state = state;
+                   ),
+
+           TP_printk("%s cpu %d", __entry->state ? "starting" : "stopping",
+                     __entry->id)
+       );
+
 /*
  * Trace points for injection of interrupts, either per machine or
  * per vcpu.
@@ -223,6 +244,28 @@ TRACE_EVENT(kvm_s390_enable_css,
                      __entry->kvm)
        );
 
+/*
+ * Trace point for enabling and disabling interlocking-and-broadcasting
+ * suppression.
+ */
+TRACE_EVENT(kvm_s390_enable_disable_ibs,
+           TP_PROTO(unsigned int id, int state),
+           TP_ARGS(id, state),
+
+           TP_STRUCT__entry(
+                   __field(unsigned int, id)
+                   __field(int, state)
+                   ),
+
+           TP_fast_assign(
+                   __entry->id = id;
+                   __entry->state = state;
+                   ),
+
+           TP_printk("%s ibs on cpu %d",
+                     __entry->state ? "enabling" : "disabling", __entry->id)
+       );
+
 
 #endif /* _TRACE_KVMS390_H */
 
index e8e7213d4cc53a31c2b7f6e86bfffe6fa4a9bc67..916834d7a73a760da5b7fd9beba5c267ad437240 100644 (file)
@@ -2,7 +2,7 @@
 #define _TRACE_KVM_H
 
 #include <linux/tracepoint.h>
-#include <asm/sigp.h>
+#include <asm/sie.h>
 #include <asm/debug.h>
 #include <asm/dis.h>
 
        TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id,           \
                  __entry->pswmask, __entry->pswaddr, p_args)
 
+TRACE_EVENT(kvm_s390_skey_related_inst,
+           TP_PROTO(VCPU_PROTO_COMMON),
+           TP_ARGS(VCPU_ARGS_COMMON),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   ),
+           VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu")
+       );
+
 TRACE_EVENT(kvm_s390_major_guest_pfault,
            TP_PROTO(VCPU_PROTO_COMMON),
            TP_ARGS(VCPU_ARGS_COMMON),
@@ -111,17 +125,6 @@ TRACE_EVENT(kvm_s390_sie_fault,
            VCPU_TP_PRINTK("%s", "fault in sie instruction")
        );
 
-#define sie_intercept_code                             \
-       {0x04, "Instruction"},                          \
-       {0x08, "Program interruption"},                 \
-       {0x0C, "Instruction and program interruption"}, \
-       {0x10, "External request"},                     \
-       {0x14, "External interruption"},                \
-       {0x18, "I/O request"},                          \
-       {0x1C, "Wait state"},                           \
-       {0x20, "Validity"},                             \
-       {0x28, "Stop request"}
-
 TRACE_EVENT(kvm_s390_sie_exit,
            TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode),
            TP_ARGS(VCPU_ARGS_COMMON, icptcode),
@@ -151,7 +154,6 @@ TRACE_EVENT(kvm_s390_intercept_instruction,
            TP_STRUCT__entry(
                    VCPU_FIELD_COMMON
                    __field(__u64, instruction)
-                   __field(char, insn[8])
                    ),
 
            TP_fast_assign(
@@ -162,10 +164,8 @@ TRACE_EVENT(kvm_s390_intercept_instruction,
 
            VCPU_TP_PRINTK("intercepted instruction %016llx (%s)",
                           __entry->instruction,
-                          insn_to_mnemonic((unsigned char *)
-                                           &__entry->instruction,
-                                        __entry->insn, sizeof(__entry->insn)) ?
-                          "unknown" : __entry->insn)
+                          __print_symbolic(icpt_insn_decoder(__entry->instruction),
+                                           icpt_insn_codes))
        );
 
 /*
@@ -213,18 +213,6 @@ TRACE_EVENT(kvm_s390_intercept_validity,
  * Trace points for instructions that are of special interest.
  */
 
-#define sigp_order_codes                                       \
-       {SIGP_SENSE, "sense"},                                  \
-       {SIGP_EXTERNAL_CALL, "external call"},                  \
-       {SIGP_EMERGENCY_SIGNAL, "emergency signal"},            \
-       {SIGP_STOP, "stop"},                                    \
-       {SIGP_STOP_AND_STORE_STATUS, "stop and store status"},  \
-       {SIGP_SET_ARCHITECTURE, "set architecture"},            \
-       {SIGP_SET_PREFIX, "set prefix"},                        \
-       {SIGP_STORE_STATUS_AT_ADDRESS, "store status at addr"}, \
-       {SIGP_SENSE_RUNNING, "sense running"},                  \
-       {SIGP_RESTART, "restart"}
-
 TRACE_EVENT(kvm_s390_handle_sigp,
            TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \
                     __u32 parameter),
@@ -251,12 +239,28 @@ TRACE_EVENT(kvm_s390_handle_sigp,
                           __entry->cpu_addr, __entry->parameter)
        );
 
-#define diagnose_codes                         \
-       {0x10, "release pages"},                \
-       {0x44, "time slice end"},               \
-       {0x308, "ipl functions"},               \
-       {0x500, "kvm hypercall"},               \
-       {0x501, "kvm breakpoint"}
+TRACE_EVENT(kvm_s390_handle_sigp_pei,
+           TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr),
+           TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(__u8, order_code)
+                   __field(__u16, cpu_addr)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->order_code = order_code;
+                   __entry->cpu_addr = cpu_addr;
+                   ),
+
+           VCPU_TP_PRINTK("handle sigp pei order %02x (%s), cpu address %04x",
+                          __entry->order_code,
+                          __print_symbolic(__entry->order_code,
+                                           sigp_order_codes),
+                          __entry->cpu_addr)
+       );
 
 TRACE_EVENT(kvm_s390_handle_diag,
            TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
@@ -301,6 +305,31 @@ TRACE_EVENT(kvm_s390_handle_lctl,
                           __entry->reg1, __entry->reg3, __entry->addr)
        );
 
+TRACE_EVENT(kvm_s390_handle_stctl,
+           TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr),
+           TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(int, g)
+                   __field(int, reg1)
+                   __field(int, reg3)
+                   __field(u64, addr)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->g = g;
+                   __entry->reg1 = reg1;
+                   __entry->reg3 = reg3;
+                   __entry->addr = addr;
+                   ),
+
+           VCPU_TP_PRINTK("%s: storing cr %x-%x to %016llx",
+                          __entry->g ? "stctg" : "stctl",
+                          __entry->reg1, __entry->reg3, __entry->addr)
+       );
+
 TRACE_EVENT(kvm_s390_handle_prefix,
            TP_PROTO(VCPU_PROTO_COMMON, int set, u32 address),
            TP_ARGS(VCPU_ARGS_COMMON, set, address),
index 7881d4eb8b6bdf0891a89fa94b67bfebaab0a875..37b8241ec784ae8c2f9e32eb8a9868bb41e72e8a 100644 (file)
@@ -834,6 +834,7 @@ void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte)
        }
        spin_unlock(&gmap_notifier_lock);
 }
+EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);
 
 static inline int page_table_with_pgste(struct page *page)
 {
@@ -866,8 +867,7 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
        atomic_set(&page->_mapcount, 0);
        table = (unsigned long *) page_to_phys(page);
        clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
-       clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
-                   PAGE_SIZE/2);
+       clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
        return table;
 }
 
@@ -885,8 +885,8 @@ static inline void page_table_free_pgste(unsigned long *table)
        __free_page(page);
 }
 
-static inline unsigned long page_table_reset_pte(struct mm_struct *mm,
-                       pmd_t *pmd, unsigned long addr, unsigned long end)
+static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, unsigned long end, bool init_skey)
 {
        pte_t *start_pte, *pte;
        spinlock_t *ptl;
@@ -897,6 +897,22 @@ static inline unsigned long page_table_reset_pte(struct mm_struct *mm,
        do {
                pgste = pgste_get_lock(pte);
                pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
+               if (init_skey) {
+                       unsigned long address;
+
+                       pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
+                                             PGSTE_GR_BIT | PGSTE_GC_BIT);
+
+                       /* skip invalid and not writable pages */
+                       if (pte_val(*pte) & _PAGE_INVALID ||
+                           !(pte_val(*pte) & _PAGE_WRITE)) {
+                               pgste_set_unlock(pte, pgste);
+                               continue;
+                       }
+
+                       address = pte_val(*pte) & PAGE_MASK;
+                       page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
+               }
                pgste_set_unlock(pte, pgste);
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(start_pte, ptl);
@@ -904,8 +920,8 @@ static inline unsigned long page_table_reset_pte(struct mm_struct *mm,
        return addr;
 }
 
-static inline unsigned long page_table_reset_pmd(struct mm_struct *mm,
-                       pud_t *pud, unsigned long addr, unsigned long end)
+static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t *pud,
+                       unsigned long addr, unsigned long end, bool init_skey)
 {
        unsigned long next;
        pmd_t *pmd;
@@ -915,14 +931,14 @@ static inline unsigned long page_table_reset_pmd(struct mm_struct *mm,
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-               next = page_table_reset_pte(mm, pmd, addr, next);
+               next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
        } while (pmd++, addr = next, addr != end);
 
        return addr;
 }
 
-static inline unsigned long page_table_reset_pud(struct mm_struct *mm,
-                       pgd_t *pgd, unsigned long addr, unsigned long end)
+static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t *pgd,
+                       unsigned long addr, unsigned long end, bool init_skey)
 {
        unsigned long next;
        pud_t *pud;
@@ -932,28 +948,33 @@ static inline unsigned long page_table_reset_pud(struct mm_struct *mm,
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               next = page_table_reset_pmd(mm, pud, addr, next);
+               next = page_table_reset_pmd(mm, pud, addr, next, init_skey);
        } while (pud++, addr = next, addr != end);
 
        return addr;
 }
 
-void page_table_reset_pgste(struct mm_struct *mm,
-                       unsigned long start, unsigned long end)
+/*
+ * Walk the page tables of @mm from @start to @end and clear the PGSTE
+ * usage state of every pte that is reached.  With @init_skey set, the
+ * walk additionally resets the guest storage-key bits in the PGSTEs,
+ * gives valid writable pages the default storage key, and finally marks
+ * @mm as using storage keys.  If @mm already uses storage keys, an
+ * @init_skey request returns without touching the page tables.
+ *
+ * mmap_sem of @mm is held for writing across the whole walk.
+ */
+void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
+                           unsigned long end, bool init_skey)
 {
        unsigned long addr, next;
        pgd_t *pgd;
 
+       down_write(&mm->mmap_sem);
+       if (init_skey && mm_use_skey(mm))
+               goto out_up;
        addr = start;
-       down_read(&mm->mmap_sem);
        pgd = pgd_offset(mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               next = page_table_reset_pud(mm, pgd, addr, next);
+               next = page_table_reset_pud(mm, pgd, addr, next, init_skey);
        } while (pgd++, addr = next, addr != end);
-       up_read(&mm->mmap_sem);
+       /*
+        * Flag the mm that was actually walked (not current->mm), so the
+        * mm_use_skey(mm) check above sees it on the next call.
+        */
+       if (init_skey)
+               mm->context.use_skey = 1;
+out_up:
+       up_write(&mm->mmap_sem);
 }
 EXPORT_SYMBOL(page_table_reset_pgste);
 
@@ -991,7 +1012,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
        /* changing the guest storage key is considered a change of the page */
        if ((pgste_val(new) ^ pgste_val(old)) &
            (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
-               pgste_val(new) |= PGSTE_HC_BIT;
+               pgste_val(new) |= PGSTE_UC_BIT;
 
        pgste_set_unlock(ptep, new);
        pte_unmap_unlock(*ptep, ptl);
@@ -1013,6 +1034,11 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
        return NULL;
 }
 
+void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
+                           unsigned long end, bool init_skey)
+{
+}
+
 static inline void page_table_free_pgste(unsigned long *table)
 {
 }
@@ -1359,6 +1385,37 @@ int s390_enable_sie(void)
 }
 EXPORT_SYMBOL_GPL(s390_enable_sie);
 
+/*
+ * Enable storage key handling from now on and initialize the storage
+ * keys with the default key.
+ *
+ * Acts on current->mm over the range 0..TASK_SIZE by calling
+ * page_table_reset_pgste() with init_skey set to true.
+ */
+void s390_enable_skey(void)
+{
+       page_table_reset_pgste(current->mm, 0, TASK_SIZE, true);
+}
+EXPORT_SYMBOL_GPL(s390_enable_skey);
+
+/*
+ * Test and reset if a guest page is dirty
+ *
+ * @address is looked up in the page tables of gmap->mm.  Returns true
+ * if ptep_test_and_clear_user_dirty() reported the page as dirty, and
+ * false if it did not or if no pte could be obtained for @address.
+ */
+bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
+{
+       pte_t *pte;
+       spinlock_t *ptl;
+       bool dirty = false;
+
+       pte = get_locked_pte(gmap->mm, address, &ptl);
+       if (unlikely(!pte))
+               return false;
+
+       if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
+               dirty = true;
+
+       /* release the pte from get_locked_pte() with the paired helper */
+       pte_unmap_unlock(pte, ptl);
+       return dirty;
+}
+EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
                           pmd_t *pmdp)
index 24ec1216596e9de4f57ebd1f75ebe7a95e77160f..a04fe4eb237d3321b0ef7af916e3350864e009cd 100644 (file)
@@ -189,7 +189,6 @@ struct x86_emulate_ops {
        void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
        ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
        int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
-       void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
        int (*cpl)(struct x86_emulate_ctxt *ctxt);
        int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
        int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
index 7de069afb382e4d3f43febb087ac21979b28d087..49314155b66c801103ffb89b40585ac649393fdb 100644 (file)
                          | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
                          | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 
-#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
-#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
-#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
-#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS |   \
-                                 0xFFFFFF0000000000ULL)
+#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
 #define CR4_RESERVED_BITS                                               \
        (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
                          | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
@@ -134,7 +130,6 @@ enum kvm_reg_ex {
        VCPU_EXREG_PDPTR = NR_VCPU_REGS,
        VCPU_EXREG_CR3,
        VCPU_EXREG_RFLAGS,
-       VCPU_EXREG_CPL,
        VCPU_EXREG_SEGMENTS,
 };
 
index 58d66fe06b6170ad31fd6097e01505d321d23c46..8ba18842c48eac18aab3b90bfb9e0e3c292900a8 100644 (file)
@@ -74,6 +74,11 @@ dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
 #ifdef CONFIG_TRACING
 dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
+#else
+static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error)
+{
+       do_page_fault(regs, error);
+}
 #endif
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
 dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
index 0331cb389d6861481ea819c74067a08da785ccf7..7e97371387fdd80eaeb6c1fb587d0b48c1593511 100644 (file)
@@ -259,7 +259,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 
        switch (kvm_read_and_reset_pf_reason()) {
        default:
-               do_page_fault(regs, error_code);
+               trace_do_page_fault(regs, error_code);
                break;
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                /* page is swapped out by the host. */
index f47a104a749cd9d55c046c5752881ed553669d31..38a0afe83c6ba17822ca683ae570fec7ff65825e 100644 (file)
@@ -283,6 +283,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
        /* cpuid 1.ecx */
        const u32 kvm_supported_word4_x86_features =
+               /* NOTE: MONITOR (and MWAIT) are emulated as NOP,
+                * but *not* advertised to guests via CPUID ! */
                F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
                0 /* DS-CPL, VMX, SMX, EST */ |
                0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
@@ -495,6 +497,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->ecx &= kvm_supported_word6_x86_features;
                cpuid_mask(&entry->ecx, 6);
                break;
+       case 0x80000007: /* Advanced power management */
+               /* invariant TSC is CPUID.80000007H:EDX[8] */
+               entry->edx &= (1 << 8);
+               /* mask against host */
+               entry->edx &= boot_cpu_data.x86_power;
+               entry->eax = entry->ebx = entry->ecx = 0;
+               break;
        case 0x80000008: {
                unsigned g_phys_as = (entry->eax >> 16) & 0xff;
                unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
@@ -525,7 +534,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        case 3: /* Processor serial number */
        case 5: /* MONITOR/MWAIT */
        case 6: /* Thermal management */
-       case 0x80000007: /* Advanced power management */
        case 0xC0000002:
        case 0xC0000003:
        case 0xC0000004:
@@ -726,6 +734,7 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
 not_found:
        return 36;
 }
+EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
 
 /*
  * If no match is found, check whether we exceed the vCPU's limit
index eeecbed26ac7ce6097fd6669435d73b221f047b2..f9087315e0cdabd379cd92229da43e81b4f3ba91 100644 (file)
@@ -88,4 +88,11 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
        return best && (best->ecx & bit(X86_FEATURE_X2APIC));
 }
 
+static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+       return best && (best->edx & bit(X86_FEATURE_GBPAGES));
+}
 #endif
index 205b17eed93c2bfa0d402414a29bac9ed4fb3e5f..e4e833d3d7d7bb826907f7053d56c1c45748e476 100644 (file)
 #define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */
 #define NoWrite     ((u64)1 << 45)  /* No writeback */
 #define SrcWrite    ((u64)1 << 46)  /* Write back src operand */
+#define NoMod      ((u64)1 << 47)  /* Mod field is ignored */
 
 #define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)
 
@@ -1077,7 +1078,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
        ctxt->modrm_rm |= (ctxt->modrm & 0x07);
        ctxt->modrm_seg = VCPU_SREG_DS;
 
-       if (ctxt->modrm_mod == 3) {
+       if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
                op->type = OP_REG;
                op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
                op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
@@ -1324,7 +1325,8 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
                rc->end = n * size;
        }
 
-       if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
+       if (ctxt->rep_prefix && (ctxt->d & String) &&
+           !(ctxt->eflags & EFLG_DF)) {
                ctxt->dst.data = rc->data + rc->pos;
                ctxt->dst.type = OP_MEM_STR;
                ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1409,11 +1411,11 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 }
 
 /* Does not support long mode */
-static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-                                  u16 selector, int seg)
+static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+                                    u16 selector, int seg, u8 cpl, bool in_task_switch)
 {
        struct desc_struct seg_desc, old_desc;
-       u8 dpl, rpl, cpl;
+       u8 dpl, rpl;
        unsigned err_vec = GP_VECTOR;
        u32 err_code = 0;
        bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
@@ -1441,7 +1443,6 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
        }
 
        rpl = selector & 3;
-       cpl = ctxt->ops->cpl(ctxt);
 
        /* NULL selector is not valid for TR, CS and SS (except for long mode) */
        if ((seg == VCPU_SREG_CS
@@ -1486,6 +1487,9 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
                        goto exception;
                break;
        case VCPU_SREG_CS:
+               if (in_task_switch && rpl != dpl)
+                       goto exception;
+
                if (!(seg_desc.type & 8))
                        goto exception;
 
@@ -1543,6 +1547,13 @@ exception:
        return X86EMUL_PROPAGATE_FAULT;
 }
 
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+                                  u16 selector, int seg)
+{
+       u8 cpl = ctxt->ops->cpl(ctxt);
+       return __load_segment_descriptor(ctxt, selector, seg, cpl, false);
+}
+
 static void write_register_operand(struct operand *op)
 {
        /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
@@ -2404,6 +2415,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
                                 struct tss_segment_16 *tss)
 {
        int ret;
+       u8 cpl;
 
        ctxt->_eip = tss->ip;
        ctxt->eflags = tss->flag | 2;
@@ -2426,23 +2438,25 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
        set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
        set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
 
+       cpl = tss->cs & 3;
+
        /*
         * Now load segment descriptors. If fault happens at this stage
         * it is handled in a context of new task
         */
-       ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
+       ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
+       ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
+       ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
+       ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
+       ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
 
@@ -2496,7 +2510,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
                                struct tss_segment_32 *tss)
 {
-       tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
+       /* CR3 and the LDT selector are intentionally not saved */
        tss->eip = ctxt->_eip;
        tss->eflags = ctxt->eflags;
        tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
@@ -2514,13 +2528,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
        tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
        tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
        tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
-       tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
 }
 
 static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
                                 struct tss_segment_32 *tss)
 {
        int ret;
+       u8 cpl;
 
        if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
                return emulate_gp(ctxt, 0);
@@ -2539,7 +2553,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 
        /*
         * SDM says that segment selectors are loaded before segment
-        * descriptors
+        * descriptors.  This is important because CPL checks will
+        * use CS.RPL.
         */
        set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
        set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
@@ -2553,43 +2568,38 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
         * If we're switching between Protected Mode and VM86, we need to make
         * sure to update the mode before loading the segment descriptors so
         * that the selectors are interpreted correctly.
-        *
-        * Need to get rflags to the vcpu struct immediately because it
-        * influences the CPL which is checked at least when loading the segment
-        * descriptors and when pushing an error code to the new kernel stack.
-        *
-        * TODO Introduce a separate ctxt->ops->set_cpl callback
         */
-       if (ctxt->eflags & X86_EFLAGS_VM)
+       if (ctxt->eflags & X86_EFLAGS_VM) {
                ctxt->mode = X86EMUL_MODE_VM86;
-       else
+               cpl = 3;
+       } else {
                ctxt->mode = X86EMUL_MODE_PROT32;
-
-       ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+               cpl = tss->cs & 3;
+       }
 
        /*
         * Now load segment descriptors. If fault happenes at this stage
         * it is handled in a context of new task
         */
-       ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+       ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
+       ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
+       ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
+       ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
+       ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
+       ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
+       ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true);
        if (ret != X86EMUL_CONTINUE)
                return ret;
 
@@ -2604,6 +2614,8 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
        struct tss_segment_32 tss_seg;
        int ret;
        u32 new_tss_base = get_desc_base(new_desc);
+       u32 eip_offset = offsetof(struct tss_segment_32, eip);
+       u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
 
        ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
                            &ctxt->exception);
@@ -2613,8 +2625,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 
        save_state_to_tss32(ctxt, &tss_seg);
 
-       ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
-                            &ctxt->exception);
+       /* Only GP registers and segment selectors are saved */
+       ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
+                            ldt_sel_offset - eip_offset, &ctxt->exception);
        if (ret != X86EMUL_CONTINUE)
                /* FIXME: need to provide precise fault address */
                return ret;
@@ -3386,10 +3399,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
                ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
                if (efer & EFER_LMA)
                        rsvd = CR3_L_MODE_RESERVED_BITS;
-               else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
-                       rsvd = CR3_PAE_RESERVED_BITS;
-               else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
-                       rsvd = CR3_NONPAE_RESERVED_BITS;
 
                if (new_val & rsvd)
                        return emulate_gp(ctxt, 0);
@@ -3869,10 +3878,12 @@ static const struct opcode twobyte_table[256] = {
        N, N, N, N, N, N, N, N,
        D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM),
        /* 0x20 - 0x2F */
-       DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
-       DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
-       IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
-       IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
+       DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
+       DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
+       IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
+                                               check_cr_write),
+       IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
+                                               check_dr_write),
        N, N, N, N,
        GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
        GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
index 484bc874688b4ba2bfb8199702671779ffd69d2f..bd0da433e6d72471b259616980cfeac8748bd5cf 100644 (file)
@@ -113,6 +113,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 
        return kvm_get_apic_interrupt(v);       /* APIC */
 }
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
index 9736529ade08d727cbaa7db12a330ce86e5ea9c6..0069118581742d39da8d8aaada6f9f1cb2a14173 100644 (file)
@@ -360,6 +360,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
 
 static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
 {
+       /* Note that we never get here with APIC virtualization enabled.  */
+
        if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
                ++apic->isr_count;
        BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -371,12 +373,48 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
        apic->highest_isr_cache = vec;
 }
 
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+       int result;
+
+       /*
+        * Note that isr_count is always 1, and highest_isr_cache
+        * is always -1, with APIC virtualization enabled.
+        */
+       if (!apic->isr_count)
+               return -1;
+       if (likely(apic->highest_isr_cache != -1))
+               return apic->highest_isr_cache;
+
+       result = find_highest_vector(apic->regs + APIC_ISR);
+       ASSERT(result == -1 || result >= 16);
+
+       return result;
+}
+
 static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 {
-       if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+       struct kvm_vcpu *vcpu;
+       if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+               return;
+
+       vcpu = apic->vcpu;
+
+       /*
+        * We do get here for APIC virtualization enabled if the guest
+        * uses the Hyper-V APIC enlightenment.  In this case we may need
+        * to trigger a new interrupt delivery by writing the SVI field;
+        * on the other hand isr_count and highest_isr_cache are unused
+        * and must be left alone.
+        */
+       if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+               kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+                                              apic_find_highest_isr(apic));
+       else {
                --apic->isr_count;
-       BUG_ON(apic->isr_count < 0);
-       apic->highest_isr_cache = -1;
+               BUG_ON(apic->isr_count < 0);
+               apic->highest_isr_cache = -1;
+       }
 }
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
@@ -456,22 +494,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
        __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
 }
 
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
-       int result;
-
-       /* Note that isr_count is always 1 with vid enabled */
-       if (!apic->isr_count)
-               return -1;
-       if (likely(apic->highest_isr_cache != -1))
-               return apic->highest_isr_cache;
-
-       result = find_highest_vector(apic->regs + APIC_ISR);
-       ASSERT(result == -1 || result >= 16);
-
-       return result;
-}
-
 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1605,6 +1627,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
        int vector = kvm_apic_has_interrupt(vcpu);
        struct kvm_lapic *apic = vcpu->arch.apic;
 
+       /* Note that we never get here with APIC virtualization enabled.  */
+
        if (vector == -1)
                return -1;
 
index 813d31038b93bf8d7232db9435e2ee0e59996118..931467881da77f8ea025f2d74b5beaca3aac90a7 100644 (file)
@@ -22,6 +22,7 @@
 #include "mmu.h"
 #include "x86.h"
 #include "kvm_cache_regs.h"
+#include "cpuid.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -595,7 +596,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
         * we always atomicly update it, see the comments in
         * spte_has_volatile_bits().
         */
-       if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+       if (spte_is_locklessly_modifiable(old_spte) &&
+             !is_writable_pte(new_spte))
                ret = true;
 
        if (!shadow_accessed_mask)
@@ -1176,8 +1178,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 
 /*
  * Write-protect on the specified @sptep, @pt_protect indicates whether
- * spte writ-protection is caused by protecting shadow page table.
- * @flush indicates whether tlb need be flushed.
+ * spte write-protection is caused by protecting shadow page table.
  *
  * Note: write protection is difference between drity logging and spte
  * protection:
@@ -1186,10 +1187,9 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
  * - for spte protection, the spte can be writable only after unsync-ing
  *   shadow page.
  *
- * Return true if the spte is dropped.
+ * Return true if the tlb needs to be flushed.
  */
-static bool
-spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
 {
        u64 spte = *sptep;
 
@@ -1199,17 +1199,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
 
        rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
 
-       if (__drop_large_spte(kvm, sptep)) {
-               *flush |= true;
-               return true;
-       }
-
        if (pt_protect)
                spte &= ~SPTE_MMU_WRITEABLE;
        spte = spte & ~PT_WRITABLE_MASK;
 
-       *flush |= mmu_spte_update(sptep, spte);
-       return false;
+       return mmu_spte_update(sptep, spte);
 }
 
 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
@@ -1221,11 +1215,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
 
        for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
                BUG_ON(!(*sptep & PT_PRESENT_MASK));
-               if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
-                       sptep = rmap_get_first(*rmapp, &iter);
-                       continue;
-               }
 
+               flush |= spte_write_protect(kvm, sptep, pt_protect);
                sptep = rmap_get_next(&iter);
        }
 
@@ -2802,9 +2793,9 @@ static bool page_fault_can_be_fast(u32 error_code)
 }
 
 static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+                       u64 *sptep, u64 spte)
 {
-       struct kvm_mmu_page *sp = page_header(__pa(sptep));
        gfn_t gfn;
 
        WARN_ON(!sp->role.direct);
@@ -2830,6 +2821,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
                            u32 error_code)
 {
        struct kvm_shadow_walk_iterator iterator;
+       struct kvm_mmu_page *sp;
        bool ret = false;
        u64 spte = 0ull;
 
@@ -2853,7 +2845,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
                goto exit;
        }
 
-       if (!is_last_spte(spte, level))
+       sp = page_header(__pa(iterator.sptep));
+       if (!is_last_spte(spte, sp->role.level))
                goto exit;
 
        /*
@@ -2874,12 +2867,25 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
        if (!spte_is_locklessly_modifiable(spte))
                goto exit;
 
+       /*
+        * Do not fix write-permission on a large spte: we only mark the
+        * first page in the dirty bitmap in fast_pf_fix_direct_spte(), so
+        * the other pages would be missed if its slot is dirty-logged.
+        *
+        * Instead, we let the slow page fault path create a normal spte to
+        * fix the access.
+        *
+        * See the comments in kvm_arch_commit_memory_region().
+        */
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               goto exit;
+
        /*
         * Currently, fast page fault only works for direct mapping since
         * the gfn is not stable for indirect shadow page.
         * See Documentation/virtual/kvm/locking.txt to get more detail.
         */
-       ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+       ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
 exit:
        trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
                              spte, ret);
@@ -3511,11 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 {
        int maxphyaddr = cpuid_maxphyaddr(vcpu);
        u64 exb_bit_rsvd = 0;
+       u64 gbpages_bit_rsvd = 0;
 
        context->bad_mt_xwr = 0;
 
        if (!context->nx)
                exb_bit_rsvd = rsvd_bits(63, 63);
+       if (!guest_cpuid_has_gbpages(vcpu))
+               gbpages_bit_rsvd = rsvd_bits(7, 7);
        switch (context->root_level) {
        case PT32_ROOT_LEVEL:
                /* no rsvd bits for 2 level 4K page table entries */
@@ -3538,7 +3547,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
        case PT32E_ROOT_LEVEL:
                context->rsvd_bits_mask[0][2] =
                        rsvd_bits(maxphyaddr, 63) |
-                       rsvd_bits(7, 8) | rsvd_bits(1, 2);      /* PDPTE */
+                       rsvd_bits(5, 8) | rsvd_bits(1, 2);      /* PDPTE */
                context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 62);      /* PDE */
                context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
@@ -3550,16 +3559,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
                break;
        case PT64_ROOT_LEVEL:
                context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+                       rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
                context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+                       gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
                context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51);
                context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51);
                context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
                context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 51) |
+                       gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
                        rsvd_bits(13, 29);
                context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51) |
@@ -4304,15 +4313,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                        if (*rmapp)
                                __rmap_write_protect(kvm, rmapp, false);
 
-                       if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-                               kvm_flush_remote_tlbs(kvm);
+                       if (need_resched() || spin_needbreak(&kvm->mmu_lock))
                                cond_resched_lock(&kvm->mmu_lock);
-                       }
                }
        }
 
-       kvm_flush_remote_tlbs(kvm);
        spin_unlock(&kvm->mmu_lock);
+
+       /*
+        * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+        * both flush the tlb outside of mmu-lock, so they must be serialized
+        * by kvm->slots_lock; otherwise a tlb flush could be missed.
+        */
+       lockdep_assert_held(&kvm->slots_lock);
+
+       /*
+        * We can flush all the TLBs outside of the mmu lock without TLB
+        * corruption since we only change the spte from writable to
+        * readonly, so we only need to care about the case of changing
+        * the spte from present to present (changing the spte from present
+        * to nonpresent flushes all the TLBs immediately). In other
+        * words, the only case we care about is mmu_spte_update(), where
+        * we have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
+        * instead of PT_WRITABLE_MASK, which means it does not depend
+        * on PT_WRITABLE_MASK anymore.
+        */
+       kvm_flush_remote_tlbs(kvm);
 }
 
 #define BATCH_ZAP_PAGES        10
index 3842e70bdb7cf92f916acabcb5480b8b93367a60..b982112d2ca5a9e112e79981dacefe6904025063 100644 (file)
@@ -104,6 +104,39 @@ static inline int is_present_gpte(unsigned long pte)
        return pte & PT_PRESENT_MASK;
 }
 
+/*
+ * Currently, we have two sorts of write-protection: a) the first one
+ * write-protects guest pages to sync guest modifications, b) the other one
+ * is used to sync the dirty bitmap when we do KVM_GET_DIRTY_LOG. The
+ * differences between these two sorts are:
+ * 1) the first case clears the SPTE_MMU_WRITEABLE bit.
+ * 2) the first case requires flushing the tlb immediately to avoid corrupting
+ *    the shadow page table between all vcpus, so it must be done under the
+ *    protection of mmu-lock. The other case does not need to flush the tlb
+ *    until returning the dirty bitmap to userspace, since it only
+ *    write-protects the pages logged in the bitmap; that means no page in the
+ *    dirty bitmap is missed, so it can flush the tlb outside of mmu-lock.
+ *
+ * So, there is a problem: the first case can meet the corrupted tlb caused
+ * by the other case, which write-protects pages without flushing the tlb
+ * immediately. To make the first case aware of this problem, we let it
+ * flush the tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE
+ * bit is set; this works since the other case never touches that bit.
+ *
+ * Anyway, whenever a spte is updated (only permission and status bits are
+ * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * readonly, if that happens, we need to flush tlb. Fortunately,
+ * mmu_spte_update() has already handled it perfectly.
+ *
+ * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * - if we want to see if it has a writable tlb entry or if the spte can be
+ *   writable on the mmu mapping, check SPTE_MMU_WRITEABLE; this is the most
+ *   common case. Otherwise,
+ * - if we fix a page fault on the spte or do write-protection for dirty
+ *   logging, check PT_WRITABLE_MASK.
+ *
+ * TODO: introduce APIs to split these two cases.
+ */
 static inline int is_writable_pte(unsigned long pte)
 {
        return pte & PT_WRITABLE_MASK;
index 123efd3ec29f2e9e3bdadffe48c0beb0650222ae..410776528265898abaac0d6f8a816e7413c0edc4 100644 (file)
@@ -913,8 +913,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
  *   and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
  *   used by guest then tlbs are not flushed, so guest is allowed to access the
  *   freed pages.
- *   We set tlbs_dirty to let the notifier know this change and delay the flush
- *   until such a case actually happens.
+ *   And we increment kvm->tlbs_dirty to delay the tlb flush in this case.
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
@@ -943,7 +942,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                        return -EINVAL;
 
                if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
-                       vcpu->kvm->tlbs_dirty = true;
+                       vcpu->kvm->tlbs_dirty++;
                        continue;
                }
 
@@ -958,7 +957,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
                if (gfn != sp->gfns[i]) {
                        drop_spte(vcpu->kvm, &sp->spt[i]);
-                       vcpu->kvm->tlbs_dirty = true;
+                       vcpu->kvm->tlbs_dirty++;
                        continue;
                }
 
index 5c4f63151b4d90a405808bda13daa911c5b2a131..cbecaa90399c1fbb1555ea5309a43f392f36fda2 100644 (file)
@@ -108,7 +108,10 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
 {
        struct kvm_pmc *pmc = perf_event->overflow_handler_context;
        struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
-       __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+       if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+               __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+               kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+       }
 }
 
 static void kvm_perf_overflow_intr(struct perf_event *perf_event,
@@ -117,7 +120,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
        struct kvm_pmc *pmc = perf_event->overflow_handler_context;
        struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
        if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
-               kvm_perf_overflow(perf_event, data, regs);
+               __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
                kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
                /*
                 * Inject PMI. If vcpu was in a guest mode during NMI PMI
index 7f4f9c2badaefdf880b999fed48274748a808fd7..ec8366c5cfeaa2004a637465d9fc65787cc97618 100644 (file)
@@ -1338,21 +1338,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
 
-static void svm_update_cpl(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       int cpl;
-
-       if (!is_protmode(vcpu))
-               cpl = 0;
-       else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
-               cpl = 3;
-       else
-               cpl = svm->vmcb->save.cs.selector & 0x3;
-
-       svm->vmcb->save.cpl = cpl;
-}
-
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
        return to_svm(vcpu)->vmcb->save.rflags;
@@ -1360,11 +1345,12 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
-       unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
-
+       /*
+        * Any change of EFLAGS.VM is accompanied by a reload of SS
+        * (caused by either a task switch or an inter-privilege IRET),
+        * so we do not need to update the CPL here.
+        */
        to_svm(vcpu)->vmcb->save.rflags = rflags;
-       if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
-               svm_update_cpl(vcpu);
 }
 
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1631,8 +1617,15 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
                s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
                s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
        }
-       if (seg == VCPU_SREG_CS)
-               svm_update_cpl(vcpu);
+
+       /*
+        * This is always accurate, except if SYSRET returned to a segment
+        * with SS.DPL != 3.  Intel does not have this quirk, and always
+        * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+        * would entail passing the CPL to userspace and back.
+        */
+       if (seg == VCPU_SREG_SS)
+               svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
 
        mark_dirty(svm->vmcb, VMCB_SEG);
 }
@@ -2770,12 +2763,6 @@ static int xsetbv_interception(struct vcpu_svm *svm)
        return 1;
 }
 
-static int invalid_op_interception(struct vcpu_svm *svm)
-{
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-       return 1;
-}
-
 static int task_switch_interception(struct vcpu_svm *svm)
 {
        u16 tss_selector;
@@ -3287,6 +3274,24 @@ static int pause_interception(struct vcpu_svm *svm)
        return 1;
 }
 
+static int nop_interception(struct vcpu_svm *svm)
+{
+       skip_emulated_instruction(&(svm->vcpu));
+       return 1;
+}
+
+static int monitor_interception(struct vcpu_svm *svm)
+{
+       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+       return nop_interception(svm);
+}
+
+static int mwait_interception(struct vcpu_svm *svm)
+{
+       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+       return nop_interception(svm);
+}
+
 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
@@ -3344,8 +3349,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
        [SVM_EXIT_WBINVD]                       = emulate_on_interception,
-       [SVM_EXIT_MONITOR]                      = invalid_op_interception,
-       [SVM_EXIT_MWAIT]                        = invalid_op_interception,
+       [SVM_EXIT_MONITOR]                      = monitor_interception,
+       [SVM_EXIT_MWAIT]                        = mwait_interception,
        [SVM_EXIT_XSETBV]                       = xsetbv_interception,
        [SVM_EXIT_NPF]                          = pf_interception,
 };
index 545245d7cc63f3128879724a1fc2811c31bdf55c..33574c95220d1a0632c738eed1ab7a1b0679b6f3 100644 (file)
@@ -91,16 +91,21 @@ TRACE_EVENT(kvm_hv_hypercall,
 /*
  * Tracepoint for PIO.
  */
+
+#define KVM_PIO_IN   0
+#define KVM_PIO_OUT  1
+
 TRACE_EVENT(kvm_pio,
        TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
-                unsigned int count),
-       TP_ARGS(rw, port, size, count),
+                unsigned int count, void *data),
+       TP_ARGS(rw, port, size, count, data),
 
        TP_STRUCT__entry(
                __field(        unsigned int,   rw              )
                __field(        unsigned int,   port            )
                __field(        unsigned int,   size            )
                __field(        unsigned int,   count           )
+               __field(        unsigned int,   val             )
        ),
 
        TP_fast_assign(
@@ -108,11 +113,18 @@ TRACE_EVENT(kvm_pio,
                __entry->port           = port;
                __entry->size           = size;
                __entry->count          = count;
+               if (size == 1)
+                       __entry->val    = *(unsigned char *)data;
+               else if (size == 2)
+                       __entry->val    = *(unsigned short *)data;
+               else
+                       __entry->val    = *(unsigned int *)data;
        ),
 
-       TP_printk("pio_%s at 0x%x size %d count %d",
+       TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s",
                  __entry->rw ? "write" : "read",
-                 __entry->port, __entry->size, __entry->count)
+                 __entry->port, __entry->size, __entry->count, __entry->val,
+                 __entry->count > 1 ? "(...)" : "")
 );
 
 /*
index 138ceffc6377bba9e2c4fb5529ad379f81b0bec6..801332edefc3cda08306ac8f0726db320160ce03 100644 (file)
@@ -354,6 +354,7 @@ struct vmcs02_list {
 struct nested_vmx {
        /* Has the level1 guest done vmxon? */
        bool vmxon;
+       gpa_t vmxon_ptr;
 
        /* The guest-physical address of the current VMCS L1 keeps for L2 */
        gpa_t current_vmptr;
@@ -413,7 +414,6 @@ struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        unsigned long         host_rsp;
        u8                    fail;
-       u8                    cpl;
        bool                  nmi_known_unmasked;
        u32                   exit_intr_info;
        u32                   idt_vectoring_info;
@@ -2283,7 +2283,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
        rdmsr(MSR_IA32_VMX_EXIT_CTLS,
                nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
        nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
-       /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+
        nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
@@ -2291,7 +2291,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
        nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
-               VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+               VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
+
        if (vmx_mpx_supported())
                nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
@@ -2353,12 +2354,11 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                         VMX_EPT_INVEPT_BIT;
                nested_vmx_ept_caps &= vmx_capability.ept;
                /*
-                * Since invept is completely emulated we support both global
-                * and context invalidation independent of what host cpu
-                * supports
+                * For nested guests, we don't do anything specific
+                * for single context invalidation. Hence, only advertise
+                * support for global context invalidation.
                 */
-               nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
-                       VMX_EPT_EXTENT_CONTEXT_BIT;
+               nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
        } else
                nested_vmx_ept_caps = 0;
 
@@ -3186,10 +3186,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
        fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
        fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
        fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
-
-       /* CPL is always 0 when CPU enters protected mode */
-       __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
-       vmx->cpl = 0;
 }
 
 static void fix_rmode_seg(int seg, struct kvm_segment *save)
@@ -3591,22 +3587,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       if (!is_protmode(vcpu))
+       if (unlikely(vmx->rmode.vm86_active))
                return 0;
-
-       if (!is_long_mode(vcpu)
-           && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
-               return 3;
-
-       if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
-               __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
-               vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
+       else {
+               int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+               return AR_DPL(ar);
        }
-
-       return vmx->cpl;
 }
 
-
 static u32 vmx_segment_access_rights(struct kvm_segment *var)
 {
        u32 ar;
@@ -3634,8 +3622,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
 
        vmx_segment_cache_clear(vmx);
-       if (seg == VCPU_SREG_CS)
-               __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
 
        if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
                vmx->rmode.segs[seg] = *var;
@@ -4564,6 +4550,16 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
                PIN_BASED_EXT_INTR_MASK;
 }
 
+/*
+ * In nested virtualization, check if L1 has set
+ * VM_EXIT_ACK_INTR_ON_EXIT
+ */
+static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
+{
+       return get_vmcs12(vcpu)->vm_exit_controls &
+               VM_EXIT_ACK_INTR_ON_EXIT;
+}
+
 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
 {
        return get_vmcs12(vcpu)->pin_based_vm_exec_control &
@@ -4878,6 +4874,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
                        vcpu->arch.dr6 &= ~15;
                        vcpu->arch.dr6 |= dr6;
+                       if (!(dr6 & ~DR6_RESERVED)) /* icebp */
+                               skip_emulated_instruction(vcpu);
+
                        kvm_queue_exception(vcpu, DB_VECTOR);
                        return 1;
                }
@@ -5166,7 +5165,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                        return 1;
                kvm_register_write(vcpu, reg, val);
        } else
-               if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]))
+               if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)))
                        return 1;
 
        skip_emulated_instruction(vcpu);
@@ -5439,7 +5438,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
        }
 
        /* clear all local breakpoint enable flags */
-       vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
+       vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
 
        /*
         * TODO: What about debug traps on tss switch?
@@ -5565,6 +5564,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
        gpa_t gpa;
 
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+       if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+               skip_emulated_instruction(vcpu);
+               return 1;
+       }
 
        ret = handle_mmio_page_fault_common(vcpu, gpa, true);
        if (likely(ret == RET_MMIO_PF_EMULATE))
@@ -5669,12 +5672,24 @@ static int handle_pause(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
+static int handle_nop(struct kvm_vcpu *vcpu)
 {
-       kvm_queue_exception(vcpu, UD_VECTOR);
+       skip_emulated_instruction(vcpu);
        return 1;
 }
 
+static int handle_mwait(struct kvm_vcpu *vcpu)
+{
+       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+       return handle_nop(vcpu);
+}
+
+static int handle_monitor(struct kvm_vcpu *vcpu)
+{
+       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+       return handle_nop(vcpu);
+}
+
 /*
  * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
  * We could reuse a single VMCS for all the L2 guests, but we also want the
@@ -5811,6 +5826,154 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
        return HRTIMER_NORESTART;
 }
 
+/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD or #GP.
+ */
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
+                                unsigned long exit_qualification,
+                                u32 vmx_instruction_info, gva_t *ret)
+{
+       /*
+        * According to Vol. 3B, "Information for VM Exits Due to Instruction
+        * Execution", on an exit, vmx_instruction_info holds most of the
+        * addressing components of the operand. Only the displacement part
+        * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+        * For how an actual address is calculated from all these components,
+        * refer to Vol. 1, "Operand Addressing".
+        */
+       int  scaling = vmx_instruction_info & 3;
+       int  addr_size = (vmx_instruction_info >> 7) & 7;
+       bool is_reg = vmx_instruction_info & (1u << 10);
+       int  seg_reg = (vmx_instruction_info >> 15) & 7;
+       int  index_reg = (vmx_instruction_info >> 18) & 0xf;
+       bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+       int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
+       bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
+
+       if (is_reg) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       /* Addr = segment_base + offset */
+       /* offset = base + [index * scale] + displacement */
+       *ret = vmx_get_segment_base(vcpu, seg_reg);
+       if (base_is_valid)
+               *ret += kvm_register_read(vcpu, base_reg);
+       if (index_is_valid)
+               *ret += kvm_register_read(vcpu, index_reg)<<scaling;
+       *ret += exit_qualification; /* holds the displacement */
+
+       if (addr_size == 1) /* 32 bit */
+               *ret &= 0xffffffff;
+
+       /*
+        * TODO: throw #GP (and return 1) in various cases that the VM*
+        * instructions require it - e.g., offset beyond segment limit,
+        * unusable or unreadable/unwritable segment, non-canonical 64-bit
+        * address, and so on. Currently these are not checked.
+        */
+       return 0;
+}
+
+/*
+ * This function performs the various checks including
+ * - if it's 4KB aligned
+ * - No bits beyond the physical address width are set
+ * - Returns 0 on success or else 1
+ * (Intel SDM Section 30.3)
+ */
+static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
+                                 gpa_t *vmpointer)
+{
+       gva_t gva;
+       gpa_t vmptr;
+       struct x86_exception e;
+       struct page *page;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+                       vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+               return 1;
+
+       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+                               sizeof(vmptr), &e)) {
+               kvm_inject_page_fault(vcpu, &e);
+               return 1;
+       }
+
+       switch (exit_reason) {
+       case EXIT_REASON_VMON:
+               /*
+                * SDM 3: 24.11.5
+                * The first 4 bytes of VMXON region contain the supported
+                * VMCS revision identifier
+                *
+                * Note - IA32_VMX_BASIC[48], which would limit the
+                * physical address width to 32 bits, will never be 1
+                * for the nested case
+                *
+                */
+               if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+                       nested_vmx_failInvalid(vcpu);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+
+               page = nested_get_page(vcpu, vmptr);
+               if (page == NULL ||
+                   *(u32 *)kmap(page) != VMCS12_REVISION) {
+                       nested_vmx_failInvalid(vcpu);
+                       kunmap(page);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+               kunmap(page);
+               vmx->nested.vmxon_ptr = vmptr;
+               break;
+       case EXIT_REASON_VMCLEAR:
+               if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+                       nested_vmx_failValid(vcpu,
+                                            VMXERR_VMCLEAR_INVALID_ADDRESS);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+
+               if (vmptr == vmx->nested.vmxon_ptr) {
+                       nested_vmx_failValid(vcpu,
+                                            VMXERR_VMCLEAR_VMXON_POINTER);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+               break;
+       case EXIT_REASON_VMPTRLD:
+               if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+                       nested_vmx_failValid(vcpu,
+                                            VMXERR_VMPTRLD_INVALID_ADDRESS);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+
+               if (vmptr == vmx->nested.vmxon_ptr) {
+                       nested_vmx_failValid(vcpu,
+                                            VMXERR_VMCLEAR_VMXON_POINTER);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+               break;
+       default:
+               return 1; /* shouldn't happen */
+       }
+
+       if (vmpointer)
+               *vmpointer = vmptr;
+       return 0;
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -5849,6 +6012,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
+
+       if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+               return 1;
+
        if (vmx->nested.vmxon) {
                nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
                skip_emulated_instruction(vcpu);
@@ -5971,87 +6138,19 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-/*
- * Decode the memory-address operand of a vmx instruction, as recorded on an
- * exit caused by such an instruction (run by a guest hypervisor).
- * On success, returns 0. When the operand is invalid, returns 1 and throws
- * #UD or #GP.
- */
-static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
-                                unsigned long exit_qualification,
-                                u32 vmx_instruction_info, gva_t *ret)
-{
-       /*
-        * According to Vol. 3B, "Information for VM Exits Due to Instruction
-        * Execution", on an exit, vmx_instruction_info holds most of the
-        * addressing components of the operand. Only the displacement part
-        * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
-        * For how an actual address is calculated from all these components,
-        * refer to Vol. 1, "Operand Addressing".
-        */
-       int  scaling = vmx_instruction_info & 3;
-       int  addr_size = (vmx_instruction_info >> 7) & 7;
-       bool is_reg = vmx_instruction_info & (1u << 10);
-       int  seg_reg = (vmx_instruction_info >> 15) & 7;
-       int  index_reg = (vmx_instruction_info >> 18) & 0xf;
-       bool index_is_valid = !(vmx_instruction_info & (1u << 22));
-       int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
-       bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
-
-       if (is_reg) {
-               kvm_queue_exception(vcpu, UD_VECTOR);
-               return 1;
-       }
-
-       /* Addr = segment_base + offset */
-       /* offset = base + [index * scale] + displacement */
-       *ret = vmx_get_segment_base(vcpu, seg_reg);
-       if (base_is_valid)
-               *ret += kvm_register_read(vcpu, base_reg);
-       if (index_is_valid)
-               *ret += kvm_register_read(vcpu, index_reg)<<scaling;
-       *ret += exit_qualification; /* holds the displacement */
-
-       if (addr_size == 1) /* 32 bit */
-               *ret &= 0xffffffff;
-
-       /*
-        * TODO: throw #GP (and return 1) in various cases that the VM*
-        * instructions require it - e.g., offset beyond segment limit,
-        * unusable or unreadable/unwritable segment, non-canonical 64-bit
-        * address, and so on. Currently these are not checked.
-        */
-       return 0;
-}
-
 /* Emulate the VMCLEAR instruction */
 static int handle_vmclear(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       gva_t gva;
        gpa_t vmptr;
        struct vmcs12 *vmcs12;
        struct page *page;
-       struct x86_exception e;
 
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-                       vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
-               return 1;
-
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
-                               sizeof(vmptr), &e)) {
-               kvm_inject_page_fault(vcpu, &e);
-               return 1;
-       }
-
-       if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
-               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
-               skip_emulated_instruction(vcpu);
+       if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
                return 1;
-       }
 
        if (vmptr == vmx->nested.current_vmptr) {
                nested_release_vmcs12(vmx);
@@ -6372,29 +6471,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       gva_t gva;
        gpa_t vmptr;
-       struct x86_exception e;
        u32 exec_control;
 
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-                       vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
-               return 1;
-
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
-                               sizeof(vmptr), &e)) {
-               kvm_inject_page_fault(vcpu, &e);
-               return 1;
-       }
-
-       if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
-               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
-               skip_emulated_instruction(vcpu);
+       if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
                return 1;
-       }
 
        if (vmx->nested.current_vmptr != vmptr) {
                struct vmcs12 *new_vmcs12;
@@ -6471,7 +6555,6 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        struct {
                u64 eptp, gpa;
        } operand;
-       u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
 
        if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
            !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
@@ -6511,16 +6594,13 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        }
 
        switch (type) {
-       case VMX_EPT_EXTENT_CONTEXT:
-               if ((operand.eptp & eptp_mask) !=
-                               (nested_ept_get_cr3(vcpu) & eptp_mask))
-                       break;
        case VMX_EPT_EXTENT_GLOBAL:
                kvm_mmu_sync_roots(vcpu);
                kvm_mmu_flush_tlb(vcpu);
                nested_vmx_succeed(vcpu);
                break;
        default:
+               /* Trap single context invalidation invept calls */
                BUG_ON(1);
                break;
        }
@@ -6571,8 +6651,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-       [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
-       [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+       [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
+       [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
        [EXIT_REASON_INVEPT]                  = handle_invept,
 };
 
@@ -7413,7 +7493,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
                                  | (1 << VCPU_EXREG_RFLAGS)
-                                 | (1 << VCPU_EXREG_CPL)
                                  | (1 << VCPU_EXREG_PDPTR)
                                  | (1 << VCPU_EXREG_SEGMENTS)
                                  | (1 << VCPU_EXREG_CR3));
@@ -8601,6 +8680,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
                       exit_qualification);
 
+       if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+           && nested_exit_intr_ack_set(vcpu)) {
+               int irq = kvm_cpu_get_interrupt(vcpu);
+               WARN_ON(irq < 0);
+               vmcs12->vm_exit_intr_info = irq |
+                       INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
+       }
+
        trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
                                       vmcs12->exit_qualification,
                                       vmcs12->idt_vectoring_info_field,
index 20316c67b824a9d06baa89cc0cf905eae3f24665..f32a02578c0d1985b424edaf992aede89b5cd7b4 100644 (file)
@@ -704,25 +704,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        }
 
        if (is_long_mode(vcpu)) {
-               if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
-                       if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
-                               return 1;
-               } else
-                       if (cr3 & CR3_L_MODE_RESERVED_BITS)
-                               return 1;
-       } else {
-               if (is_pae(vcpu)) {
-                       if (cr3 & CR3_PAE_RESERVED_BITS)
-                               return 1;
-                       if (is_paging(vcpu) &&
-                           !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
-                               return 1;
-               }
-               /*
-                * We don't check reserved bits in nonpae mode, because
-                * this isn't enforced, and VMware depends on this.
-                */
-       }
+               if (cr3 & CR3_L_MODE_RESERVED_BITS)
+                       return 1;
+       } else if (is_pae(vcpu) && is_paging(vcpu) &&
+                  !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+               return 1;
 
        vcpu->arch.cr3 = cr3;
        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
@@ -1935,6 +1921,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 
                if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
                        vcpu->arch.hv_vapic = data;
+                       if (kvm_lapic_enable_pv_eoi(vcpu, 0))
+                               return 1;
                        break;
                }
                gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
@@ -1945,6 +1933,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                        return 1;
                vcpu->arch.hv_vapic = data;
                mark_page_dirty(vcpu->kvm, gfn);
+               if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
+                       return 1;
                break;
        }
        case HV_X64_MSR_EOI:
@@ -2647,6 +2637,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_IRQ_INJECT_STATUS:
        case KVM_CAP_IRQFD:
        case KVM_CAP_IOEVENTFD:
+       case KVM_CAP_IOEVENTFD_NO_LENGTH:
        case KVM_CAP_PIT2:
        case KVM_CAP_PIT_STATE2:
        case KVM_CAP_SET_IDENTITY_MAP_ADDR:
@@ -3649,11 +3640,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
                offset = i * BITS_PER_LONG;
                kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
        }
-       if (is_dirty)
-               kvm_flush_remote_tlbs(kvm);
 
        spin_unlock(&kvm->mmu_lock);
 
+       /* See the comments in kvm_mmu_slot_remove_write_access(). */
+       lockdep_assert_held(&kvm->slots_lock);
+
+       /*
+        * All the TLBs can be flushed out of mmu lock, see the comments in
+        * kvm_mmu_slot_remove_write_access().
+        */
+       if (is_dirty)
+               kvm_flush_remote_tlbs(kvm);
+
        r = -EFAULT;
        if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
                goto out;
@@ -4489,8 +4488,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
                               unsigned short port, void *val,
                               unsigned int count, bool in)
 {
-       trace_kvm_pio(!in, port, size, count);
-
        vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = in;
        vcpu->arch.pio.count  = count;
@@ -4525,6 +4522,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
        if (ret) {
 data_avail:
                memcpy(val, vcpu->arch.pio_data, size * count);
+               trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
                vcpu->arch.pio.count = 0;
                return 1;
        }
@@ -4539,6 +4537,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
        memcpy(vcpu->arch.pio_data, val, size * count);
+       trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
        return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
 }
 
@@ -4650,11 +4649,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
        return res;
 }
 
-static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
-{
-       kvm_set_rflags(emul_to_vcpu(ctxt), val);
-}
-
 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 {
        return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4839,7 +4833,6 @@ static const struct x86_emulate_ops emulate_ops = {
        .set_idt             = emulator_set_idt,
        .get_cr              = emulator_get_cr,
        .set_cr              = emulator_set_cr,
-       .set_rflags          = emulator_set_rflags,
        .cpl                 = emulator_get_cpl,
        .get_dr              = emulator_get_dr,
        .set_dr              = emulator_set_dr,
@@ -4905,7 +4898,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
        ctxt->eip = kvm_rip_read(vcpu);
        ctxt->mode = (!is_protmode(vcpu))               ? X86EMUL_MODE_REAL :
                     (ctxt->eflags & X86_EFLAGS_VM)     ? X86EMUL_MODE_VM86 :
-                    cs_l                               ? X86EMUL_MODE_PROT64 :
+                    (cs_l && is_long_mode(vcpu))       ? X86EMUL_MODE_PROT64 :
                     cs_db                              ? X86EMUL_MODE_PROT32 :
                                                          X86EMUL_MODE_PROT16;
        ctxt->guest_mode = is_guest_mode(vcpu);
@@ -7333,8 +7326,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
        /*
         * Write protect all pages for dirty logging.
-        * Existing largepage mappings are destroyed here and new ones will
-        * not be created until the end of the logging.
+        *
+        * All the sptes including the large sptes which point to this
+        * slot are set to readonly. We can not create any new large
+        * spte on this slot until the end of the logging.
+        *
+        * See the comments in fast_page_fault().
         */
        if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
                kvm_mmu_slot_remove_write_access(kvm, mem->slot);
index 14196ea0fdf3ad878b6fe527733d8390f0ba6a25..1918d9dff45d0dfcca43cafe4bd762008c4448f6 100644 (file)
@@ -22,11 +22,14 @@ struct read_info_sccb {
        u8      rnsize;                 /* 10 */
        u8      _reserved0[16 - 11];    /* 11-15 */
        u16     ncpurl;                 /* 16-17 */
-       u8      _reserved7[24 - 18];    /* 18-23 */
+       u16     cpuoff;                 /* 18-19 */
+       u8      _reserved7[24 - 20];    /* 20-23 */
        u8      loadparm[8];            /* 24-31 */
        u8      _reserved1[48 - 32];    /* 32-47 */
        u64     facilities;             /* 48-55 */
-       u8      _reserved2[84 - 56];    /* 56-83 */
+       u8      _reserved2a[76 - 56];   /* 56-75 */
+       u32     ibc;                    /* 76-79 */
+       u8      _reserved2b[84 - 80];   /* 80-83 */
        u8      fac84;                  /* 84 */
        u8      fac85;                  /* 85 */
        u8      _reserved3[91 - 86];    /* 86-90 */
@@ -45,6 +48,8 @@ static unsigned int sclp_con_has_linemode __initdata;
 static unsigned long sclp_hsa_size;
 static unsigned int sclp_max_cpu;
 static struct sclp_ipl_info sclp_ipl_info;
+static unsigned char sclp_siif;
+static u32 sclp_ibc;
 
 u64 sclp_facilities;
 u8 sclp_fac84;
@@ -96,6 +101,9 @@ static int __init sclp_read_info_early(struct read_info_sccb *sccb)
 
 static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 {
+       struct sclp_cpu_entry *cpue;
+       u16 boot_cpu_address, cpu;
+
        if (sclp_read_info_early(sccb))
                return;
 
@@ -106,6 +114,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
        sclp_rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2;
        sclp_rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2;
        sclp_rzm <<= 20;
+       sclp_ibc = sccb->ibc;
 
        if (!sccb->hcpua) {
                if (MACHINE_IS_VM)
@@ -116,6 +125,15 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
                sclp_max_cpu = sccb->hcpua + 1;
        }
 
+       boot_cpu_address = stap();
+       cpue = (void *)sccb + sccb->cpuoff;
+       for (cpu = 0; cpu < sccb->ncpurl; cpue++, cpu++) {
+               if (boot_cpu_address != cpue->address)
+                       continue;
+               sclp_siif = cpue->siif;
+               break;
+       }
+
        /* Save IPL information */
        sclp_ipl_info.is_valid = 1;
        if (sccb->flags & 0x2)
@@ -148,6 +166,18 @@ unsigned int sclp_get_max_cpu(void)
        return sclp_max_cpu;
 }
 
+int sclp_has_siif(void)        /* report the siif value cached in sclp_siif */
+{
+       return sclp_siif;       /* set from the boot CPU's entry in sclp_facilities_detect() */
+}
+EXPORT_SYMBOL(sclp_has_siif);
+
+unsigned int sclp_get_ibc(void)        /* report the ibc word cached in sclp_ibc */
+{
+       return sclp_ibc;        /* sccb->ibc as saved by sclp_facilities_detect() */
+}
+EXPORT_SYMBOL(sclp_get_ibc);
+
 /*
  * This function will be called after sclp_facilities_detect(), which gets
  * called from early.c code. The sclp_facilities_detect() function retrieves
index 7d21cf9f43806cae2c6b14363e83cd628366acdb..970c68197c698898df483198995d9a0c558c0770 100644 (file)
@@ -134,6 +134,8 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_EPR_EXIT          20
 #define KVM_REQ_SCAN_IOAPIC       21
 #define KVM_REQ_GLOBAL_CLOCK_UPDATE 22
+#define KVM_REQ_ENABLE_IBS        23
+#define KVM_REQ_DISABLE_IBS       24
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID            0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID       1
@@ -163,6 +165,7 @@ enum kvm_bus {
        KVM_MMIO_BUS,
        KVM_PIO_BUS,
        KVM_VIRTIO_CCW_NOTIFY_BUS,
+       KVM_FAST_MMIO_BUS,
        KVM_NR_BUSES
 };
 
@@ -367,6 +370,7 @@ struct kvm {
        struct mm_struct *mm; /* userspace tied to this vm */
        struct kvm_memslots *memslots;
        struct srcu_struct srcu;
+       struct srcu_struct irq_srcu;
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
        u32 bsp_vcpu_id;
 #endif
@@ -410,9 +414,7 @@ struct kvm {
        unsigned long mmu_notifier_seq;
        long mmu_notifier_count;
 #endif
-       /* Protected by mmu_lock */
-       bool tlbs_dirty;
-
+       long tlbs_dirty;
        struct list_head devices;
 };
 
@@ -879,6 +881,13 @@ static inline hpa_t pfn_to_hpa(pfn_t pfn)
        return (hpa_t)pfn << PAGE_SHIFT;
 }
 
+static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa)
+{       /* Tell whether translating @gpa to a host virtual address fails. */
+       unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));   /* gpa -> gfn -> hva */
+
+       return kvm_is_error_hva(hva);   /* true when the lookup produced an error hva */
+}
+
 static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
        set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
index 6929571b79b0c396496991fc136a7ff811c9177d..24e9033f8b3f9f10fde3467b9122d59c9621edcd 100644 (file)
@@ -317,6 +317,7 @@ header-y += ppp-ioctl.h
 header-y += ppp_defs.h
 header-y += pps.h
 header-y += prctl.h
+header-y += psci.h
 header-y += ptp_clock.h
 header-y += ptrace.h
 header-y += qnx4_fs.h
index a8f4ee5d2e8242507601d0f3cb81417e39daf40b..e11d8f170a62962c0bb62c9f5a28ffae88de08cf 100644 (file)
@@ -171,6 +171,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_WATCHDOG         21
 #define KVM_EXIT_S390_TSCH        22
 #define KVM_EXIT_EPR              23
+#define KVM_EXIT_SYSTEM_EVENT     24
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -301,6 +302,13 @@ struct kvm_run {
                struct {
                        __u32 epr;
                } epr;
+               /* KVM_EXIT_SYSTEM_EVENT */
+               struct {
+#define KVM_SYSTEM_EVENT_SHUTDOWN       1
+#define KVM_SYSTEM_EVENT_RESET          2
+                       __u32 type;
+                       __u64 flags;
+               } system_event;
                /* Fix the size of the union. */
                char padding[256];
        };
@@ -416,6 +424,8 @@ struct kvm_s390_psw {
 #define KVM_S390_INT_PFAULT_INIT       0xfffe0004u
 #define KVM_S390_INT_PFAULT_DONE       0xfffe0005u
 #define KVM_S390_MCHK                  0xfffe1000u
+#define KVM_S390_INT_CLOCK_COMP                0xffff1004u
+#define KVM_S390_INT_CPU_TIMER         0xffff1005u
 #define KVM_S390_INT_VIRTIO            0xffff2603u
 #define KVM_S390_INT_SERVICE           0xffff2401u
 #define KVM_S390_INT_EMERGENCY         0xffff1201u
@@ -515,6 +525,7 @@ enum {
        kvm_ioeventfd_flag_nr_pio,
        kvm_ioeventfd_flag_nr_deassign,
        kvm_ioeventfd_flag_nr_virtio_ccw_notify,
+       kvm_ioeventfd_flag_nr_fast_mmio,
        kvm_ioeventfd_flag_nr_max,
 };
 
@@ -529,7 +540,7 @@ enum {
 struct kvm_ioeventfd {
        __u64 datamatch;
        __u64 addr;        /* legal pio/mmio address */
-       __u32 len;         /* 1, 2, 4, or 8 bytes    */
+       __u32 len;         /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
        __s32 fd;
        __u32 flags;
        __u8  pad[36];
@@ -743,6 +754,10 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IOAPIC_POLARITY_IGNORED 97
 #define KVM_CAP_ENABLE_CAP_VM 98
 #define KVM_CAP_S390_IRQCHIP 99
+#define KVM_CAP_IOEVENTFD_NO_LENGTH 100
+#define KVM_CAP_VM_ATTRIBUTES 101
+#define KVM_CAP_ARM_PSCI_0_2 102
+#define KVM_CAP_PPC_FIXUP_HCALL 103
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h
new file mode 100644 (file)
index 0000000..310d83e
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * ARM Power State and Coordination Interface (PSCI) header
+ *
+ * This header holds common PSCI defines and macros shared
+ * by: ARM kernel, ARM64 kernel, KVM ARM/ARM64 and user space.
+ *
+ * Copyright (C) 2014 Linaro Ltd.
+ * Author: Anup Patel <anup.patel@linaro.org>
+ */
+
+#ifndef _UAPI_LINUX_PSCI_H
+#define _UAPI_LINUX_PSCI_H
+
+/*
+ * PSCI v0.1 interface
+ *
+ * The PSCI v0.1 function numbers are implementation defined.
+ *
+ * Only PSCI return values such as: SUCCESS, NOT_SUPPORTED,
+ * INVALID_PARAMS, and DENIED defined below are applicable
+ * to PSCI v0.1.
+ */
+
+/* PSCI v0.2 interface: each function ID is a base value plus a per-function index */
+#define PSCI_0_2_FN_BASE                       0x84000000
+#define PSCI_0_2_FN(n)                         (PSCI_0_2_FN_BASE + (n))
+#define PSCI_0_2_64BIT                         0x40000000 /* added to the base to form the FN64 IDs */
+#define PSCI_0_2_FN64_BASE                     \
+                                       (PSCI_0_2_FN_BASE + PSCI_0_2_64BIT)
+#define PSCI_0_2_FN64(n)                       (PSCI_0_2_FN64_BASE + (n))
+
+#define PSCI_0_2_FN_PSCI_VERSION               PSCI_0_2_FN(0)
+#define PSCI_0_2_FN_CPU_SUSPEND                        PSCI_0_2_FN(1)
+#define PSCI_0_2_FN_CPU_OFF                    PSCI_0_2_FN(2)
+#define PSCI_0_2_FN_CPU_ON                     PSCI_0_2_FN(3)
+#define PSCI_0_2_FN_AFFINITY_INFO              PSCI_0_2_FN(4)
+#define PSCI_0_2_FN_MIGRATE                    PSCI_0_2_FN(5)
+#define PSCI_0_2_FN_MIGRATE_INFO_TYPE          PSCI_0_2_FN(6)
+#define PSCI_0_2_FN_MIGRATE_INFO_UP_CPU                PSCI_0_2_FN(7)
+#define PSCI_0_2_FN_SYSTEM_OFF                 PSCI_0_2_FN(8)
+#define PSCI_0_2_FN_SYSTEM_RESET               PSCI_0_2_FN(9)
+
+#define PSCI_0_2_FN64_CPU_SUSPEND              PSCI_0_2_FN64(1)
+#define PSCI_0_2_FN64_CPU_ON                   PSCI_0_2_FN64(3)
+#define PSCI_0_2_FN64_AFFINITY_INFO            PSCI_0_2_FN64(4)
+#define PSCI_0_2_FN64_MIGRATE                  PSCI_0_2_FN64(5)
+#define PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU      PSCI_0_2_FN64(7)
+
+/* PSCI v0.2 power state encoding for CPU_SUSPEND function */
+#define PSCI_0_2_POWER_STATE_ID_MASK           0xffff
+#define PSCI_0_2_POWER_STATE_ID_SHIFT          0
+#define PSCI_0_2_POWER_STATE_TYPE_SHIFT                16
+#define PSCI_0_2_POWER_STATE_TYPE_MASK         \
+                               (0x1 << PSCI_0_2_POWER_STATE_TYPE_SHIFT)
+#define PSCI_0_2_POWER_STATE_AFFL_SHIFT                24
+#define PSCI_0_2_POWER_STATE_AFFL_MASK         \
+                               (0x3 << PSCI_0_2_POWER_STATE_AFFL_SHIFT)
+
+/* PSCI v0.2 affinity level state returned by AFFINITY_INFO */
+#define PSCI_0_2_AFFINITY_LEVEL_ON             0
+#define PSCI_0_2_AFFINITY_LEVEL_OFF            1
+#define PSCI_0_2_AFFINITY_LEVEL_ON_PENDING     2
+
+/* PSCI v0.2 multicore support in Trusted OS returned by MIGRATE_INFO_TYPE */
+#define PSCI_0_2_TOS_UP_MIGRATE                        0
+#define PSCI_0_2_TOS_UP_NO_MIGRATE             1
+#define PSCI_0_2_TOS_MP                                2
+
+/* PSCI version decoding (independent of PSCI version): minor = low 16 bits, major = the bits above */
+#define PSCI_VERSION_MAJOR_SHIFT               16 /* bit position where the major field starts */
+#define PSCI_VERSION_MINOR_MASK                        \
+               ((1U << PSCI_VERSION_MAJOR_SHIFT) - 1)
+#define PSCI_VERSION_MAJOR_MASK                        ~PSCI_VERSION_MINOR_MASK /* complement of the minor mask */
+#define PSCI_VERSION_MAJOR(ver)                        \
+               (((ver) & PSCI_VERSION_MAJOR_MASK) >> PSCI_VERSION_MAJOR_SHIFT) /* extract the major field */
+#define PSCI_VERSION_MINOR(ver)                        \
+               ((ver) & PSCI_VERSION_MINOR_MASK) /* extract the minor field */
+
+/* PSCI return values (inclusive of all PSCI versions) */
+#define PSCI_RET_SUCCESS                       0
+#define PSCI_RET_NOT_SUPPORTED                 -1
+#define PSCI_RET_INVALID_PARAMS                        -2
+#define PSCI_RET_DENIED                                -3
+#define PSCI_RET_ALREADY_ON                    -4
+#define PSCI_RET_ON_PENDING                    -5
+#define PSCI_RET_INTERNAL_FAILURE              -6
+#define PSCI_RET_NOT_PRESENT                   -7
+#define PSCI_RET_DISABLED                      -8
+
+#endif /* _UAPI_LINUX_PSCI_H */
index 06e6401d6ef45326edcbce4c8ff96e13286d2940..d6a3d0993d8812c8527274d01e8c08ce942746a4 100644 (file)
@@ -80,12 +80,10 @@ static void async_pf_execute(struct work_struct *work)
 
        might_sleep();
 
-       use_mm(mm);
        down_read(&mm->mmap_sem);
-       get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL);
+       get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
        up_read(&mm->mmap_sem);
        kvm_async_page_present_sync(vcpu, apf);
-       unuse_mm(mm);
 
        spin_lock(&vcpu->async_pf.lock);
        list_add_tail(&apf->link, &vcpu->async_pf.done);
index 29c2a04e036e8a30613133b11510eee9232f5213..20c3af7692c5d32d6de0c7a88a120a98b14d252e 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/list.h>
 #include <linux/eventfd.h>
 #include <linux/kernel.h>
+#include <linux/srcu.h>
 #include <linux/slab.h>
 
 #include "iodev.h"
@@ -118,19 +119,22 @@ static void
 irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 {
        struct _irqfd_resampler *resampler;
+       struct kvm *kvm;
        struct _irqfd *irqfd;
+       int idx;
 
        resampler = container_of(kian, struct _irqfd_resampler, notifier);
+       kvm = resampler->kvm;
 
-       kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+       kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                    resampler->notifier.gsi, 0, false);
 
-       rcu_read_lock();
+       idx = srcu_read_lock(&kvm->irq_srcu);
 
        list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
                eventfd_signal(irqfd->resamplefd, 1);
 
-       rcu_read_unlock();
+       srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
 static void
@@ -142,7 +146,7 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
        mutex_lock(&kvm->irqfds.resampler_lock);
 
        list_del_rcu(&irqfd->resampler_link);
-       synchronize_rcu();
+       synchronize_srcu(&kvm->irq_srcu);
 
        if (list_empty(&resampler->list)) {
                list_del(&resampler->link);
@@ -221,17 +225,18 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
        unsigned long flags = (unsigned long)key;
        struct kvm_kernel_irq_routing_entry *irq;
        struct kvm *kvm = irqfd->kvm;
+       int idx;
 
        if (flags & POLLIN) {
-               rcu_read_lock();
-               irq = rcu_dereference(irqfd->irq_entry);
+               idx = srcu_read_lock(&kvm->irq_srcu);
+               irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu);
                /* An event has been signaled, inject an interrupt */
                if (irq)
                        kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
                                        false);
                else
                        schedule_work(&irqfd->inject);
-               rcu_read_unlock();
+               srcu_read_unlock(&kvm->irq_srcu, idx);
        }
 
        if (flags & POLLHUP) {
@@ -363,7 +368,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
                }
 
                list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
-               synchronize_rcu();
+               synchronize_srcu(&kvm->irq_srcu);
 
                mutex_unlock(&kvm->irqfds.resampler_lock);
        }
@@ -465,7 +470,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
                         * another thread calls kvm_irq_routing_update before
                         * we flush workqueue below (we synchronize with
                         * kvm_irq_routing_update using irqfds.lock).
-                        * It is paired with synchronize_rcu done by caller
+                        * It is paired with synchronize_srcu done by caller
                         * of that function.
                         */
                        rcu_assign_pointer(irqfd->irq_entry, NULL);
@@ -524,7 +529,7 @@ kvm_irqfd_release(struct kvm *kvm)
 
 /*
  * Change irq_routing and irqfd.
- * Caller must invoke synchronize_rcu afterwards.
+ * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
  */
 void kvm_irq_routing_update(struct kvm *kvm,
                            struct kvm_irq_routing_table *irq_rt)
@@ -600,7 +605,15 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 {
        u64 _val;
 
-       if (!(addr == p->addr && len == p->length))
+       if (addr != p->addr)
+               /* address must be precise for a hit */
+               return false;
+
+       if (!p->length)
+               /* length = 0 means only look at the address, so always a hit */
+               return true;
+
+       if (len != p->length)
                /* address-range must be precise for a hit */
                return false;
 
@@ -671,9 +684,11 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
 
        list_for_each_entry(_p, &kvm->ioeventfds, list)
                if (_p->bus_idx == p->bus_idx &&
-                   _p->addr == p->addr && _p->length == p->length &&
-                   (_p->wildcard || p->wildcard ||
-                    _p->datamatch == p->datamatch))
+                   _p->addr == p->addr &&
+                   (!_p->length || !p->length ||
+                    (_p->length == p->length &&
+                     (_p->wildcard || p->wildcard ||
+                      _p->datamatch == p->datamatch))))
                        return true;
 
        return false;
@@ -697,8 +712,9 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
        int                       ret;
 
        bus_idx = ioeventfd_bus_from_flags(args->flags);
-       /* must be natural-word sized */
+       /* must be natural-word sized, or 0 to ignore length */
        switch (args->len) {
+       case 0:
        case 1:
        case 2:
        case 4:
@@ -716,6 +732,12 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
        if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
                return -EINVAL;
 
+       /* ioeventfd with no length can't be combined with PIO or DATAMATCH */
+       if (!args->len &&
+           args->flags & (KVM_IOEVENTFD_FLAG_PIO |
+                          KVM_IOEVENTFD_FLAG_DATAMATCH))
+               return -EINVAL;
+
        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);
@@ -753,6 +775,16 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
        if (ret < 0)
                goto unlock_fail;
 
+       /* When length is ignored, MMIO is also put on a separate bus, for
+        * faster lookups.
+        */
+       if (!args->len && !(args->flags & KVM_IOEVENTFD_FLAG_PIO)) {
+               ret = kvm_io_bus_register_dev(kvm, KVM_FAST_MMIO_BUS,
+                                             p->addr, 0, &p->dev);
+               if (ret < 0)
+                       goto register_fail;
+       }
+
        kvm->buses[bus_idx]->ioeventfd_count++;
        list_add_tail(&p->list, &kvm->ioeventfds);
 
@@ -760,6 +792,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
        return 0;
 
+register_fail:
+       kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
 unlock_fail:
        mutex_unlock(&kvm->slots_lock);
 
@@ -799,6 +833,10 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
                        continue;
 
                kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
+               if (!p->length) {
+                       kvm_io_bus_unregister_dev(kvm, KVM_FAST_MMIO_BUS,
+                                                 &p->dev);
+               }
                kvm->buses[bus_idx]->ioeventfd_count--;
                ioeventfd_release(p);
                ret = 0;
index e2e6b4473a96fafc98dc85aa6c2e8fa49fcd1c09..ced4a542a031313cefd0915bbfa3943d3433c240 100644 (file)
@@ -163,6 +163,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
        struct kvm_kernel_irq_routing_entry *e;
        int ret = -EINVAL;
        struct kvm_irq_routing_table *irq_rt;
+       int idx;
 
        trace_kvm_set_irq(irq, level, irq_source_id);
 
@@ -174,8 +175,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
         * Since there's no easy way to do this, we only support injecting MSI
         * which is limited to 1:1 GSI mapping.
         */
-       rcu_read_lock();
-       irq_rt = rcu_dereference(kvm->irq_routing);
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
        if (irq < irq_rt->nr_rt_entries)
                hlist_for_each_entry(e, &irq_rt->map[irq], link) {
                        if (likely(e->type == KVM_IRQ_ROUTING_MSI))
@@ -184,7 +185,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
                                ret = -EWOULDBLOCK;
                        break;
                }
-       rcu_read_unlock();
+       srcu_read_unlock(&kvm->irq_srcu, idx);
        return ret;
 }
 
@@ -253,22 +254,22 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
        mutex_lock(&kvm->irq_lock);
        hlist_del_rcu(&kimn->link);
        mutex_unlock(&kvm->irq_lock);
-       synchronize_rcu();
+       synchronize_srcu(&kvm->irq_srcu);
 }
 
 void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
                             bool mask)
 {
        struct kvm_irq_mask_notifier *kimn;
-       int gsi;
+       int idx, gsi;
 
-       rcu_read_lock();
-       gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
        if (gsi != -1)
                hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link)
                        if (kimn->irq == gsi)
                                kimn->func(kimn, mask);
-       rcu_read_unlock();
+       srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
 int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
index 20dc9e4a8f6cee7b6d86a6decc92410715c9264c..b43c275775cd5a1d9e5bbc7f842ceaa81f2a0c69 100644 (file)
@@ -26,6 +26,7 @@
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
+#include <linux/srcu.h>
 #include <linux/export.h>
 #include <trace/events/kvm.h>
 #include "irq.h"
 bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
        struct kvm_irq_ack_notifier *kian;
-       int gsi;
+       int gsi, idx;
 
-       rcu_read_lock();
-       gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
        if (gsi != -1)
                hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
                                         link)
                        if (kian->gsi == gsi) {
-                               rcu_read_unlock();
+                               srcu_read_unlock(&kvm->irq_srcu, idx);
                                return true;
                        }
 
-       rcu_read_unlock();
+       srcu_read_unlock(&kvm->irq_srcu, idx);
 
        return false;
 }
@@ -54,18 +55,18 @@ EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
        struct kvm_irq_ack_notifier *kian;
-       int gsi;
+       int gsi, idx;
 
        trace_kvm_ack_irq(irqchip, pin);
 
-       rcu_read_lock();
-       gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
        if (gsi != -1)
                hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
                                         link)
                        if (kian->gsi == gsi)
                                kian->irq_acked(kian);
-       rcu_read_unlock();
+       srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
@@ -85,7 +86,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
        mutex_lock(&kvm->irq_lock);
        hlist_del_init_rcu(&kian->link);
        mutex_unlock(&kvm->irq_lock);
-       synchronize_rcu();
+       synchronize_srcu(&kvm->irq_srcu);
 #ifdef __KVM_HAVE_IOAPIC
        kvm_vcpu_request_scan_ioapic(kvm);
 #endif
@@ -115,7 +116,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
                bool line_status)
 {
        struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
-       int ret = -1, i = 0;
+       int ret = -1, i = 0, idx;
        struct kvm_irq_routing_table *irq_rt;
 
        trace_kvm_set_irq(irq, level, irq_source_id);
@@ -124,12 +125,12 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
         * IOAPIC.  So set the bit in both. The guest will ignore
         * writes to the unused one.
         */
-       rcu_read_lock();
-       irq_rt = rcu_dereference(kvm->irq_routing);
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
        if (irq < irq_rt->nr_rt_entries)
                hlist_for_each_entry(e, &irq_rt->map[irq], link)
                        irq_set[i++] = *e;
-       rcu_read_unlock();
+       srcu_read_unlock(&kvm->irq_srcu, idx);
 
        while(i--) {
                int r;
@@ -226,7 +227,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
        kvm_irq_routing_update(kvm, new);
        mutex_unlock(&kvm->irq_lock);
 
-       synchronize_rcu();
+       synchronize_srcu_expedited(&kvm->irq_srcu);
 
        new = old;
        r = 0;
index 56baae8c2f56baf0f41bb7bb24b0b12679267d20..c86be0f983db706c81cfe60aedfb77f4a315e20f 100644 (file)
@@ -186,9 +186,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
+       long dirty_count = kvm->tlbs_dirty;
+
+       smp_mb();
        if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
-       kvm->tlbs_dirty = false;
+       cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 
@@ -454,11 +457,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
 
        r = kvm_arch_init_vm(kvm, type);
        if (r)
-               goto out_err_nodisable;
+               goto out_err_no_disable;
 
        r = hardware_enable_all();
        if (r)
-               goto out_err_nodisable;
+               goto out_err_no_disable;
 
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
        INIT_HLIST_HEAD(&kvm->mask_notifier_list);
@@ -470,10 +473,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
        r = -ENOMEM;
        kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
        if (!kvm->memslots)
-               goto out_err_nosrcu;
+               goto out_err_no_srcu;
        kvm_init_memslots_id(kvm);
        if (init_srcu_struct(&kvm->srcu))
-               goto out_err_nosrcu;
+               goto out_err_no_srcu;
+       if (init_srcu_struct(&kvm->irq_srcu))
+               goto out_err_no_irq_srcu;
        for (i = 0; i < KVM_NR_BUSES; i++) {
                kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
                                        GFP_KERNEL);
@@ -502,10 +507,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
        return kvm;
 
 out_err:
+       cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
        cleanup_srcu_struct(&kvm->srcu);
-out_err_nosrcu:
+out_err_no_srcu:
        hardware_disable_all();
-out_err_nodisable:
+out_err_no_disable:
        for (i = 0; i < KVM_NR_BUSES; i++)
                kfree(kvm->buses[i]);
        kfree(kvm->memslots);
@@ -601,6 +608,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_arch_destroy_vm(kvm);
        kvm_destroy_devices(kvm);
        kvm_free_physmem(kvm);
+       cleanup_srcu_struct(&kvm->irq_srcu);
        cleanup_srcu_struct(&kvm->srcu);
        kvm_arch_free_vm(kvm);
        hardware_disable_all();
@@ -637,14 +645,12 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
  */
 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
-#ifndef CONFIG_S390
        unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 
        memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
        if (!memslot->dirty_bitmap)
                return -ENOMEM;
 
-#endif /* !CONFIG_S390 */
        return 0;
 }
 
@@ -2922,6 +2928,7 @@ static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
 
        return -EOPNOTSUPP;
 }
+EXPORT_SYMBOL_GPL(kvm_io_bus_write);
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,