KVM: Allow host IRQ sharing for assigned PCI 2.3 devices

author Jan Kiszka <jan.kiszka@siemens.com>

Tue, 28 Feb 2012 13:19:54 +0000 (14:19 +0100)

committer Avi Kivity <avi@redhat.com>

Thu, 8 Mar 2012 12:11:36 +0000 (14:11 +0200)
author Jan Kiszka <jan.kiszka@siemens.com>
Tue, 28 Feb 2012 13:19:54 +0000 (14:19 +0100)
committer Avi Kivity <avi@redhat.com>
Thu, 8 Mar 2012 12:11:36 +0000 (14:11 +0200)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt

index 59a38264a0edab5da7942e6aa6b6297c7efbb891..6386f8c0482eaa578f68dfe850e559691e9de95d 100644 (file)
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1169,6 +1169,14 @@ following flags are specified:
  
  /* Depends on KVM_CAP_IOMMU */
  #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
+/* The following two depend on KVM_CAP_PCI_2_3 */
+#define KVM_DEV_ASSIGN_PCI_2_3         (1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX       (1 << 2)
+
+If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
+via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
+assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
+guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
  
  The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
  isolation of the device.  Usages not specifying this flag are deprecated.
@@ -1441,6 +1449,39 @@ The "num_dirty" field is a performance hint for KVM to determine whether it
  should skip processing the bitmap and just invalidate everything.  It must
  be set to the number of set bits in the bitmap.
  
+4.60 KVM_ASSIGN_SET_INTX_MASK
+
+Capability: KVM_CAP_PCI_2_3
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Allows userspace to mask PCI INTx interrupts from the assigned device.  The
+kernel will not deliver INTx interrupts to the guest between setting and
+clearing of KVM_ASSIGN_SET_INTX_MASK via this interface.  This enables use of
+and emulation of PCI 2.3 INTx disable command register behavior.
+
+This may be used for both PCI 2.3 devices supporting INTx disable natively and
+older devices lacking this support. Userspace is responsible for emulating the
+read value of the INTx disable bit in the guest visible PCI command register.
+When modifying the INTx disable state, userspace should precede updating the
+physical device command register by calling this ioctl to inform the kernel of
+the new intended INTx mask state.
+
+Note that the kernel uses the device INTx disable bit to internally manage the
+device interrupt state for PCI 2.3 devices.  Reads of this register may
+therefore not match the expected value.  Writes should always use the guest
+intended INTx disable value rather than attempting to read-copy-update the
+current physical device state.  Races between user and kernel updates to the
+INTx disable bit are handled lazily in the kernel.  It's possible the device
+may generate unintended interrupts, but they will not be injected into the
+guest.
+
+See KVM_ASSIGN_DEV_IRQ for the data structure.  The target device is specified
+by assigned_dev_id.  In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
+evaluated.
+
  4.62 KVM_CREATE_SPAPR_TCE
  
  Capability: KVM_CAP_SPAPR_TCE
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 9477dc6cccaed51790eee82bd1f8de9f2cbf70cc..6866083a48c120fb1c93e5910614b8f17876a898 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2143,6 +2143,7 @@ int kvm_dev_ioctl_check_extension(long ext)
         case KVM_CAP_XSAVE:
         case KVM_CAP_ASYNC_PF:
         case KVM_CAP_GET_TSC_KHZ:
+       case KVM_CAP_PCI_2_3:
                 r = 1;
                 break;
         case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h

index acbe42939089224d06b0307f7a7c41d6bd9d1ada..6c322a90b92f8439886a954b37d58f62f29abb43 100644 (file)
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -588,6 +588,7 @@ struct kvm_ppc_pvinfo {
  #define KVM_CAP_TSC_DEADLINE_TIMER 72
  #define KVM_CAP_S390_UCONTROL 73
  #define KVM_CAP_SYNC_REGS 74
+#define KVM_CAP_PCI_2_3 75
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
@@ -784,6 +785,9 @@ struct kvm_s390_ucas_mapping {
  /* Available with KVM_CAP_TSC_CONTROL */
  #define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
  #define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
+/* Available with KVM_CAP_PCI_2_3 */
+#define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
+                                      struct kvm_assigned_pci_dev)
  
  /*
   * ioctls for vcpu fds
@@ -857,6 +861,8 @@ struct kvm_s390_ucas_mapping {
  #define KVM_SET_ONE_REG                  _IOW(KVMIO,  0xac, struct kvm_one_reg)
  
  #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
+#define KVM_DEV_ASSIGN_PCI_2_3         (1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX       (1 << 2)
  
  struct kvm_assigned_pci_dev {
         __u32 assigned_dev_id;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

index e42d85ae854101213827bccc44e73d1331d5426c..ec171c1d0878ef292c7736890d6c382a84e8a5b3 100644 (file)
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel {
         unsigned int entries_nr;
         int host_irq;
         bool host_irq_disabled;
+       bool pci_2_3;
         struct msix_entry *host_msix_entries;
         int guest_irq;
         struct msix_entry *guest_msix_entries;
@@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel {
         struct pci_dev *dev;
         struct kvm *kvm;
         spinlock_t intx_lock;
+       struct mutex intx_mask_lock;
         char irq_name[32];
         struct pci_saved_state *pci_saved_state;
  };
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c

index ece80612b594f5cfb2d83de3b720426b05a9204b..08e05715df72ece1c91f3179ffc233899d7f1d00 100644 (file)
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -55,22 +55,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
         return index;
  }
  
-static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
  {
         struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+       int ret;
  
-       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
-               spin_lock(&assigned_dev->intx_lock);
+       spin_lock(&assigned_dev->intx_lock);
+       if (pci_check_and_mask_intx(assigned_dev->dev)) {
+               assigned_dev->host_irq_disabled = true;
+               ret = IRQ_WAKE_THREAD;
+       } else
+               ret = IRQ_NONE;
+       spin_unlock(&assigned_dev->intx_lock);
+
+       return ret;
+}
+
+static void
+kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
+                                int vector)
+{
+       if (unlikely(assigned_dev->irq_requested_type &
+                    KVM_DEV_IRQ_GUEST_INTX)) {
+               mutex_lock(&assigned_dev->intx_mask_lock);
+               if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
+                       kvm_set_irq(assigned_dev->kvm,
+                                   assigned_dev->irq_source_id, vector, 1);
+               mutex_unlock(&assigned_dev->intx_mask_lock);
+       } else
+               kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+                           vector, 1);
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
+{
+       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+       if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+               spin_lock_irq(&assigned_dev->intx_lock);
                 disable_irq_nosync(irq);
                 assigned_dev->host_irq_disabled = true;
-               spin_unlock(&assigned_dev->intx_lock);
+               spin_unlock_irq(&assigned_dev->intx_lock);
         }
  
-       kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-                   assigned_dev->guest_irq, 1);
+       kvm_assigned_dev_raise_guest_irq(assigned_dev,
+                                        assigned_dev->guest_irq);
+
+       return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+       kvm_assigned_dev_raise_guest_irq(assigned_dev,
+                                        assigned_dev->guest_irq);
  
         return IRQ_HANDLED;
  }
+#endif
  
  #ifdef __KVM_HAVE_MSIX
  static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
@@ -81,8 +125,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
  
         if (index >= 0) {
                 vector = assigned_dev->guest_msix_entries[index].vector;
-               kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-                           vector, 1);
+               kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
         }
  
         return IRQ_HANDLED;
@@ -98,15 +141,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
  
         kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
  
-       /* The guest irq may be shared so this ack may be
-        * from another device.
-        */
-       spin_lock(&dev->intx_lock);
-       if (dev->host_irq_disabled) {
-               enable_irq(dev->host_irq);
-               dev->host_irq_disabled = false;
+       mutex_lock(&dev->intx_mask_lock);
+
+       if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+               bool reassert = false;
+
+               spin_lock_irq(&dev->intx_lock);
+               /*
+                * The guest IRQ may be shared so this ack can come from an
+                * IRQ for another guest device.
+                */
+               if (dev->host_irq_disabled) {
+                       if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
+                               enable_irq(dev->host_irq);
+                       else if (!pci_check_and_unmask_intx(dev->dev))
+                               reassert = true;
+                       dev->host_irq_disabled = reassert;
+               }
+               spin_unlock_irq(&dev->intx_lock);
+
+               if (reassert)
+                       kvm_set_irq(dev->kvm, dev->irq_source_id,
+                                   dev->guest_irq, 1);
         }
-       spin_unlock(&dev->intx_lock);
+
+       mutex_unlock(&dev->intx_mask_lock);
  }
  
  static void deassign_guest_irq(struct kvm *kvm,
@@ -154,7 +213,15 @@ static void deassign_host_irq(struct kvm *kvm,
                 pci_disable_msix(assigned_dev->dev);
         } else {
                 /* Deal with MSI and INTx */
-               disable_irq(assigned_dev->host_irq);
+               if ((assigned_dev->irq_requested_type &
+                    KVM_DEV_IRQ_HOST_INTX) &&
+                   (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+                       spin_lock_irq(&assigned_dev->intx_lock);
+                       pci_intx(assigned_dev->dev, false);
+                       spin_unlock_irq(&assigned_dev->intx_lock);
+                       synchronize_irq(assigned_dev->host_irq);
+               } else
+                       disable_irq(assigned_dev->host_irq);
  
                 free_irq(assigned_dev->host_irq, assigned_dev);
  
@@ -235,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
  static int assigned_device_enable_host_intx(struct kvm *kvm,
                                             struct kvm_assigned_dev_kernel *dev)
  {
+       irq_handler_t irq_handler;
+       unsigned long flags;
+
         dev->host_irq = dev->dev->irq;
-       /* Even though this is PCI, we don't want to use shared
-        * interrupts. Sharing host devices with guest-assigned devices
-        * on the same interrupt line is not a happy situation: there
-        * are going to be long delays in accepting, acking, etc.
+
+       /*
+        * We can only share the IRQ line with other host devices if we are
+        * able to disable the IRQ source at device-level - independently of
+        * the guest driver. Otherwise host devices may suffer from unbounded
+        * IRQ latencies when the guest keeps the line asserted.
          */
-       if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-                                IRQF_ONESHOT, dev->irq_name, dev))
+       if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+               irq_handler = kvm_assigned_dev_intx;
+               flags = IRQF_SHARED;
+       } else {
+               irq_handler = NULL;
+               flags = IRQF_ONESHOT;
+       }
+       if (request_threaded_irq(dev->host_irq, irq_handler,
+                                kvm_assigned_dev_thread_intx, flags,
+                                dev->irq_name, dev))
                 return -EIO;
+
+       if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+               spin_lock_irq(&dev->intx_lock);
+               pci_intx(dev->dev, true);
+               spin_unlock_irq(&dev->intx_lock);
+       }
         return 0;
  }
  
@@ -260,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
         }
  
         dev->host_irq = dev->dev->irq;
-       if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-                                0, dev->irq_name, dev)) {
+       if (request_threaded_irq(dev->host_irq, NULL,
+                                kvm_assigned_dev_thread_msi, 0,
+                                dev->irq_name, dev)) {
                 pci_disable_msi(dev->dev);
                 return -EIO;
         }
@@ -319,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
  {
         dev->guest_irq = irq->guest_irq;
         dev->ack_notifier.gsi = -1;
-       dev->host_irq_disabled = false;
         return 0;
  }
  #endif
@@ -331,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
  {
         dev->guest_irq = irq->guest_irq;
         dev->ack_notifier.gsi = -1;
-       dev->host_irq_disabled = false;
         return 0;
  }
  #endif
@@ -365,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
         default:
                 r = -EINVAL;
         }
+       dev->host_irq_disabled = false;
  
         if (!r)
                 dev->irq_requested_type |= host_irq_type;
@@ -466,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
  {
         int r = -ENODEV;
         struct kvm_assigned_dev_kernel *match;
+       unsigned long irq_type;
  
         mutex_lock(&kvm->lock);
  
@@ -474,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
         if (!match)
                 goto out;
  
-       r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+       irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+                                         KVM_DEV_IRQ_GUEST_MASK);
+       r = kvm_deassign_irq(kvm, match, irq_type);
  out:
         mutex_unlock(&kvm->lock);
         return r;
@@ -607,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
         if (!match->pci_saved_state)
                 printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
                        __func__, dev_name(&dev->dev));
+
+       if (!pci_intx_mask_supported(dev))
+               assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
         match->assigned_dev_id = assigned_dev->assigned_dev_id;
         match->host_segnr = assigned_dev->segnr;
         match->host_busnr = assigned_dev->busnr;
@@ -614,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
         match->flags = assigned_dev->flags;
         match->dev = dev;
         spin_lock_init(&match->intx_lock);
+       mutex_init(&match->intx_mask_lock);
         match->irq_source_id = -1;
         match->kvm = kvm;
         match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@@ -759,6 +853,55 @@ msix_entry_out:
  }
  #endif
  
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+               struct kvm_assigned_pci_dev *assigned_dev)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *match;
+
+       mutex_lock(&kvm->lock);
+
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_dev->assigned_dev_id);
+       if (!match) {
+               r = -ENODEV;
+               goto out;
+       }
+
+       mutex_lock(&match->intx_mask_lock);
+
+       match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+       match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+       if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+               if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+                       kvm_set_irq(match->kvm, match->irq_source_id,
+                                   match->guest_irq, 0);
+                       /*
+                        * Masking at hardware-level is performed on demand,
+                        * i.e. when an IRQ actually arrives at the host.
+                        */
+               } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+                       /*
+                        * Unmask the IRQ line if required. Unmasking at
+                        * device level will be performed by user space.
+                        */
+                       spin_lock_irq(&match->intx_lock);
+                       if (match->host_irq_disabled) {
+                               enable_irq(match->host_irq);
+                               match->host_irq_disabled = false;
+                       }
+                       spin_unlock_irq(&match->intx_lock);
+               }
+       }
+
+       mutex_unlock(&match->intx_mask_lock);
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
  long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                                   unsigned long arg)
  {
@@ -866,6 +1009,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                 break;
         }
  #endif
+       case KVM_ASSIGN_SET_INTX_MASK: {
+               struct kvm_assigned_pci_dev assigned_dev;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+                       goto out;
+               r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+               break;
+       }
         default:
                 r = -ENOTTY;
                 break;
@@ -873,4 +1025,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
  out:
         return r;
  }
-
author	Jan Kiszka <jan.kiszka@siemens.com>
	Tue, 28 Feb 2012 13:19:54 +0000 (14:19 +0100)
committer	Avi Kivity <avi@redhat.com>
	Thu, 8 Mar 2012 12:11:36 +0000 (14:11 +0200)
Documentation/virtual/kvm/api.txt		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history
include/linux/kvm.h		patch \| blob \| history
include/linux/kvm_host.h		patch \| blob \| history
virt/kvm/assigned-dev.c		patch \| blob \| history