/* Depends on KVM_CAP_IOMMU */
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
+/* The following two depend on KVM_CAP_PCI_2_3 */
+#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
+
+If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
+via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
+assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
+guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
isolation of the device. Usages not specifying this flag are deprecated.
should skip processing the bitmap and just invalidate everything. It must
be set to the number of set bits in the bitmap.
+4.60 KVM_ASSIGN_SET_INTX_MASK
+
+Capability: KVM_CAP_PCI_2_3
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Allows userspace to mask PCI INTx interrupts from the assigned device. The
+kernel will not deliver INTx interrupts to the guest between setting and
+clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of
+and emulation of PCI 2.3 INTx disable command register behavior.
+
+This may be used for both PCI 2.3 devices supporting INTx disable natively and
+older devices lacking this support. Userspace is responsible for emulating the
+read value of the INTx disable bit in the guest visible PCI command register.
+When modifying the INTx disable state, userspace should precede updating the
+physical device command register by calling this ioctl to inform the kernel of
+the new intended INTx mask state.
+
+Note that the kernel uses the device INTx disable bit to internally manage the
+device interrupt state for PCI 2.3 devices. Reads of this register may
+therefore not match the expected value. Writes should always use the guest
+intended INTx disable value rather than attempting to read-copy-update the
+current physical device state. Races between user and kernel updates to the
+INTx disable bit are handled lazily in the kernel. It's possible the device
+may generate unintended interrupts, but they will not be injected into the
+guest.
+
+See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
+by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
+evaluated.
+
4.62 KVM_CREATE_SPAPR_TCE
Capability: KVM_CAP_SPAPR_TCE
return index;
}
-static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int ret;
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
- spin_lock(&assigned_dev->intx_lock);
+ spin_lock(&assigned_dev->intx_lock);
+ if (pci_check_and_mask_intx(assigned_dev->dev)) {
+ assigned_dev->host_irq_disabled = true;
+ ret = IRQ_WAKE_THREAD;
+ } else
+ ret = IRQ_NONE;
+ spin_unlock(&assigned_dev->intx_lock);
+
+ return ret;
+}
+
+static void
+kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
+ int vector)
+{
+ if (unlikely(assigned_dev->irq_requested_type &
+ KVM_DEV_IRQ_GUEST_INTX)) {
+ mutex_lock(&assigned_dev->intx_mask_lock);
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
+ kvm_set_irq(assigned_dev->kvm,
+ assigned_dev->irq_source_id, vector, 1);
+ mutex_unlock(&assigned_dev->intx_mask_lock);
+ } else
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ vector, 1);
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
disable_irq_nosync(irq);
assigned_dev->host_irq_disabled = true;
- spin_unlock(&assigned_dev->intx_lock);
+ spin_unlock_irq(&assigned_dev->intx_lock);
}
- kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
- assigned_dev->guest_irq, 1);
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
+
+ return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
return IRQ_HANDLED;
}
+#endif
#ifdef __KVM_HAVE_MSIX
static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
if (index >= 0) {
vector = assigned_dev->guest_msix_entries[index].vector;
- kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
- vector, 1);
+ kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
}
return IRQ_HANDLED;
kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
- /* The guest irq may be shared so this ack may be
- * from another device.
- */
- spin_lock(&dev->intx_lock);
- if (dev->host_irq_disabled) {
- enable_irq(dev->host_irq);
- dev->host_irq_disabled = false;
+ mutex_lock(&dev->intx_mask_lock);
+
+ if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+ bool reassert = false;
+
+ spin_lock_irq(&dev->intx_lock);
+ /*
+ * The guest IRQ may be shared so this ack can come from an
+ * IRQ for another guest device.
+ */
+ if (dev->host_irq_disabled) {
+ if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
+ enable_irq(dev->host_irq);
+ else if (!pci_check_and_unmask_intx(dev->dev))
+ reassert = true;
+ dev->host_irq_disabled = reassert;
+ }
+ spin_unlock_irq(&dev->intx_lock);
+
+ if (reassert)
+ kvm_set_irq(dev->kvm, dev->irq_source_id,
+ dev->guest_irq, 1);
}
- spin_unlock(&dev->intx_lock);
+
+ mutex_unlock(&dev->intx_mask_lock);
}
static void deassign_guest_irq(struct kvm *kvm,
pci_disable_msix(assigned_dev->dev);
} else {
/* Deal with MSI and INTx */
- disable_irq(assigned_dev->host_irq);
+ if ((assigned_dev->irq_requested_type &
+ KVM_DEV_IRQ_HOST_INTX) &&
+ (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ pci_intx(assigned_dev->dev, false);
+ spin_unlock_irq(&assigned_dev->intx_lock);
+ synchronize_irq(assigned_dev->host_irq);
+ } else
+ disable_irq(assigned_dev->host_irq);
free_irq(assigned_dev->host_irq, assigned_dev);
static int assigned_device_enable_host_intx(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
+ irq_handler_t irq_handler;
+ unsigned long flags;
+
dev->host_irq = dev->dev->irq;
- /* Even though this is PCI, we don't want to use shared
- * interrupts. Sharing host devices with guest-assigned devices
- * on the same interrupt line is not a happy situation: there
- * are going to be long delays in accepting, acking, etc.
+
+ /*
+ * We can only share the IRQ line with other host devices if we are
+ * able to disable the IRQ source at device-level - independently of
+ * the guest driver. Otherwise host devices may suffer from unbounded
+ * IRQ latencies when the guest keeps the line asserted.
*/
- if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- IRQF_ONESHOT, dev->irq_name, dev))
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ irq_handler = kvm_assigned_dev_intx;
+ flags = IRQF_SHARED;
+ } else {
+ irq_handler = NULL;
+ flags = IRQF_ONESHOT;
+ }
+ if (request_threaded_irq(dev->host_irq, irq_handler,
+ kvm_assigned_dev_thread_intx, flags,
+ dev->irq_name, dev))
return -EIO;
+
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ spin_lock_irq(&dev->intx_lock);
+ pci_intx(dev->dev, true);
+ spin_unlock_irq(&dev->intx_lock);
+ }
return 0;
}
}
dev->host_irq = dev->dev->irq;
- if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- 0, dev->irq_name, dev)) {
+ if (request_threaded_irq(dev->host_irq, NULL,
+ kvm_assigned_dev_thread_msi, 0,
+ dev->irq_name, dev)) {
pci_disable_msi(dev->dev);
return -EIO;
}
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
return 0;
}
#endif
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
return 0;
}
#endif
default:
r = -EINVAL;
}
+ dev->host_irq_disabled = false;
if (!r)
dev->irq_requested_type |= host_irq_type;
{
int r = -ENODEV;
struct kvm_assigned_dev_kernel *match;
+ unsigned long irq_type;
mutex_lock(&kvm->lock);
if (!match)
goto out;
- r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+ irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+ KVM_DEV_IRQ_GUEST_MASK);
+ r = kvm_deassign_irq(kvm, match, irq_type);
out:
mutex_unlock(&kvm->lock);
return r;
if (!match->pci_saved_state)
printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
__func__, dev_name(&dev->dev));
+
+ if (!pci_intx_mask_supported(dev))
+ assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_segnr = assigned_dev->segnr;
match->host_busnr = assigned_dev->busnr;
match->flags = assigned_dev->flags;
match->dev = dev;
spin_lock_init(&match->intx_lock);
+ mutex_init(&match->intx_mask_lock);
match->irq_source_id = -1;
match->kvm = kvm;
match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
}
#endif
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ r = -ENODEV;
+ goto out;
+ }
+
+ mutex_lock(&match->intx_mask_lock);
+
+ match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+ match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+ if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+ kvm_set_irq(match->kvm, match->irq_source_id,
+ match->guest_irq, 0);
+ /*
+ * Masking at hardware-level is performed on demand,
+ * i.e. when an IRQ actually arrives at the host.
+ */
+ } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ /*
+ * Unmask the IRQ line if required. Unmasking at
+ * device level will be performed by user space.
+ */
+ spin_lock_irq(&match->intx_lock);
+ if (match->host_irq_disabled) {
+ enable_irq(match->host_irq);
+ match->host_irq_disabled = false;
+ }
+ spin_unlock_irq(&match->intx_lock);
+ }
+ }
+
+ mutex_unlock(&match->intx_mask_lock);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
unsigned long arg)
{
break;
}
#endif
+ case KVM_ASSIGN_SET_INTX_MASK: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+ break;
+ }
default:
r = -ENOTTY;
break;
out:
return r;
}
-