4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
16 #include <linux/cdev.h>
17 #include <linux/compat.h>
18 #include <linux/device.h>
19 #include <linux/file.h>
20 #include <linux/anon_inodes.h>
22 #include <linux/idr.h>
23 #include <linux/iommu.h>
24 #include <linux/list.h>
25 #include <linux/miscdevice.h>
26 #include <linux/module.h>
27 #include <linux/mutex.h>
28 #include <linux/pci.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
38 #define DRIVER_VERSION "0.3"
39 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
40 #define DRIVER_DESC "VFIO - User Level meta-driver"
44 struct list_head iommu_drivers_list;
45 struct mutex iommu_drivers_lock;
46 struct list_head group_list;
48 struct mutex group_lock;
49 struct cdev group_cdev;
51 wait_queue_head_t release_q;
/* Registry entry wrapping the ops table of one registered IOMMU backend. */
54 struct vfio_iommu_driver {
55 const struct vfio_iommu_driver_ops *ops;
/* link in vfio.iommu_drivers_list */
56 struct list_head vfio_next;
/* Per-open state of the container file; allocated in vfio_fops_open(). */
59 struct vfio_container {
/* groups attached to this container, linked through vfio_group.container_next */
61 struct list_head group_list;
/* taken around accesses to group_list and to the iommu driver/data pair */
62 struct rw_semaphore group_lock;
/* backend stored by vfio_ioctl_set_iommu(); NULL while none is set */
63 struct vfio_iommu_driver *iommu_driver;
/*
 * Records a device that has been removed from its vfio group but whose
 * driver unbind has not completed yet (see vfio_del_group_dev()).
 */
68 struct vfio_unbound_dev {
/* link in vfio_group.unbound_list */
70 struct list_head unbound_next;
76 atomic_t container_users;
77 struct iommu_group *iommu_group;
78 struct vfio_container *container;
79 struct list_head device_list;
80 struct mutex device_lock;
82 struct notifier_block nb;
83 struct list_head vfio_next;
84 struct list_head container_next;
85 struct list_head unbound_list;
86 struct mutex unbound_lock;
94 const struct vfio_device_ops *ops;
95 struct vfio_group *group;
96 struct list_head group_next;
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Opt-in switch for no-IOMMU mode, exposed to userspace as the module
 * parameter "enable_unsafe_noiommu_support".  The description must be
 * registered under that same parameter name; with a different name modinfo
 * shows the text for a parameter that does not exist and the real parameter
 * stays undocumented.
 */
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_support,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_support, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif
108 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
109 * and remove functions, any use cases other than acquiring the first
110 * reference for the purpose of calling vfio_add_group_dev() or removing
111 * that symmetric reference after vfio_del_group_dev() should use the raw
112 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
113 * removes the device from the dummy group and cannot be nested.
115 struct iommu_group *vfio_iommu_group_get(struct device *dev)
117 struct iommu_group *group;
118 int __maybe_unused ret;
120 group = iommu_group_get(dev);
122 #ifdef CONFIG_VFIO_NOIOMMU
124 * With noiommu enabled, an IOMMU group will be created for a device
125 * that doesn't already have one and doesn't have an iommu_ops on their
126 * bus. We use iommu_present() again in the main code to detect these
129 if (group || !noiommu || iommu_present(dev->bus))
132 group = iommu_group_alloc();
136 iommu_group_set_name(group, "vfio-noiommu");
137 ret = iommu_group_add_device(group, dev);
138 iommu_group_put(group);
143 * Where to taint? At this point we've added an IOMMU group for a
144 * device that is not backed by iommu_ops, therefore any iommu_
145 * callback using iommu_ops can legitimately Oops. So, while we may
146 * be about to give a DMA capable device to a user without IOMMU
147 * protection, which is clearly taint-worthy, let's go ahead and do
150 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
151 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
156 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
/*
 * Counterpart of vfio_iommu_group_get(): drops the reference on @group.
 * When built with CONFIG_VFIO_NOIOMMU and the bus of @dev has no IOMMU, the
 * device is removed from its group first.
 */
158 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
160 #ifdef CONFIG_VFIO_NOIOMMU
161 if (!iommu_present(dev->bus))
162 iommu_group_remove_device(dev);
165 iommu_group_put(group);
167 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
169 #ifdef CONFIG_VFIO_NOIOMMU
/*
 * open callback of the no-IOMMU backend.  Only the VFIO_NOIOMMU_IOMMU type
 * is accepted (-EINVAL otherwise) and the caller needs CAP_SYS_RAWIO
 * (-EPERM otherwise); errors are returned as ERR_PTR values.
 */
170 static void *vfio_noiommu_open(unsigned long arg)
172 if (arg != VFIO_NOIOMMU_IOMMU)
173 return ERR_PTR(-EINVAL);
174 if (!capable(CAP_SYS_RAWIO))
175 return ERR_PTR(-EPERM);
/*
 * release callback of the no-IOMMU backend.
 * NOTE(review): no statements of the body are visible in this view.
 */
180 static void vfio_noiommu_release(void *iommu_data)
/*
 * ioctl callback of the no-IOMMU backend.  For VFIO_CHECK_EXTENSION it
 * answers 1 when @arg is VFIO_NOIOMMU_IOMMU and 0 for any other extension.
 * NOTE(review): the result for other commands is not visible in this view.
 */
184 static long vfio_noiommu_ioctl(void *iommu_data,
185 unsigned int cmd, unsigned long arg)
187 if (cmd == VFIO_CHECK_EXTENSION)
188 return arg == VFIO_NOIOMMU_IOMMU ? 1 : 0;
/*
 * Per-device callback passed to iommu_group_for_each_dev(): yields 1 when
 * the bus of @dev has an IOMMU and 0 otherwise.  @unused is ignored.
 */
193 static int vfio_iommu_present(struct device *dev, void *unused)
195 return iommu_present(dev->bus) ? 1 : 0;
/*
 * attach_group callback of the no-IOMMU backend.  Walks the devices of
 * @iommu_group with vfio_iommu_present(); a non-zero result of that walk is
 * turned into -EINVAL, otherwise 0 is returned.
 */
198 static int vfio_noiommu_attach_group(void *iommu_data,
199 struct iommu_group *iommu_group)
201 return iommu_group_for_each_dev(iommu_group, NULL,
202 vfio_iommu_present) ? -EINVAL : 0;
/*
 * detach_group callback of the no-IOMMU backend.
 * NOTE(review): no statements of the body are visible in this view.
 */
205 static void vfio_noiommu_detach_group(void *iommu_data,
206 struct iommu_group *iommu_group)
/* Callback table of the built-in no-IOMMU backend. */
210 static struct vfio_iommu_driver_ops vfio_noiommu_ops = {
211 .name = "vfio-noiommu",
212 .owner = THIS_MODULE,
213 .open = vfio_noiommu_open,
214 .release = vfio_noiommu_release,
215 .ioctl = vfio_noiommu_ioctl,
216 .attach_group = vfio_noiommu_attach_group,
217 .detach_group = vfio_noiommu_detach_group,
/*
 * Statically allocated driver entry for the no-IOMMU backend.  The first
 * vfio_for_each_iommu_driver() definition hands it out directly for noiommu
 * containers instead of walking vfio.iommu_drivers_list.
 */
220 static struct vfio_iommu_driver vfio_noiommu_driver = {
221 .ops = &vfio_noiommu_ops,
225 * Wrap IOMMU drivers, the noiommu driver is the one and only driver for
226 * noiommu groups (and thus containers) and not available for normal groups.
/*
 * Iterate @pos over the IOMMU drivers that container @con may use.  Two
 * alternative definitions follow: the first yields only vfio_noiommu_driver
 * when con->noiommu is set and otherwise walks vfio.iommu_drivers_list; the
 * second always walks vfio.iommu_drivers_list.
 * NOTE(review): the preprocessor lines selecting between the two are not
 * visible in this view - presumably keyed on CONFIG_VFIO_NOIOMMU.
 */
228 #define vfio_for_each_iommu_driver(con, pos) \
229 for (pos = con->noiommu ? &vfio_noiommu_driver : \
230 list_first_entry(&vfio.iommu_drivers_list, \
231 struct vfio_iommu_driver, vfio_next); \
232 (con->noiommu ? pos != NULL : \
233 &pos->vfio_next != &vfio.iommu_drivers_list); \
234 pos = con->noiommu ? NULL : list_next_entry(pos, vfio_next))
236 #define vfio_for_each_iommu_driver(con, pos) \
237 list_for_each_entry(pos, &vfio.iommu_drivers_list, vfio_next)
242 * IOMMU driver registration
/*
 * Register an IOMMU backend described by @ops.  A zeroed vfio_iommu_driver
 * wrapper is allocated; then, under vfio.iommu_drivers_lock, the list is
 * scanned for an entry that already carries @ops before the wrapper is
 * added to vfio.iommu_drivers_list.
 * NOTE(review): the error codes returned on allocation failure or duplicate
 * registration are not visible in this view.
 */
244 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
246 struct vfio_iommu_driver *driver, *tmp;
248 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
254 mutex_lock(&vfio.iommu_drivers_lock);
256 /* Check for duplicates */
257 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
258 if (tmp->ops == ops) {
259 mutex_unlock(&vfio.iommu_drivers_lock);
265 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
267 mutex_unlock(&vfio.iommu_drivers_lock);
271 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
/*
 * Unregister the IOMMU backend described by @ops: under
 * vfio.iommu_drivers_lock, the list entry whose ops pointer equals @ops is
 * unlinked.  If no entry matches, the lock is simply released.
 */
273 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
275 struct vfio_iommu_driver *driver;
277 mutex_lock(&vfio.iommu_drivers_lock);
278 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
279 if (driver->ops == ops) {
280 list_del(&driver->vfio_next);
281 mutex_unlock(&vfio.iommu_drivers_lock);
286 mutex_unlock(&vfio.iommu_drivers_lock);
288 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
291 * Group minor allocation/free - both called with vfio.group_lock held
/*
 * Allocate an id for @group from vfio.group_idr, passing 0 and
 * MINORMASK + 1 as the bounds, and return the result of idr_alloc()
 * (callers treat a negative value as an error).
 */
293 static int vfio_alloc_group_minor(struct vfio_group *group)
295 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
/* Remove @minor from vfio.group_idr. */
298 static void vfio_free_group_minor(int minor)
300 idr_remove(&vfio.group_idr, minor);
/* Forward declarations of functions that are defined further down. */
303 static int vfio_iommu_group_notifier(struct notifier_block *nb,
304 unsigned long action, void *data);
305 static void vfio_group_get(struct vfio_group *group);
308 * Container objects - containers are created when /dev/vfio/vfio is
309 * opened, but their lifecycle extends until the last user is done, so
310 * it's freed via kref. Must support container/group/device being
311 * closed in any order.
/* Take an additional reference on @container. */
313 static void vfio_container_get(struct vfio_container *container)
315 kref_get(&container->kref);
/*
 * kref release callback passed to kref_put() by vfio_container_put();
 * recovers the vfio_container that embeds @kref.
 * NOTE(review): what is done with the container afterwards is not visible
 * in this view.
 */
318 static void vfio_container_release(struct kref *kref)
320 struct vfio_container *container;
321 container = container_of(kref, struct vfio_container, kref);
/*
 * Drop a reference on @container, with vfio_container_release() as the
 * release callback.
 */
326 static void vfio_container_put(struct vfio_container *container)
328 kref_put(&container->kref, vfio_container_release);
/*
 * Release vfio.group_lock, which the caller must hold on entry, and then
 * unregister the notifier of @group from its iommu_group.
 * NOTE(review): the name says the group is also freed; that statement is
 * not visible in this view.
 */
331 static void vfio_group_unlock_and_free(struct vfio_group *group)
333 mutex_unlock(&vfio.group_lock);
335 * Unregister outside of lock. A spurious callback is harmless now
336 * that the group is no longer in vfio.group_list.
338 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
343 * Group objects - create, release, get, put, search
/*
 * Allocate and publish a vfio_group for @iommu_group.
 *
 * The new group is initialised and its notifier registered before
 * vfio.group_lock is taken.  Under the lock, vfio.group_list is rechecked
 * for a group created concurrently for the same iommu_group, a minor is
 * allocated, the device node is created (its name prefixed with "noiommu-"
 * when noiommu is set) and the group is added to vfio.group_list.
 *
 * Allocation failure yields ERR_PTR(-ENOMEM); minor-allocation and
 * device-creation failures are returned as ERR_PTR values after
 * vfio_group_unlock_and_free() has run.
 */
345 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
348 struct vfio_group *group, *tmp;
352 group = kzalloc(sizeof(*group), GFP_KERNEL);
354 return ERR_PTR(-ENOMEM);
356 kref_init(&group->kref);
357 INIT_LIST_HEAD(&group->device_list);
358 mutex_init(&group->device_lock);
359 INIT_LIST_HEAD(&group->unbound_list);
360 mutex_init(&group->unbound_lock);
361 atomic_set(&group->container_users, 0);
362 atomic_set(&group->opened, 0);
363 group->iommu_group = iommu_group;
364 group->noiommu = noiommu;
366 group->nb.notifier_call = vfio_iommu_group_notifier;
369 * blocking notifiers acquire a rwsem around registering and hold
370 * it around callback. Therefore, need to register outside of
371 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
372 * do anything unless it can find the group in vfio.group_list, so
373 * no harm in registering early.
375 ret = iommu_group_register_notifier(iommu_group, &group->nb);
381 mutex_lock(&vfio.group_lock);
383 /* Did we race creating this group? */
384 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
385 if (tmp->iommu_group == iommu_group) {
387 vfio_group_unlock_and_free(group);
392 minor = vfio_alloc_group_minor(group);
394 vfio_group_unlock_and_free(group);
395 return ERR_PTR(minor);
398 dev = device_create(vfio.class, NULL,
399 MKDEV(MAJOR(vfio.group_devt), minor),
400 group, "%s%d", noiommu ? "noiommu-" : "",
401 iommu_group_id(iommu_group));
403 vfio_free_group_minor(minor);
404 vfio_group_unlock_and_free(group);
405 return (struct vfio_group *)dev; /* ERR_PTR */
408 group->minor = minor;
411 list_add(&group->vfio_next, &vfio.group_list);
413 mutex_unlock(&vfio.group_lock);
/*
 * kref release callback for a vfio_group (see vfio_group_put()).  Warns if
 * devices are still listed, empties the unbound list, destroys the device
 * node, unlinks the group from vfio.group_list and returns its minor.
 * vfio_group_unlock_and_free() then drops vfio.group_lock, and finally the
 * iommu_group reference is released; the iommu_group pointer is cached in a
 * local beforehand and used for that last put.
 */
418 /* called with vfio.group_lock held */
419 static void vfio_group_release(struct kref *kref)
421 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
422 struct vfio_unbound_dev *unbound, *tmp;
423 struct iommu_group *iommu_group = group->iommu_group;
425 WARN_ON(!list_empty(&group->device_list));
427 list_for_each_entry_safe(unbound, tmp,
428 &group->unbound_list, unbound_next) {
429 list_del(&unbound->unbound_next);
433 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
434 list_del(&group->vfio_next);
435 vfio_free_group_minor(group->minor);
436 vfio_group_unlock_and_free(group);
437 iommu_group_put(iommu_group);
/*
 * Drop a reference on @group.  kref_put_mutex() is given vfio.group_lock so
 * that vfio_group_release() runs with that lock held.
 */
440 static void vfio_group_put(struct vfio_group *group)
442 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
/* Take an additional reference on @group. */
445 /* Assume group_lock or group reference is held */
446 static void vfio_group_get(struct vfio_group *group)
448 kref_get(&group->kref);
452 * Not really a try as we will sleep for mutex, but we need to make
453 * sure the group pointer is valid under lock and get a reference.
/*
 * Validate a possibly stale group pointer: under vfio.group_lock, search
 * vfio.group_list for an entry identical to @group and take a reference on
 * it when found.
 * NOTE(review): the return statements are not visible in this view; the
 * notifier treats the result as the group to use.
 */
455 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
457 struct vfio_group *target = group;
459 mutex_lock(&vfio.group_lock);
460 list_for_each_entry(group, &vfio.group_list, vfio_next) {
461 if (group == target) {
462 vfio_group_get(group);
463 mutex_unlock(&vfio.group_lock);
467 mutex_unlock(&vfio.group_lock);
/*
 * Find the vfio_group that wraps @iommu_group.  vfio.group_list is searched
 * under vfio.group_lock and a reference is taken on the match before the
 * lock is released.
 */
473 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
475 struct vfio_group *group;
477 mutex_lock(&vfio.group_lock);
478 list_for_each_entry(group, &vfio.group_list, vfio_next) {
479 if (group->iommu_group == iommu_group) {
480 vfio_group_get(group);
481 mutex_unlock(&vfio.group_lock);
485 mutex_unlock(&vfio.group_lock);
/*
 * Look a group up by @minor in vfio.group_idr.  The lookup and the
 * reference grab both happen under vfio.group_lock.
 */
490 static struct vfio_group *vfio_group_get_from_minor(int minor)
492 struct vfio_group *group;
494 mutex_lock(&vfio.group_lock);
495 group = idr_find(&vfio.group_idr, minor);
497 mutex_unlock(&vfio.group_lock);
500 vfio_group_get(group);
501 mutex_unlock(&vfio.group_lock);
507 * Device objects - create, release, get, put, search
/*
 * Allocate a vfio_device belonging to @group.  The new object gets an
 * initialised kref, records @group and device_data, and is stored as the
 * drvdata of the struct device.  A group reference is taken on its behalf
 * and it is linked on group->device_list under device_lock.
 * Returns ERR_PTR(-ENOMEM) when the allocation fails.
 */
510 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
512 const struct vfio_device_ops *ops,
515 struct vfio_device *device;
517 device = kzalloc(sizeof(*device), GFP_KERNEL);
519 return ERR_PTR(-ENOMEM);
521 kref_init(&device->kref);
523 device->group = group;
525 device->device_data = device_data;
526 dev_set_drvdata(dev, device);
528 /* No need to get group_lock, caller has group reference */
529 vfio_group_get(group);
531 mutex_lock(&group->device_lock);
532 list_add(&device->group_next, &group->device_list);
533 mutex_unlock(&group->device_lock);
/*
 * kref release callback for a vfio_device (see vfio_device_put()).  Unlinks
 * the device from the group's device_list and releases group->device_lock,
 * which kref_put_mutex() was given by the caller.  It then clears the
 * drvdata of the underlying struct device and wakes any waiter on
 * vfio.release_q.
 */
538 static void vfio_device_release(struct kref *kref)
540 struct vfio_device *device = container_of(kref,
541 struct vfio_device, kref);
542 struct vfio_group *group = device->group;
544 list_del(&device->group_next);
545 mutex_unlock(&group->device_lock);
547 dev_set_drvdata(device->dev, NULL);
551 /* vfio_del_group_dev may be waiting for this device */
552 wake_up(&vfio.release_q);
/*
 * Drop a device reference together with the group reference that goes with
 * it.  kref_put_mutex() is given group->device_lock so that
 * vfio_device_release() runs with it held.  The group pointer is read
 * before the put and used afterwards (presumably because @device may no
 * longer exist by then).
 */
555 /* Device reference always implies a group reference */
556 void vfio_device_put(struct vfio_device *device)
558 struct vfio_group *group = device->group;
559 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
560 vfio_group_put(group);
562 EXPORT_SYMBOL_GPL(vfio_device_put);
/* Take a device reference plus a reference on the group that owns it. */
564 static void vfio_device_get(struct vfio_device *device)
566 vfio_group_get(device->group);
567 kref_get(&device->kref);
/*
 * Search group->device_list, under device_lock, for the vfio_device whose
 * dev pointer equals the given struct device and take a reference on the
 * match.
 */
570 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
573 struct vfio_device *device;
575 mutex_lock(&group->device_lock);
576 list_for_each_entry(device, &group->device_list, group_next) {
577 if (device->dev == dev) {
578 vfio_device_get(device);
579 mutex_unlock(&group->device_lock);
583 mutex_unlock(&group->device_lock);
588 * Some drivers, like pci-stub, are only used to prevent other drivers from
589 * claiming a device and are therefore perfectly legitimate for a user owned
590 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
591 * of the device, but it does prevent the user from having direct access to
592 * the device, which is useful in some circumstances.
594 * We also assume that we can include PCI interconnect devices, ie. bridges.
595 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
596 * then all of the downstream devices will be part of the same IOMMU group as
597 * the bridge. Thus, if placing the bridge into the user owned IOVA space
598 * breaks anything, it only does so for user owned devices downstream. Note
599 * that error notification via MSI can be affected for platforms that handle
600 * MSI within the same IOVA space as DMA.
/* Driver names that vfio_dev_whitelisted() compares drv->name against. */
602 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
/*
 * Decide whether @dev, bound to @drv, may stay in a user-owned group.  PCI
 * devices whose header type is not PCI_HEADER_TYPE_NORMAL are special-cased,
 * and drv->name is compared against every vfio_driver_whitelist entry.
 * NOTE(review): the return statements are not visible in this view.
 */
604 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
608 if (dev_is_pci(dev)) {
609 struct pci_dev *pdev = to_pci_dev(dev);
611 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
615 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
616 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
624 * A vfio group is viable for use by userspace if all devices are in
625 * one of the following states:
627 * - bound to a vfio driver
628 * - bound to a whitelisted driver
629 * - a PCI interconnect device
631 * We use two methods to determine whether a device is bound to a vfio
632 * driver. The first is to test whether the device exists in the vfio
633 * group. The second is to test if the device exists on the group
634 * unbound_list, indicating it's in the middle of transitioning from
635 * a vfio driver to driver-less.
/*
 * Per-device viability check, used as the iommu_group_for_each_dev()
 * callback in vfio_group_viable(); @data is the vfio_group.  In order it
 * looks for @dev on the group's unbound list, tests for a driver-less or
 * whitelisted device, and finally checks whether the device is a known
 * vfio_device of the group (dropping the temporary reference again).
 * The driver pointer is sampled once with ACCESS_ONCE().
 */
637 static int vfio_dev_viable(struct device *dev, void *data)
639 struct vfio_group *group = data;
640 struct vfio_device *device;
641 struct device_driver *drv = ACCESS_ONCE(dev->driver);
642 struct vfio_unbound_dev *unbound;
645 mutex_lock(&group->unbound_lock);
646 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
647 if (dev == unbound->dev) {
652 mutex_unlock(&group->unbound_lock);
654 if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
657 device = vfio_group_get_device(group, dev);
659 vfio_device_put(device);
667 * Async device support
669 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
671 struct vfio_device *device;
673 /* Do we already know about it? We shouldn't */
674 device = vfio_group_get_device(group, dev);
675 if (WARN_ON_ONCE(device)) {
676 vfio_device_put(device);
680 /* Nothing to do for idle groups */
681 if (!atomic_read(&group->container_users))
684 /* TODO Prevent device auto probing */
685 WARN("Device %s added to live group %d!\n", dev_name(dev),
686 iommu_group_id(group->iommu_group));
/*
 * Notifier helper: re-check the viability of @dev for @group.  The check is
 * skipped for groups without container users; otherwise the result of
 * vfio_dev_viable() is returned.
 */
691 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
693 /* We don't care what happens when the group isn't in use */
694 if (!atomic_read(&group->container_users))
697 return vfio_dev_viable(dev, group);
/*
 * IOMMU group notifier callback, installed by vfio_create_group().
 *
 * The group embedding @nb is re-validated through vfio_group_try_get() to
 * obtain a reference, @action is dispatched, and the reference is dropped
 * with vfio_group_put() at the end.  @data is the struct device concerned.
 *
 * ADD_DEVICE is handed to vfio_group_nb_add_dev().  BOUND_DRIVER triggers
 * BUG_ON() when vfio_group_nb_verify() reports a non-viable device.
 * UNBOUND_DRIVER removes the device from the group's unbound list.  The
 * remaining visible cases only emit debug output.
 */
700 static int vfio_iommu_group_notifier(struct notifier_block *nb,
701 unsigned long action, void *data)
703 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
704 struct device *dev = data;
705 struct vfio_unbound_dev *unbound;
708 * Need to go through a group_lock lookup to get a reference or we
709 * risk racing a group being removed. Ignore spurious notifies.
711 group = vfio_group_try_get(group);
716 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
717 vfio_group_nb_add_dev(group, dev);
719 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
721 * Nothing to do here. If the device is in use, then the
722 * vfio sub-driver should block the remove callback until
723 * it is unused. If the device is unused or attached to a
724 * stub driver, then it should be released and we don't
725 * care that it will be going away.
728 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
729 pr_debug("%s: Device %s, group %d binding to driver\n",
730 __func__, dev_name(dev),
731 iommu_group_id(group->iommu_group));
733 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
734 pr_debug("%s: Device %s, group %d bound to driver %s\n",
735 __func__, dev_name(dev),
736 iommu_group_id(group->iommu_group), dev->driver->name);
737 BUG_ON(vfio_group_nb_verify(group, dev));
739 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
740 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
741 __func__, dev_name(dev),
742 iommu_group_id(group->iommu_group), dev->driver->name);
744 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
745 pr_debug("%s: Device %s, group %d unbound from driver\n",
746 __func__, dev_name(dev),
747 iommu_group_id(group->iommu_group));
749 * XXX An unbound device in a live group is ok, but we'd
750 * really like to avoid the above BUG_ON by preventing other
751 * drivers from binding to it. Once that occurs, we have to
752 * stop the system to maintain isolation. At a minimum, we'd
753 * want a toggle to disable driver auto probe for this device.
756 mutex_lock(&group->unbound_lock);
757 list_for_each_entry(unbound,
758 &group->unbound_list, unbound_next) {
759 if (dev == unbound->dev) {
760 list_del(&unbound->unbound_next);
765 mutex_unlock(&group->unbound_lock);
769 vfio_group_put(group);
/*
 * Register @dev with vfio on behalf of a bus driver.
 *
 * The vfio_group for the device's iommu_group is looked up, or created if
 * it does not exist yet.  A device that is already present in the group is
 * rejected with a warning.  Otherwise a vfio_device carrying @ops and
 * @device_data is created.  On return only the reference chain
 * vfio_device -> vfio_group -> iommu_group remains; the temporary group and
 * iommu_group references are dropped.
 *
 * Errors from group or device creation are returned via PTR_ERR().
 */
776 int vfio_add_group_dev(struct device *dev,
777 const struct vfio_device_ops *ops, void *device_data)
779 struct iommu_group *iommu_group;
780 struct vfio_group *group;
781 struct vfio_device *device;
783 iommu_group = iommu_group_get(dev);
787 group = vfio_group_get_from_iommu(iommu_group);
789 group = vfio_create_group(iommu_group,
790 !iommu_present(dev->bus));
792 iommu_group_put(iommu_group);
793 return PTR_ERR(group);
797 * A found vfio_group already holds a reference to the
798 * iommu_group. A created vfio_group keeps the reference.
800 iommu_group_put(iommu_group);
803 device = vfio_group_get_device(group, dev);
805 WARN(1, "Device %s already exists on group %d\n",
806 dev_name(dev), iommu_group_id(iommu_group));
807 vfio_device_put(device);
808 vfio_group_put(group);
812 device = vfio_group_create_device(group, dev, ops, device_data);
813 if (IS_ERR(device)) {
814 vfio_group_put(group);
815 return PTR_ERR(device);
819 * Drop all but the vfio_device reference. The vfio_device holds
820 * a reference to the vfio_group, which holds a reference to the
823 vfio_group_put(group);
827 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
830 * Get a reference to the vfio_device for a device. Even if the
831 * caller thinks they own the device, they could be racing with a
832 * release call path, so we can't trust drvdata for the shortcut.
833 * Go the long way around, from the iommu_group to the vfio_group
834 * to the vfio_device.
/*
 * Look up the vfio_device for @dev by walking iommu_group -> vfio_group ->
 * vfio_device.  The iommu_group and vfio_group references taken along the
 * way are dropped again; the reference on the device taken by
 * vfio_group_get_device() is kept.
 */
836 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
838 struct iommu_group *iommu_group;
839 struct vfio_group *group;
840 struct vfio_device *device;
842 iommu_group = iommu_group_get(dev);
846 group = vfio_group_get_from_iommu(iommu_group);
847 iommu_group_put(iommu_group);
851 device = vfio_group_get_device(group, dev);
852 vfio_group_put(group);
856 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
/*
 * Find the device in @group whose dev_name() equals the string in buf.  The
 * search runs under group->device_lock and a reference is taken on the
 * match; the result variable starts out as NULL.
 */
858 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
861 struct vfio_device *it, *device = NULL;
863 mutex_lock(&group->device_lock);
864 list_for_each_entry(it, &group->device_list, group_next) {
865 if (!strcmp(dev_name(it->dev), buf)) {
867 vfio_device_get(device);
871 mutex_unlock(&group->device_lock);
877 * Caller must hold a reference to the vfio_device
/*
 * Return the opaque device_data pointer stored in @device when it was
 * created.
 */
879 void *vfio_device_data(struct vfio_device *device)
881 return device->device_data;
883 EXPORT_SYMBOL_GPL(vfio_device_data);
/*
 * Probe @group for @dev using vfio_group_get_device(); the reference that
 * lookup takes is dropped again before returning.
 */
885 /* Given a referenced group, check if it contains the device */
886 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
888 struct vfio_device *device;
890 device = vfio_group_get_device(group, dev);
894 vfio_device_put(device);
899 * Decrement the device reference count and wait for the device to be
900 * removed. Open file descriptors for the device... */
/*
 * Remove @dev from vfio on behalf of a bus driver and wait until it is
 * gone.
 *
 * The vfio_device is taken from the drvdata of @dev.  A group reference is
 * held for the duration.  The device is recorded on the group's unbound
 * list and the reference on the device is dropped.  While the device is
 * still present in the group, ops->request() is invoked (when implemented)
 * with an increasing counter, followed by a wait on vfio.release_q of up to
 * HZ * 10 jiffies.
 *
 * NOTE(review): both an uninterruptible and an interruptible wait appear;
 * the condition choosing between them is not visible in this view -
 * presumably the `interrupted` flag, set once -ERESTARTSYS has been seen.
 */
901 void *vfio_del_group_dev(struct device *dev)
903 struct vfio_device *device = dev_get_drvdata(dev);
904 struct vfio_group *group = device->group;
905 void *device_data = device->device_data;
906 struct vfio_unbound_dev *unbound;
909 bool interrupted = false;
912 * The group exists so long as we have a device reference. Get
913 * a group reference and use it to scan for the device going away.
915 vfio_group_get(group);
918 * When the device is removed from the group, the group suddenly
919 * becomes non-viable; the device has a driver (until the unbind
920 * completes), but it's not present in the group. This is bad news
921 * for any external users that need to re-acquire a group reference
922 * in order to match and release their existing reference. To
923 * solve this, we track such devices on the unbound_list to bridge
924 * the gap until they're fully unbound.
926 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
929 mutex_lock(&group->unbound_lock);
930 list_add(&unbound->unbound_next, &group->unbound_list);
931 mutex_unlock(&group->unbound_lock);
935 vfio_device_put(device);
938 * If the device is still present in the group after the above
939 * 'put', then it is in use and we need to request it from the
940 * bus driver. The driver may in turn need to request the
941 * device from the user. We send the request on an arbitrary
942 * interval with counter to allow the driver to take escalating
943 * measures to release the device if it has the ability to do so.
946 device = vfio_group_get_device(group, dev);
950 if (device->ops->request)
951 device->ops->request(device_data, i++);
953 vfio_device_put(device);
956 ret = wait_event_timeout(vfio.release_q,
957 !vfio_dev_present(group, dev), HZ * 10);
959 ret = wait_event_interruptible_timeout(vfio.release_q,
960 !vfio_dev_present(group, dev), HZ * 10);
961 if (ret == -ERESTARTSYS) {
964 "Device is currently in use, task"
966 "blocked until device is released",
967 current->comm, task_pid_nr(current));
972 vfio_group_put(group);
976 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
979 * VFIO base fd, /dev/vfio/vfio
/*
 * VFIO_CHECK_EXTENSION handler for the container fd.
 *
 * container->group_lock is held for read throughout.  When the container
 * has no IOMMU driver yet, the candidate drivers are polled under
 * vfio.iommu_drivers_lock; each is called with NULL data while a module
 * reference is held around the call.  When a driver is already set, the
 * query is passed to that driver with the container's iommu_data.
 */
981 static long vfio_ioctl_check_extension(struct vfio_container *container,
984 struct vfio_iommu_driver *driver;
987 down_read(&container->group_lock);
989 driver = container->iommu_driver;
992 /* No base extensions yet */
995 * If no driver is set, poll all registered drivers for
996 * extensions and return the first positive result. If
997 * a driver is already set, further queries will be passed
998 * only to that driver.
1001 mutex_lock(&vfio.iommu_drivers_lock);
1002 vfio_for_each_iommu_driver(container, driver) {
1003 if (!try_module_get(driver->ops->owner))
1006 ret = driver->ops->ioctl(NULL,
1007 VFIO_CHECK_EXTENSION,
1009 module_put(driver->ops->owner);
1013 mutex_unlock(&vfio.iommu_drivers_lock);
1015 ret = driver->ops->ioctl(container->iommu_data,
1016 VFIO_CHECK_EXTENSION, arg);
1019 up_read(&container->group_lock);
/*
 * Attach every group on container->group_list to @driver using the
 * driver's attach_group callback.  If an attach fails, the list is walked
 * backwards from the failing group and the groups before it are detached
 * again.
 */
1024 /* hold write lock on container->group_lock */
1025 static int __vfio_container_attach_groups(struct vfio_container *container,
1026 struct vfio_iommu_driver *driver,
1029 struct vfio_group *group;
1032 list_for_each_entry(group, &container->group_list, container_next) {
1033 ret = driver->ops->attach_group(data, group->iommu_group);
1041 list_for_each_entry_continue_reverse(group, &container->group_list,
1043 driver->ops->detach_group(data, group->iommu_group);
/*
 * VFIO_SET_IOMMU handler for the container fd.
 *
 * Runs with container->group_lock held for write.  The request is refused
 * early when the container has no groups or already has an IOMMU driver.
 * Otherwise the candidate drivers are walked under vfio.iommu_drivers_lock.
 * A driver that reports support for @arg via VFIO_CHECK_EXTENSION is opened
 * with the drivers lock released, relying on the module reference.  All
 * groups of the container are then attached to it.  On success the driver
 * and its data are stored in the container; on failure the driver data is
 * released and the module reference dropped.
 */
1049 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1052 struct vfio_iommu_driver *driver;
1055 down_write(&container->group_lock);
1058 * The container is designed to be an unprivileged interface while
1059 * the group can be assigned to specific users. Therefore, only by
1060 * adding a group to a container does the user get the privilege of
1061 * enabling the iommu, which may allocate finite resources. There
1062 * is no unset_iommu, but by removing all the groups from a container,
1063 * the container is deprivileged and returns to an unset state.
1065 if (list_empty(&container->group_list) || container->iommu_driver) {
1066 up_write(&container->group_lock);
1070 mutex_lock(&vfio.iommu_drivers_lock);
1071 vfio_for_each_iommu_driver(container, driver) {
1074 if (!try_module_get(driver->ops->owner))
1078 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1079 * so test which iommu driver reported support for this
1080 * extension and call open on them. We also pass them the
1081 * magic, allowing a single driver to support multiple
1082 * interfaces if they'd like.
1084 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1085 module_put(driver->ops->owner);
1089 /* module reference holds the driver we're working on */
1090 mutex_unlock(&vfio.iommu_drivers_lock);
1092 data = driver->ops->open(arg);
1094 ret = PTR_ERR(data);
1095 module_put(driver->ops->owner);
1096 goto skip_drivers_unlock;
1099 ret = __vfio_container_attach_groups(container, driver, data);
1101 container->iommu_driver = driver;
1102 container->iommu_data = data;
1104 driver->ops->release(data);
1105 module_put(driver->ops->owner);
1108 goto skip_drivers_unlock;
1111 mutex_unlock(&vfio.iommu_drivers_lock);
1112 skip_drivers_unlock:
1113 up_write(&container->group_lock);
/*
 * ioctl entry point of the container fd.  VFIO_GET_API_VERSION,
 * VFIO_CHECK_EXTENSION and VFIO_SET_IOMMU are handled here; in the
 * remaining path the command is forwarded to the container's IOMMU driver,
 * if one is set, with group_lock held for read.
 */
1118 static long vfio_fops_unl_ioctl(struct file *filep,
1119 unsigned int cmd, unsigned long arg)
1121 struct vfio_container *container = filep->private_data;
1122 struct vfio_iommu_driver *driver;
1130 case VFIO_GET_API_VERSION:
1131 ret = VFIO_API_VERSION;
1133 case VFIO_CHECK_EXTENSION:
1134 ret = vfio_ioctl_check_extension(container, arg);
1136 case VFIO_SET_IOMMU:
1137 ret = vfio_ioctl_set_iommu(container, arg);
1140 down_read(&container->group_lock);
1142 driver = container->iommu_driver;
1143 data = container->iommu_data;
1145 if (driver) /* passthrough all unrecognized ioctls */
1146 ret = driver->ops->ioctl(data, cmd, arg);
1148 up_read(&container->group_lock);
/*
 * Compat ioctl entry point of the container fd: @arg is converted with
 * compat_ptr() and the call is handed to vfio_fops_unl_ioctl().
 */
1154 #ifdef CONFIG_COMPAT
1155 static long vfio_fops_compat_ioctl(struct file *filep,
1156 unsigned int cmd, unsigned long arg)
1158 arg = (unsigned long)compat_ptr(arg);
1159 return vfio_fops_unl_ioctl(filep, cmd, arg);
1161 #endif /* CONFIG_COMPAT */
/*
 * open() of the container fd: allocates a zeroed vfio_container, initialises
 * its group list, group_lock and kref, and stores it in filep->private_data.
 */
1163 static int vfio_fops_open(struct inode *inode, struct file *filep)
1165 struct vfio_container *container;
1167 container = kzalloc(sizeof(*container), GFP_KERNEL);
1171 INIT_LIST_HEAD(&container->group_list);
1172 init_rwsem(&container->group_lock);
1173 kref_init(&container->kref);
1175 filep->private_data = container;
/*
 * release() of the container fd: clears filep->private_data and drops the
 * container reference that was held through the file.
 */
1180 static int vfio_fops_release(struct inode *inode, struct file *filep)
1182 struct vfio_container *container = filep->private_data;
1184 filep->private_data = NULL;
1186 vfio_container_put(container);
1192 * Once an iommu driver is set, we optionally pass read/write/mmap
1193 * on to the driver, allowing management interfaces beyond ioctl.
/*
 * read() of the container fd.  With group_lock held for read, the call is
 * forwarded to the IOMMU driver's read callback when a driver is set and
 * implements it; otherwise the initial value -EINVAL is left in place.
 */
1195 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1196 size_t count, loff_t *ppos)
1198 struct vfio_container *container = filep->private_data;
1199 struct vfio_iommu_driver *driver;
1200 ssize_t ret = -EINVAL;
1202 down_read(&container->group_lock);
1204 driver = container->iommu_driver;
1205 if (likely(driver && driver->ops->read))
1206 ret = driver->ops->read(container->iommu_data,
1209 up_read(&container->group_lock);
/*
 * write() of the container fd.  With group_lock held for read, the call is
 * forwarded to the IOMMU driver's write callback when a driver is set and
 * implements it; otherwise the initial value -EINVAL is left in place.
 */
1214 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1215 size_t count, loff_t *ppos)
1217 struct vfio_container *container = filep->private_data;
1218 struct vfio_iommu_driver *driver;
1219 ssize_t ret = -EINVAL;
1221 down_read(&container->group_lock);
1223 driver = container->iommu_driver;
1224 if (likely(driver && driver->ops->write))
1225 ret = driver->ops->write(container->iommu_data,
1228 up_read(&container->group_lock);
/*
 * mmap() of the container fd.  With group_lock held for read, the call is
 * forwarded to the IOMMU driver's mmap callback when a driver is set and
 * implements it.
 * NOTE(review): the value returned when it is not forwarded is not visible
 * in this view.
 */
1233 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1235 struct vfio_container *container = filep->private_data;
1236 struct vfio_iommu_driver *driver;
1239 down_read(&container->group_lock);
1241 driver = container->iommu_driver;
1242 if (likely(driver && driver->ops->mmap))
1243 ret = driver->ops->mmap(container->iommu_data, vma);
1245 up_read(&container->group_lock);
/*
 * File operations of the container fd.  vfio_group_set_container() compares
 * a file's f_op against this table to verify that an fd is a container.
 */
1250 static const struct file_operations vfio_fops = {
1251 .owner = THIS_MODULE,
1252 .open = vfio_fops_open,
1253 .release = vfio_fops_release,
1254 .read = vfio_fops_read,
1255 .write = vfio_fops_write,
1256 .unlocked_ioctl = vfio_fops_unl_ioctl,
1257 #ifdef CONFIG_COMPAT
1258 .compat_ioctl = vfio_fops_compat_ioctl,
1260 .mmap = vfio_fops_mmap,
1264 * VFIO Group fd, /dev/vfio/$GROUP
/*
 * Detach @group from its container.  With the container's group_lock held
 * for write, the group is detached from the IOMMU driver via detach_group,
 * its container pointer is cleared and it is unlinked from the container's
 * group list.  If a driver is set and no groups remain, the driver data is
 * released, the module reference dropped and the container's driver/data
 * fields reset.  After unlocking, the container reference held on behalf of
 * the group is dropped.
 */
1266 static void __vfio_group_unset_container(struct vfio_group *group)
1268 struct vfio_container *container = group->container;
1269 struct vfio_iommu_driver *driver;
1271 down_write(&container->group_lock);
1273 driver = container->iommu_driver;
1275 driver->ops->detach_group(container->iommu_data,
1276 group->iommu_group);
1278 group->container = NULL;
1279 list_del(&group->container_next);
1281 /* Detaching the last group deprivileges a container, remove iommu */
1282 if (driver && list_empty(&container->group_list)) {
1283 driver->ops->release(container->iommu_data);
1284 module_put(driver->ops->owner);
1285 container->iommu_driver = NULL;
1286 container->iommu_data = NULL;
1289 up_write(&container->group_lock);
1291 vfio_container_put(container);
1295 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1296 * if there was no container to unset. Since the ioctl is called on
1297 * the group, we know that still exists, therefore the only valid
1298 * transition here is 1->0.
/*
 * Attempt the 1 -> 0 transition of group->container_users with
 * atomic_cmpxchg() and, on the path visible here, detach the group through
 * __vfio_group_unset_container().
 * NOTE(review): the checks on the cmpxchg result and the return values are
 * not visible in this view.
 */
1300 static int vfio_group_unset_container(struct vfio_group *group)
1302 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1309 __vfio_group_unset_container(group);
1315 * When removing container users, anything that removes the last user
1316 * implicitly removes the group from the container. That is, if the
1317 * group file descriptor is closed, as well as any device file descriptors,
1318 * the group is free.
1320 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1322 if (0 == atomic_dec_if_positive(&group->container_users))
1323 __vfio_group_unset_container(group);
/*
 * Attach @group to the container referred to by @container_fd.
 * NOTE(review): this listing omits several short lines (branch bodies,
 * returns, labels, the declarations of f and ret); the comments below
 * describe only the statements that are visible.
 */
1326 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1329 struct vfio_container *container;
1330 struct vfio_iommu_driver *driver;
/* Tests whether the group already has container users. */
1333 if (atomic_read(&group->container_users))
/* noiommu groups additionally require CAP_SYS_RAWIO. */
1336 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1339 f = fdget(container_fd);
1343 /* Sanity check, is this really our fd? */
1344 if (f.file->f_op != &vfio_fops) {
1349 container = f.file->private_data;
1350 WARN_ON(!container); /* fget ensures we don't race vfio_release */
/* Write lock: the container's group list and noiommu flag change below. */
1352 down_write(&container->group_lock);
1354 /* Real groups and fake groups cannot mix */
1355 if (!list_empty(&container->group_list) &&
1356 container->noiommu != group->noiommu) {
1361 driver = container->iommu_driver;
/* Hand the group's iommu_group to the container's IOMMU driver. */
1363 ret = driver->ops->attach_group(container->iommu_data,
1364 group->iommu_group);
/* Link group and container; the container takes on the group's noiommu flag. */
1369 group->container = container;
1370 container->noiommu = group->noiommu;
1371 list_add(&group->container_next, &container->group_list);
1373 /* Get a reference on the container and mark a user within the group */
1374 vfio_container_get(container);
1375 atomic_inc(&group->container_users);
1378 up_write(&container->group_lock);
1383 static bool vfio_group_viable(struct vfio_group *group)
1385 return (iommu_group_for_each_dev(group->iommu_group,
1386 group, vfio_dev_viable) == 0);
/*
 * Forward declaration: vfio_group_get_device_fd() passes this table to
 * anon_inode_getfile() before the table itself is defined further down.
 */
1389 static const struct file_operations vfio_device_fops;
/*
 * Open the device named by @buf within @group and install a new
 * anonymous-inode file for it on a freshly reserved fd.
 * NOTE(review): this listing omits several short lines (declarations,
 * branch conditions and bodies, returns); the comments below describe only
 * the statements that are visible.
 */
1391 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1393 struct vfio_device *device;
/*
 * Preconditions tested: at least one container user, an IOMMU driver set
 * on the container, and a viable group.
 */
1397 if (0 == atomic_read(&group->container_users) ||
1398 !group->container->iommu_driver || !vfio_group_viable(group))
/* noiommu groups additionally require CAP_SYS_RAWIO. */
1401 if (group->noiommu && !capable(CAP_SYS_RAWIO))
/* Look the device up by name; it is handed back via vfio_device_put(). */
1404 device = vfio_device_get_from_name(group, buf);
/* Let the bus driver open the device. */
1408 ret = device->ops->open(device->device_data);
1410 vfio_device_put(device);
1415 * We can't use anon_inode_getfd() because we need to modify
1416 * the f_mode flags directly to allow more than just ioctls
/* Reserve a close-on-exec fd number; ret carries it from here on. */
1418 ret = get_unused_fd_flags(O_CLOEXEC);
/* Unwind: undo the open and give the device back. */
1420 device->ops->release(device->device_data);
1421 vfio_device_put(device);
1425 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1427 if (IS_ERR(filep)) {
1429 ret = PTR_ERR(filep);
1430 device->ops->release(device->device_data);
1431 vfio_device_put(device);
1436 * TODO: add an anon_inode interface to do this.
1437 * Appears to be missing by lack of need rather than
1438 * explicitly prevented. Now there's need.
/* Permit lseek, pread and pwrite on the device file in addition to ioctl. */
1440 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
/*
 * The open device counts as a container user; vfio_device_fops_release()
 * undoes this through vfio_group_try_dissolve_container().
 */
1442 atomic_inc(&group->container_users);
/* Publish the file on the reserved fd. */
1444 fd_install(ret, filep);
1447 dev_warn(device->dev, "vfio-noiommu device opened by user "
1448 "(%s:%d)\n", current->comm, task_pid_nr(current));
/*
 * ioctl dispatcher for the group fd.  Handles VFIO_GROUP_GET_STATUS,
 * VFIO_GROUP_SET_CONTAINER, VFIO_GROUP_UNSET_CONTAINER and
 * VFIO_GROUP_GET_DEVICE_FD.
 * NOTE(review): the switch statement itself, several branch bodies and
 * the returns are not visible in this listing.
 */
1453 static long vfio_group_fops_unl_ioctl(struct file *filep,
1454 unsigned int cmd, unsigned long arg)
1456 struct vfio_group *group = filep->private_data;
1460 case VFIO_GROUP_GET_STATUS:
1462 struct vfio_group_status status;
1463 unsigned long minsz;
/* Only the part of the structure up to and including flags is exchanged. */
1465 minsz = offsetofend(struct vfio_group_status, flags);
1467 if (copy_from_user(&status, (void __user *)arg, minsz))
/* The size announced by the caller must cover at least minsz bytes. */
1470 if (status.argsz < minsz)
1475 if (vfio_group_viable(group))
1476 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1478 if (group->container)
1479 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1481 if (copy_to_user((void __user *)arg, &status, minsz))
1487 case VFIO_GROUP_SET_CONTAINER:
/* arg points at an int holding the container fd. */
1491 if (get_user(fd, (int __user *)arg))
1497 ret = vfio_group_set_container(group, fd);
1500 case VFIO_GROUP_UNSET_CONTAINER:
1501 ret = vfio_group_unset_container(group);
1503 case VFIO_GROUP_GET_DEVICE_FD:
/* arg points at a user string naming the device; at most PAGE_SIZE bytes are copied. */
1507 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1509 return PTR_ERR(buf);
1511 ret = vfio_group_get_device_fd(group, buf);
1520 #ifdef CONFIG_COMPAT
/*
 * compat ioctl entry point for the group fd: convert the compat pointer
 * argument with compat_ptr() and hand off to the native handler.
 */
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_group_fops_unl_ioctl(filep, cmd, native_arg);
}
1527 #endif /* CONFIG_COMPAT */
/*
 * open handler for the group character device.  Looks the group up from
 * the inode's minor number and stores it in filep->private_data.
 * NOTE(review): the failure returns and the declaration of opened are not
 * visible in this listing.
 */
1529 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1531 struct vfio_group *group;
/* The lookup result is handed back with vfio_group_put() on the paths below. */
1534 group = vfio_group_get_from_minor(iminor(inode));
/* noiommu groups additionally require CAP_SYS_RAWIO. */
1538 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1539 vfio_group_put(group);
1543 /* Do we need multiple instances of the group open? Seems not. */
/* Try the 0 -> 1 transition; opened receives the value seen beforehand. */
1544 opened = atomic_cmpxchg(&group->opened, 0, 1);
1546 vfio_group_put(group);
1550 /* Is something still in use from a previous open? */
1551 if (group->container) {
/* Undo the opened mark set above before giving the group back. */
1552 atomic_dec(&group->opened);
1553 vfio_group_put(group);
/* vfio_group_fops_release() reads the group back from private_data. */
1557 filep->private_data = group;
/*
 * release handler for the group fd: undoes what vfio_group_fops_open()
 * set up.
 */
1562 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1564 struct vfio_group *group = filep->private_data;
1566 filep->private_data = NULL;
/* Drop this fd's container user, if there is one. */
1568 vfio_group_try_dissolve_container(group);
/* Clear the opened mark set by the cmpxchg in open. */
1570 atomic_dec(&group->opened);
1572 vfio_group_put(group);
/*
 * File operations for the group character device; vfio_init() binds this
 * table to the group cdev with cdev_init().  It is also the f_op value
 * vfio_group_get_external_user() uses to recognise a group file.
 */
1577 static const struct file_operations vfio_group_fops = {
1578 .owner = THIS_MODULE,
1579 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
/* The compat ioctl entry is only present when CONFIG_COMPAT is defined. */
1580 #ifdef CONFIG_COMPAT
1581 .compat_ioctl = vfio_group_fops_compat_ioctl,
1583 .open = vfio_group_fops_open,
1584 .release = vfio_group_fops_release,
/*
 * release handler for a device fd: mirrors the setup done in
 * vfio_group_get_device_fd().
 */
1590 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1592 struct vfio_device *device = filep->private_data;
/* Let the bus driver close the device. */
1594 device->ops->release(device->device_data);
/* Drop the container user that was added when this fd was created. */
1596 vfio_group_try_dissolve_container(device->group);
1598 vfio_device_put(device);
/*
 * ioctl handler for a device fd: passes cmd and arg through to the bus
 * driver's ioctl callback.
 * NOTE(review): the statement taken when the callback is missing is not
 * visible in this listing.
 */
1603 static long vfio_device_fops_unl_ioctl(struct file *filep,
1604 unsigned int cmd, unsigned long arg)
1606 struct vfio_device *device = filep->private_data;
/* The ioctl callback is optional. */
1608 if (unlikely(!device->ops->ioctl))
1611 return device->ops->ioctl(device->device_data, cmd, arg);
/*
 * read handler for a device fd: passes the request through to the bus
 * driver's read callback and returns its result.
 * NOTE(review): the statement taken when the callback is missing is not
 * visible in this listing.
 */
1614 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1615 size_t count, loff_t *ppos)
1617 struct vfio_device *device = filep->private_data;
/* The read callback is optional. */
1619 if (unlikely(!device->ops->read))
1622 return device->ops->read(device->device_data, buf, count, ppos);
/*
 * write handler for a device fd: passes the request through to the bus
 * driver's write callback and returns its result.
 * NOTE(review): the statement taken when the callback is missing is not
 * visible in this listing.
 */
1625 static ssize_t vfio_device_fops_write(struct file *filep,
1626 const char __user *buf,
1627 size_t count, loff_t *ppos)
1629 struct vfio_device *device = filep->private_data;
/* The write callback is optional. */
1631 if (unlikely(!device->ops->write))
1634 return device->ops->write(device->device_data, buf, count, ppos);
/*
 * mmap handler for a device fd: passes the vma through to the bus
 * driver's mmap callback and returns its result.
 * NOTE(review): the statement taken when the callback is missing is not
 * visible in this listing.
 */
1637 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1639 struct vfio_device *device = filep->private_data;
/* The mmap callback is optional. */
1641 if (unlikely(!device->ops->mmap))
1644 return device->ops->mmap(device->device_data, vma);
1647 #ifdef CONFIG_COMPAT
/*
 * compat ioctl entry point for a device fd: convert the compat pointer
 * argument with compat_ptr() and hand off to the native handler.
 */
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_device_fops_unl_ioctl(filep, cmd, native_arg);
}
1654 #endif /* CONFIG_COMPAT */
/*
 * File operations for device fds.  vfio_group_get_device_fd() creates
 * those files with anon_inode_getfile() using this table, so there is no
 * .open entry here.
 */
1656 static const struct file_operations vfio_device_fops = {
1657 .owner = THIS_MODULE,
1658 .release = vfio_device_fops_release,
1659 .read = vfio_device_fops_read,
1660 .write = vfio_device_fops_write,
1661 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
/* The compat ioctl entry is only present when CONFIG_COMPAT is defined. */
1662 #ifdef CONFIG_COMPAT
1663 .compat_ioctl = vfio_device_fops_compat_ioctl,
1665 .mmap = vfio_device_fops_mmap,
1669 * External user API, exported by symbols to be linked dynamically.
1671 * The protocol includes:
1672 * 1. do normal VFIO init operation:
1673 * - opening a new container;
1674 * - attaching group(s) to it;
1675 * - setting an IOMMU driver for a container.
1676 * When IOMMU is set for a container, all groups in it are
1677 * considered ready to use by an external user.
1679 * 2. User space passes a group fd to an external user.
1680 * The external user calls vfio_group_get_external_user() to verify that:
1682 * - the group is initialized;
1683 * - IOMMU is set for it.
1684 * If both checks passed, vfio_group_get_external_user()
1685 * increments the container user counter to prevent
1686 * the VFIO group from disposal before KVM exits.
1688 * 3. The external user calls vfio_external_user_iommu_id()
1689 * to know an IOMMU ID.
1691 * 4. When the external KVM finishes, it calls
1692 * vfio_group_put_external_user() to release the VFIO group.
1693 * This call decrements the container user counter.
/*
 * Validate a group file handed in by an external user and pin the group.
 * On success a container user and a group reference are taken.
 * Each visible failure path returns an ERR_PTR() and undoes any
 * container user it had already taken.
 * NOTE(review): the final return statement is not visible in this listing.
 */
1695 struct vfio_group *vfio_group_get_external_user(struct file *filep)
/* Only interpreted as a vfio_group once the f_op check below has passed. */
1697 struct vfio_group *group = filep->private_data;
1699 if (filep->f_op != &vfio_group_fops)
1700 return ERR_PTR(-EINVAL);
/* Becomes an additional container user only if the count is already non-zero. */
1702 if (!atomic_inc_not_zero(&group->container_users))
1703 return ERR_PTR(-EINVAL);
/* noiommu groups are refused for external users. */
1705 if (group->noiommu) {
1706 atomic_dec(&group->container_users);
1707 return ERR_PTR(-EPERM);
/* The container must have an IOMMU driver set and the group must be viable. */
1710 if (!group->container->iommu_driver ||
1711 !vfio_group_viable(group)) {
1712 atomic_dec(&group->container_users);
1713 return ERR_PTR(-EINVAL);
/* Group reference for the external user. */
1716 vfio_group_get(group);
1720 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
/*
 * Release a group obtained from vfio_group_get_external_user().
 *
 * That function takes a container user and then a group reference; they
 * are dropped here in the reverse order.  The container user must go
 * first because vfio_group_try_dissolve_container() dereferences @group,
 * and the reference dropped by vfio_group_put() may be the last thing
 * keeping @group valid.  @group must not be touched after this returns.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
1727 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1729 int vfio_external_user_iommu_id(struct vfio_group *group)
1731 return iommu_group_id(group->iommu_group);
1733 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1735 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1737 return vfio_ioctl_check_extension(group->container, arg);
1739 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1742 * Module/class support
1744 static char *vfio_devnode(struct device *dev, umode_t *mode)
1746 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
/*
 * Misc device registered by vfio_init() and removed by vfio_cleanup().
 * NOTE(review): some initializer lines are not visible in this listing.
 */
1749 static struct miscdevice vfio_dev = {
1750 .minor = VFIO_MINOR,
/* Device node name, inside a vfio/ subdirectory. */
1753 .nodename = "vfio/vfio",
/* Read and write permission for user, group and others. */
1754 .mode = S_IRUGO | S_IWUGO,
/*
 * Module init: set up the global vfio state, register the misc device,
 * create the "vfio" class, reserve a char device region for the group
 * devices and add the group cdev.  The last three visible calls are the
 * unwind steps, in reverse order of setup.
 * NOTE(review): the error checks, goto targets/labels and returns are
 * mostly not visible in this listing.
 */
1757 static int __init vfio_init(void)
/* Initialise the fields of the global vfio state used by the rest of the file. */
1761 idr_init(&vfio.group_idr);
1762 mutex_init(&vfio.group_lock);
1763 mutex_init(&vfio.iommu_drivers_lock);
1764 INIT_LIST_HEAD(&vfio.group_list);
1765 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
1766 init_waitqueue_head(&vfio.release_q);
1768 ret = misc_register(&vfio_dev);
1770 pr_err("vfio: misc device register failed\n");
1774 /* /dev/vfio/$GROUP */
1775 vfio.class = class_create(THIS_MODULE, "vfio");
1776 if (IS_ERR(vfio.class)) {
1777 ret = PTR_ERR(vfio.class);
/* Node names for devices of this class come from vfio_devnode(). */
1781 vfio.class->devnode = vfio_devnode;
/*
 * NOTE(review): a count of MINORMASK starting at minor 0 covers minors
 * 0..MINORMASK-1, one short of the full minor space -- confirm whether
 * MINORMASK + 1 was intended.  The same count is passed to cdev_add() and
 * to both unregister_chrdev_region() calls, so all of them must change
 * together.
 */
1783 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
1785 goto err_alloc_chrdev;
/* Group character devices are served by vfio_group_fops. */
1787 cdev_init(&vfio.group_cdev, &vfio_group_fops);
1788 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
1792 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1795 * Attempt to load known iommu-drivers. This gives us a working
1796 * environment without the user needing to explicitly load iommu
/* The _nowait variants are used and their results are ignored. */
1799 request_module_nowait("vfio_iommu_type1");
1800 request_module_nowait("vfio_iommu_spapr_tce");
/* Unwind steps, undoing the setup above in reverse order. */
1805 unregister_chrdev_region(vfio.group_devt, MINORMASK);
1807 class_destroy(vfio.class);
1810 misc_deregister(&vfio_dev);
/*
 * Module exit: tear down what vfio_init() set up (idr, group cdev, char
 * device region, class, misc device).
 * NOTE(review): at least one line between class_destroy() and
 * misc_deregister() is not visible in this listing.
 */
1814 static void __exit vfio_cleanup(void)
/* Warn if any groups are still on the list at unload time. */
1816 WARN_ON(!list_empty(&vfio.group_list));
1818 idr_destroy(&vfio.group_idr);
1819 cdev_del(&vfio.group_cdev);
/* The count must match the one passed to alloc_chrdev_region() in vfio_init(). */
1820 unregister_chrdev_region(vfio.group_devt, MINORMASK);
1821 class_destroy(vfio.class);
1823 misc_deregister(&vfio_dev);
/* Module entry and exit points. */
1826 module_init(vfio_init);
1827 module_exit(vfio_cleanup);
/* Module metadata, built from the DRIVER_* macros defined at the top of the file. */
1829 MODULE_VERSION(DRIVER_VERSION);
1830 MODULE_LICENSE("GPL v2");
1831 MODULE_AUTHOR(DRIVER_AUTHOR);
1832 MODULE_DESCRIPTION(DRIVER_DESC);
/* Aliases for the misc device minor and for the vfio/vfio device node name. */
1833 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
1834 MODULE_ALIAS("devname:vfio/vfio");