/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"
static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);
/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */
/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	struct iommu_table *tbl;
	bool enabled;
};
static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}
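
/*
 * Charges the worst-case locked memory (the size of the whole DMA window)
 * against RLIMIT_MEMLOCK and marks the container enabled. Returns 0 on
 * success or a negative errno.
 */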
static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked, lock_limit, npages;
	struct iommu_table *tbl = container->tbl;

	if (!container->tbl)
		return -ENXIO;

	if (!current->mm)
		return -ESRCH; /* process exited */

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap.  For powerpc, the map unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult to impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled.  The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
	 * that would effectively kill the guest at random points, much better
	 * enforcing the limit based on the max that the guest can map.
	 */
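	/*
	 * Example: a 2GB window of 4K TCEs has it_size = 512K entries;
	 * on a 64K-page kernel this charges (512K << 12) >> 16 = 32768
	 * system pages (i.e. the full 2GB) against RLIMIT_MEMLOCK.
	 */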
	down_write(&current->mm->mmap_sem);
	npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
				rlimit(RLIMIT_MEMLOCK));
		ret = -ENOMEM;
	} else {
		current->mm->locked_vm += npages;
		container->enabled = true;
	}
	up_write(&current->mm->mmap_sem);

	return ret;
}
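
/*
 * Undoes the worst-case locked_vm charge taken by tce_iommu_enable()
 * and marks the container disabled.
 */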
static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	if (!container->tbl || !current->mm)
		return;

	down_write(&current->mm->mmap_sem);
	current->mm->locked_vm -= (container->tbl->it_size <<
			IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
	up_write(&current->mm->mmap_sem);
}
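
/*
 * Allocates a container for a VFIO_SPAPR_TCE_IOMMU fd; any other
 * requested IOMMU type is rejected.
 */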
static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if (arg != VFIO_SPAPR_TCE_IOMMU) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);

	return container;
}
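
/*
 * Disables the container, detaches a still-attached group if any,
 * and frees the container itself.
 */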
static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;

	WARN_ON(container->tbl && !container->tbl->it_group);
	tce_iommu_disable(container);

	if (container->tbl && container->tbl->it_group)
		tce_iommu_detach_group(iommu_data, container->tbl->it_group);

	mutex_destroy(&container->lock);

	kfree(container);
}
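
/*
 * Clears "pages" TCE entries starting at "entry", marking the backing
 * pages dirty if the device could have written to them and dropping
 * the references taken at map time.
 */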
static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldtce;
	struct page *page;

	for ( ; pages; --pages, ++entry) {
		oldtce = iommu_clear_tce(tbl, entry);
		if (!oldtce)
			continue;

		page = pfn_to_page(oldtce >> PAGE_SHIFT);
		WARN_ON(!page);
		if (page) {
			if (oldtce & TCE_PCI_WRITE)
				SetPageDirty(page);
			put_page(page);
		}
	}

	return 0;
}
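
/*
 * Pins "pages" pages of user memory starting at "tce" and programs the
 * corresponding TCE entries. On failure, already-programmed entries are
 * torn down via tce_iommu_clear().
 */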
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages)
{
	long i, ret = 0;
	struct page *page = NULL;
	unsigned long hva;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
				direction != DMA_TO_DEVICE, &page);
		if (unlikely(ret != 1)) {
			ret = -EFAULT;
			break;
		}

		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hva = (unsigned long) page_address(page) + offset;

		ret = iommu_tce_build(tbl, entry + i, hva, direction);
		if (ret) {
			put_page(page);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}
		tce += IOMMU_PAGE_SIZE_4K;
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}
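
/*
 * Container ioctl handler. A typical userspace sequence is roughly:
 * VFIO_CHECK_EXTENSION, VFIO_IOMMU_SPAPR_TCE_GET_INFO to discover the
 * DMA32 window, VFIO_IOMMU_ENABLE to account the locked memory, then
 * VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA to manage mappings.
 */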
static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;

	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct iommu_table *tbl = container->tbl;

		if (WARN_ON(!tbl))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
		info.flags = 0;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = container->tbl;
		unsigned long tce;

		if (!tbl)
			return -ENXIO;

		BUG_ON(!tbl->it_group);

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
				(param.vaddr & ~IOMMU_PAGE_MASK_4K))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		tce = param.vaddr;
		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
			tce |= TCE_PCI_READ;
		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
			tce |= TCE_PCI_WRITE;

		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
		if (ret)
			return ret;

		ret = tce_iommu_build(container, tbl,
				param.iova >> IOMMU_PAGE_SHIFT_4K,
				tce, param.size >> IOMMU_PAGE_SHIFT_4K);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = container->tbl;

		if (WARN_ON(!tbl))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		if (param.size & ~IOMMU_PAGE_MASK_4K)
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> IOMMU_PAGE_SHIFT_4K);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> IOMMU_PAGE_SHIFT_4K,
				param.size >> IOMMU_PAGE_SHIFT_4K);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;
	case VFIO_EEH_PE_OP:
		if (!container->tbl || !container->tbl->it_group)
			return -ENODEV;

		return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
						  cmd, arg);
	}

	return -ENOTTY;
}
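
/*
 * Takes ownership of the group's IOMMU table and attaches it to the
 * container; only one group per container is supported, and attaching
 * to an already enabled container is refused.
 */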
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);

	BUG_ON(!tbl);
	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	if (container->tbl) {
		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
				iommu_group_id(container->tbl->it_group),
				iommu_group_id(iommu_group));
		ret = -EBUSY;
	} else if (container->enabled) {
		pr_err("tce_vfio: attaching group #%u to enabled container\n",
				iommu_group_id(iommu_group));
		ret = -EBUSY;
	} else {
		ret = iommu_take_ownership(tbl);
		if (!ret)
			container->tbl = tbl;
	}

	mutex_unlock(&container->lock);

	return ret;
}
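
/*
 * Detaches the group from the container: clears all TCE entries in the
 * window and returns ownership of the table to the platform.
 */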
static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);

	BUG_ON(!tbl);
	mutex_lock(&container->lock);
	if (tbl != container->tbl) {
		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
				iommu_group_id(iommu_group),
				iommu_group_id(tbl->it_group));
	} else {
		if (container->enabled) {
			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
					iommu_group_id(tbl->it_group));
			tce_iommu_disable(container);
		}

		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
				iommu_group_id(iommu_group), iommu_group); */
		container->tbl = NULL;
		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		iommu_release_ownership(tbl);
	}
	mutex_unlock(&container->lock);
}
const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};
static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);