/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <asm/iommu.h>
#include <asm/tce.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
                struct iommu_group *iommu_group);

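/*
 * Account @npages (system pages) against the task's RLIMIT_MEMLOCK.
 * As an illustrative example (numbers are not from this code): with
 * "ulimit -l 64" (64KB) and 4K system pages the limit is 16 pages, so
 * locking a larger window fails with -ENOMEM unless the task has
 * CAP_IPC_LOCK.
 */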
static long try_increment_locked_vm(long npages)
{
        long ret = 0, locked, lock_limit;

        if (!current || !current->mm)
                return -ESRCH; /* process exited */

        if (!npages)
                return 0;

        down_write(&current->mm->mmap_sem);
        locked = current->mm->locked_vm + npages;
        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                ret = -ENOMEM;
        else
                current->mm->locked_vm += npages;

        pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
                        npages << PAGE_SHIFT,
                        current->mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK),
                        ret ? " - exceeded" : "");

        up_write(&current->mm->mmap_sem);

        return ret;
}

static void decrement_locked_vm(long npages)
{
        if (!current || !current->mm || !npages)
                return; /* process exited */

        down_write(&current->mm->mmap_sem);
        if (WARN_ON_ONCE(npages > current->mm->locked_vm))
                npages = current->mm->locked_vm;
        current->mm->locked_vm -= npages;
        pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
                        npages << PAGE_SHIFT,
                        current->mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK));
        up_write(&current->mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
        struct mutex lock;
        struct iommu_group *grp;
        bool enabled;
        unsigned long locked_pages;
};

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
        /*
         * Check that the TCE table granularity is not bigger than the size of
         * the page we just found. Otherwise the hardware could get access to
         * a bigger memory chunk than it should.
         */
        return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}
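
/*
 * Example (illustrative numbers): with 4K system pages (PAGE_SHIFT == 12),
 * a 16MB huge page has compound_order() == 12, so 12 + 12 = 24 and the
 * check above passes for a 64K TCE granularity (page_shift == 16), while
 * a plain 4K page (order 0) would fail it.
 */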

static long tce_iommu_find_table(struct tce_container *container,
                phys_addr_t ioba, struct iommu_table **ptbl)
{
        long i;
        struct iommu_table_group *table_group;

        table_group = iommu_group_get_iommudata(container->grp);
        if (!table_group)
                return -1;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = table_group->tables[i];

                if (tbl) {
                        unsigned long entry = ioba >> tbl->it_page_shift;
                        unsigned long start = tbl->it_offset;
                        unsigned long end = start + tbl->it_size;

                        if ((start <= entry) && (entry < end)) {
                                *ptbl = tbl;
                                return i;
                        }
                }
        }

        return -1;
}
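
/*
 * Example (illustrative numbers): with it_page_shift == 12 (4K TCE pages),
 * it_offset == 0 and it_size == 0x80000, the table above covers IOBAs
 * 0 .. 2GB, and ioba 0x30002000 translates to entry 0x30002 in that table.
 */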

static int tce_iommu_enable(struct tce_container *container)
{
        int ret = 0;
        unsigned long locked;
        struct iommu_table *tbl;
        struct iommu_table_group *table_group;

        if (!container->grp)
                return -ENXIO;

        if (!current->mm)
                return -ESRCH; /* process exited */

        if (container->enabled)
                return -EBUSY;

        /*
         * When userspace pages are mapped into the IOMMU, they are effectively
         * locked memory, so, theoretically, we need to update the accounting
         * of locked pages on each map and unmap.  For powerpc, the map/unmap
         * paths can be very hot, though, and the accounting would kill
         * performance, especially since it would be difficult, if not
         * impossible, to handle the accounting in real mode only.
         *
         * To address that, rather than precisely accounting every page, we
         * instead account for a worst case on locked memory when the iommu is
         * enabled and disabled.  The worst case upper bound on locked memory
         * is the size of the whole iommu window, which is usually relatively
         * small (compared to total memory sizes) on POWER hardware.
         *
         * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
         * that would effectively kill the guest at random points, so it is
         * much better to enforce the limit based on the maximum that the
         * guest can map.
         *
         * Unfortunately, at the moment it counts whole tables, no matter how
         * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
         * each with a 2GB DMA window, 8GB will be counted here. The reason
         * for this is that we cannot tell here the amount of RAM used by the
         * guest as this information is only available from KVM and VFIO is
         * KVM agnostic.
         */
        table_group = iommu_group_get_iommudata(container->grp);
        if (!table_group)
                return -ENODEV;

        tbl = table_group->tables[0];
        locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
        ret = try_increment_locked_vm(locked);
        if (ret)
                return ret;

        container->locked_pages = locked;

        container->enabled = true;

        return ret;
}
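
/*
 * Example of the worst-case accounting above (illustrative numbers):
 * a 2GB DMA window (it_size == 0x80000 entries of 4K each) on a kernel
 * with 64K system pages locks 2GB >> 16 == 32768 pages, regardless of
 * how much of the window the guest actually maps.
 */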

static void tce_iommu_disable(struct tce_container *container)
{
        if (!container->enabled)
                return;

        container->enabled = false;

        if (!current->mm)
                return;

        decrement_locked_vm(container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
        struct tce_container *container;

        if (arg != VFIO_SPAPR_TCE_IOMMU) {
                pr_err("tce_vfio: Wrong IOMMU type\n");
                return ERR_PTR(-EINVAL);
        }

        container = kzalloc(sizeof(*container), GFP_KERNEL);
        if (!container)
                return ERR_PTR(-ENOMEM);

        mutex_init(&container->lock);

        return container;
}

static void tce_iommu_release(void *iommu_data)
{
        struct tce_container *container = iommu_data;

        WARN_ON(container->grp);

        if (container->grp)
                tce_iommu_detach_group(iommu_data, container->grp);

        tce_iommu_disable(container);
        mutex_destroy(&container->lock);

        kfree(container);
}

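/*
 * Drop the reference on the system page backing a TCE, taken at map
 * time; mark the page dirty first if the device was allowed to write
 * to it.
 */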
static void tce_iommu_unuse_page(struct tce_container *container,
                unsigned long oldtce)
{
        struct page *page;

        if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
                return;

        page = pfn_to_page(oldtce >> PAGE_SHIFT);

        if (oldtce & TCE_PCI_WRITE)
                SetPageDirty(page);

        put_page(page);
}

static int tce_iommu_clear(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long pages)
{
        unsigned long oldtce;

        for ( ; pages; --pages, ++entry) {
                oldtce = iommu_clear_tce(tbl, entry);
                if (!oldtce)
                        continue;

                tce_iommu_unuse_page(container, oldtce);
        }

        return 0;
}

static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
        struct page *page = NULL;
        enum dma_data_direction direction = iommu_tce_direction(tce);

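        /*
         * Pin exactly one system page; request write access unless the
         * mapping is device-read-only (DMA_TO_DEVICE), as the device
         * must not be able to dirty a page the user cannot write.
         */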
        if (get_user_pages_fast(tce & PAGE_MASK, 1,
                        direction != DMA_TO_DEVICE, &page) != 1)
                return -EFAULT;

        *hpa = __pa((unsigned long) page_address(page));

        return 0;
}

static long tce_iommu_build(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long tce, unsigned long pages)
{
        long i, ret = 0;
        struct page *page;
        unsigned long hpa;
        enum dma_data_direction direction = iommu_tce_direction(tce);

        for (i = 0; i < pages; ++i) {
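                /*
                 * If the TCE page is smaller than the system page (e.g.
                 * 4K TCEs with a 64K PAGE_SIZE), keep the offset of the
                 * TCE chunk within the system page so it can be OR'ed
                 * back into the page-aligned host physical address below.
                 */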
                unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

                ret = tce_iommu_use_page(tce, &hpa);
                if (ret)
                        break;

                page = pfn_to_page(hpa >> PAGE_SHIFT);
                if (!tce_page_is_contained(page, tbl->it_page_shift)) {
                        ret = -EPERM;
                        break;
                }

                hpa |= offset;
                ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa),
                                direction);
                if (ret) {
                        tce_iommu_unuse_page(container, hpa);
                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
                                        __func__, entry << tbl->it_page_shift,
                                        tce, ret);
                        break;
                }
                tce += IOMMU_PAGE_SIZE(tbl);
        }

        if (ret)
                tce_iommu_clear(container, tbl, entry, i);

        return ret;
}

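/*
 * A rough sketch of the expected userspace sequence for this driver
 * (cf. Documentation/vfio.txt); the group number, the missing error
 * handling and the MAP_DMA parameters are purely illustrative:
 *
 *      container = open("/dev/vfio/vfio", O_RDWR);
 *      group = open("/dev/vfio/26", O_RDWR);
 *      ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *      ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *      ioctl(container, VFIO_IOMMU_ENABLE);
 *      ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *
 *      map.argsz = sizeof(map);
 *      map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 *      map.vaddr = (__u64)(uintptr_t)buffer;
 *      map.iova = info.dma32_window_start;
 *      map.size = 1024 * 1024;
 *      ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */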
static long tce_iommu_ioctl(void *iommu_data,
                                 unsigned int cmd, unsigned long arg)
{
        struct tce_container *container = iommu_data;
        unsigned long minsz;
        long ret;

        switch (cmd) {
        case VFIO_CHECK_EXTENSION:
                switch (arg) {
                case VFIO_SPAPR_TCE_IOMMU:
                        ret = 1;
                        break;
                default:
                        ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
                        break;
                }

                return (ret < 0) ? 0 : ret;

        case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
                struct vfio_iommu_spapr_tce_info info;
                struct iommu_table *tbl;
                struct iommu_table_group *table_group;

                if (WARN_ON(!container->grp))
                        return -ENXIO;

                table_group = iommu_group_get_iommudata(container->grp);

                tbl = table_group->tables[0];
                if (WARN_ON_ONCE(!tbl))
                        return -ENXIO;

                minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
                                dma32_window_size);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
                info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
                info.flags = 0;

                if (copy_to_user((void __user *)arg, &info, minsz))
                        return -EFAULT;

                return 0;
        }
        case VFIO_IOMMU_MAP_DMA: {
                struct vfio_iommu_type1_dma_map param;
                struct iommu_table *tbl = NULL;
                unsigned long tce;
                long num;

                if (!container->enabled)
                        return -EPERM;

                minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
                                VFIO_DMA_MAP_FLAG_WRITE))
                        return -EINVAL;

                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;

                if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
                                (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
                        return -EINVAL;

                /* iova is checked by the IOMMU API */
                tce = param.vaddr;
                if (param.flags & VFIO_DMA_MAP_FLAG_READ)
                        tce |= TCE_PCI_READ;
                if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
                        tce |= TCE_PCI_WRITE;

                ret = iommu_tce_put_param_check(tbl, param.iova, tce);
                if (ret)
                        return ret;

                ret = tce_iommu_build(container, tbl,
                                param.iova >> tbl->it_page_shift,
                                tce, param.size >> tbl->it_page_shift);

                iommu_flush_tce(tbl);

                return ret;
        }
        case VFIO_IOMMU_UNMAP_DMA: {
                struct vfio_iommu_type1_dma_unmap param;
                struct iommu_table *tbl = NULL;
                long num;

                if (!container->enabled)
                        return -EPERM;

                minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
                                size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                /* No flags are supported for now */
                if (param.flags)
                        return -EINVAL;

                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;

                if (param.size & ~IOMMU_PAGE_MASK(tbl))
                        return -EINVAL;

                ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
                                param.size >> tbl->it_page_shift);
                if (ret)
                        return ret;

                ret = tce_iommu_clear(container, tbl,
                                param.iova >> tbl->it_page_shift,
                                param.size >> tbl->it_page_shift);
                iommu_flush_tce(tbl);

                return ret;
        }
        case VFIO_IOMMU_ENABLE:
                mutex_lock(&container->lock);
                ret = tce_iommu_enable(container);
                mutex_unlock(&container->lock);
                return ret;

        case VFIO_IOMMU_DISABLE:
                mutex_lock(&container->lock);
                tce_iommu_disable(container);
                mutex_unlock(&container->lock);
                return 0;

        case VFIO_EEH_PE_OP:
                if (!container->grp)
                        return -ENODEV;

                return vfio_spapr_iommu_eeh_ioctl(container->grp,
                                                  cmd, arg);
        }

        return -ENOTTY;
}

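/*
 * Attaching a group gives this container exclusive use of the group's
 * DMA window: iommu_take_ownership() takes the table away from the
 * kernel's own use, and only one group may be attached at a time (a
 * limitation of the SPAPR TCE IOMMU API at this point).
 */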
static int tce_iommu_attach_group(void *iommu_data,
                struct iommu_group *iommu_group)
{
        int ret;
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;

        mutex_lock(&container->lock);

        /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
                        iommu_group_id(iommu_group), iommu_group); */
        if (container->grp) {
                pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
                                iommu_group_id(container->grp),
                                iommu_group_id(iommu_group));
                ret = -EBUSY;
                goto unlock_exit;
        }

        if (container->enabled) {
                pr_err("tce_vfio: attaching group #%u to enabled container\n",
                                iommu_group_id(iommu_group));
                ret = -EBUSY;
                goto unlock_exit;
        }

        table_group = iommu_group_get_iommudata(iommu_group);
        if (!table_group) {
                ret = -ENXIO;
                goto unlock_exit;
        }

        ret = iommu_take_ownership(table_group->tables[0]);
        if (!ret)
                container->grp = iommu_group;

unlock_exit:
        mutex_unlock(&container->lock);

        return ret;
}

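/*
 * Detaching clears every TCE in the table (dropping the page references
 * taken at map time) before handing ownership of the table back to the
 * platform code.
 */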
static void tce_iommu_detach_group(void *iommu_data,
                struct iommu_group *iommu_group)
{
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;
        struct iommu_table *tbl;

        mutex_lock(&container->lock);
        if (iommu_group != container->grp) {
                pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
                                iommu_group_id(iommu_group),
                                iommu_group_id(container->grp));
                goto unlock_exit;
        }

        if (container->enabled) {
                pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
                                iommu_group_id(container->grp));
                tce_iommu_disable(container);
        }

        /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
           iommu_group_id(iommu_group), iommu_group); */
        container->grp = NULL;

        table_group = iommu_group_get_iommudata(iommu_group);
        BUG_ON(!table_group);

        tbl = table_group->tables[0];
        tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
        iommu_release_ownership(tbl);

unlock_exit:
        mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
        .name           = "iommu-vfio-powerpc",
        .owner          = THIS_MODULE,
        .open           = tce_iommu_open,
        .release        = tce_iommu_release,
        .ioctl          = tce_iommu_ioctl,
        .attach_group   = tce_iommu_attach_group,
        .detach_group   = tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
        return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
        vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);