iommu/amd: Implement IO page-fault handler
drivers/iommu/amd_iommu_v2.c
/*
 * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <joerg.roedel@amd.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/amd-iommu.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/iommu.h>
#include <linux/wait.h>
#include <linux/pci.h>
#include <linux/gfp.h>

#include "amd_iommu_types.h"
#include "amd_iommu_proto.h"

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Joerg Roedel <joerg.roedel@amd.com>");

#define MAX_DEVICES             0x10000
#define PRI_QUEUE_SIZE          512

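/*
 * Per-PRI-tag bookkeeping: how many page faults are still in flight for
 * the tag, whether the device expects a PPR completion once they finish,
 * and the status code that completion should carry.
 */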
struct pri_queue {
        atomic_t inflight;
        bool finish;
        int status;
};

struct pasid_state {
        struct list_head list;                  /* For global state-list */
        atomic_t count;                         /* Reference count */
        struct task_struct *task;               /* Task bound to this PASID */
        struct mm_struct *mm;                   /* mm_struct for the faults */
        struct pri_queue pri[PRI_QUEUE_SIZE];   /* PRI tag states */
        struct device_state *device_state;      /* Link to our device_state */
        int pasid;                              /* PASID index */
        spinlock_t lock;                        /* Protect pri_queues */
        wait_queue_head_t wq;                   /* To wait for count == 0 */
};

struct device_state {
        atomic_t count;
        struct pci_dev *pdev;
        struct pasid_state **states;
        struct iommu_domain *domain;
        int pasid_levels;
        int max_pasids;
        spinlock_t lock;
        wait_queue_head_t wq;
};

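/*
 * One work item per page fault taken from the PPR log; queued to the
 * iommu_wq workqueue and handled in process context by do_fault().
 */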
struct fault {
        struct work_struct work;
        struct device_state *dev_state;
        struct pasid_state *state;
        struct mm_struct *mm;
        u64 address;
        u16 devid;
        u16 pasid;
        u16 tag;
        u16 finish;
        u16 flags;
};

static struct device_state **state_table;
static spinlock_t state_lock;

/* List and lock for all pasid_states */
static LIST_HEAD(pasid_state_list);
static DEFINE_SPINLOCK(ps_lock);

static struct workqueue_struct *iommu_wq;

static void free_pasid_states(struct device_state *dev_state);
static void unbind_pasid(struct device_state *dev_state, int pasid);

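/* Build the 16-bit PCI requester ID: bus number in the high byte, devfn in the low byte. */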
static u16 device_id(struct pci_dev *pdev)
{
        u16 devid;

        devid = pdev->bus->number;
        devid = (devid << 8) | pdev->devfn;

        return devid;
}

static struct device_state *get_device_state(u16 devid)
{
        struct device_state *dev_state;
        unsigned long flags;

        spin_lock_irqsave(&state_lock, flags);
        dev_state = state_table[devid];
        if (dev_state != NULL)
                atomic_inc(&dev_state->count);
        spin_unlock_irqrestore(&state_lock, flags);

        return dev_state;
}

static void free_device_state(struct device_state *dev_state)
{
        /*
         * First detach device from domain - No more PRI requests will arrive
         * from that device after it is unbound from the IOMMUv2 domain.
         */
        iommu_detach_device(dev_state->domain, &dev_state->pdev->dev);

        /* Everything is down now, free the IOMMUv2 domain */
        iommu_domain_free(dev_state->domain);

        /* Finally get rid of the device-state */
        kfree(dev_state);
}

static void put_device_state(struct device_state *dev_state)
{
        if (atomic_dec_and_test(&dev_state->count))
                wake_up(&dev_state->wq);
}

static void put_device_state_wait(struct device_state *dev_state)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&dev_state->wq, &wait, TASK_UNINTERRUPTIBLE);
        if (!atomic_dec_and_test(&dev_state->count))
                schedule();
        finish_wait(&dev_state->wq, &wait);

        free_device_state(dev_state);
}

static void link_pasid_state(struct pasid_state *pasid_state)
{
        spin_lock(&ps_lock);
        list_add_tail(&pasid_state->list, &pasid_state_list);
        spin_unlock(&ps_lock);
}

static void __unlink_pasid_state(struct pasid_state *pasid_state)
{
        list_del(&pasid_state->list);
}

static void unlink_pasid_state(struct pasid_state *pasid_state)
{
        spin_lock(&ps_lock);
        __unlink_pasid_state(pasid_state);
        spin_unlock(&ps_lock);
}

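/*
 * The per-device PASID table is a small radix tree: each level is one
 * zeroed page holding 512 pointers, indexed by a 9-bit slice of the
 * PASID. With @alloc set, missing intermediate pages are allocated on
 * the way down (GFP_ATOMIC, since callers hold dev_state->lock).
 */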
/* Must be called under dev_state->lock */
static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
                                                  int pasid, bool alloc)
{
        struct pasid_state **root, **ptr;
        int level, index;

        level = dev_state->pasid_levels;
        root  = dev_state->states;

        while (true) {

                index = (pasid >> (9 * level)) & 0x1ff;
                ptr   = &root[index];

                if (level == 0)
                        break;

                if (*ptr == NULL) {
                        if (!alloc)
                                return NULL;

                        *ptr = (void *)get_zeroed_page(GFP_ATOMIC);
                        if (*ptr == NULL)
                                return NULL;
                }

                root   = (struct pasid_state **)*ptr;
                level -= 1;
        }

        return ptr;
}

static int set_pasid_state(struct device_state *dev_state,
                           struct pasid_state *pasid_state,
                           int pasid)
{
        struct pasid_state **ptr;
        unsigned long flags;
        int ret;

        spin_lock_irqsave(&dev_state->lock, flags);
        ptr = __get_pasid_state_ptr(dev_state, pasid, true);

        ret = -ENOMEM;
        if (ptr == NULL)
                goto out_unlock;

        ret = -EBUSY;
        if (*ptr != NULL)
                goto out_unlock;

        *ptr = pasid_state;

        ret = 0;

out_unlock:
        spin_unlock_irqrestore(&dev_state->lock, flags);

        return ret;
}

static void clear_pasid_state(struct device_state *dev_state, int pasid)
{
        struct pasid_state **ptr;
        unsigned long flags;

        spin_lock_irqsave(&dev_state->lock, flags);
        ptr = __get_pasid_state_ptr(dev_state, pasid, true);

        if (ptr == NULL)
                goto out_unlock;

        *ptr = NULL;

out_unlock:
        spin_unlock_irqrestore(&dev_state->lock, flags);
}

static struct pasid_state *get_pasid_state(struct device_state *dev_state,
                                           int pasid)
{
        struct pasid_state **ptr, *ret = NULL;
        unsigned long flags;

        spin_lock_irqsave(&dev_state->lock, flags);
        ptr = __get_pasid_state_ptr(dev_state, pasid, false);

        if (ptr == NULL)
                goto out_unlock;

        ret = *ptr;
        if (ret)
                atomic_inc(&ret->count);

out_unlock:
        spin_unlock_irqrestore(&dev_state->lock, flags);

        return ret;
}

static void free_pasid_state(struct pasid_state *pasid_state)
{
        kfree(pasid_state);
}

static void put_pasid_state(struct pasid_state *pasid_state)
{
        if (atomic_dec_and_test(&pasid_state->count)) {
                put_device_state(pasid_state->device_state);
                wake_up(&pasid_state->wq);
        }
}

static void put_pasid_state_wait(struct pasid_state *pasid_state)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&pasid_state->wq, &wait, TASK_UNINTERRUPTIBLE);

        if (atomic_dec_and_test(&pasid_state->count))
                put_device_state(pasid_state->device_state);
        else
                schedule();

        finish_wait(&pasid_state->wq, &wait);
        mmput(pasid_state->mm);
        free_pasid_state(pasid_state);
}

static void unbind_pasid(struct device_state *dev_state, int pasid)
{
        struct pasid_state *pasid_state;

        pasid_state = get_pasid_state(dev_state, pasid);
        if (pasid_state == NULL)
                return;

        unlink_pasid_state(pasid_state);

        amd_iommu_domain_clear_gcr3(dev_state->domain, pasid);
        clear_pasid_state(dev_state, pasid);

        put_pasid_state(pasid_state); /* Reference taken in this function */
        put_pasid_state_wait(pasid_state); /* Reference from bind() function */
}

static void free_pasid_states_level1(struct pasid_state **tbl)
{
        int i;

        for (i = 0; i < 512; ++i) {
                if (tbl[i] == NULL)
                        continue;

                free_page((unsigned long)tbl[i]);
        }
}

static void free_pasid_states_level2(struct pasid_state **tbl)
{
        struct pasid_state **ptr;
        int i;

        for (i = 0; i < 512; ++i) {
                if (tbl[i] == NULL)
                        continue;

                ptr = (struct pasid_state **)tbl[i];
                free_pasid_states_level1(ptr);
        }
}

static void free_pasid_states(struct device_state *dev_state)
{
        struct pasid_state *pasid_state;
        int i;

        for (i = 0; i < dev_state->max_pasids; ++i) {
                pasid_state = get_pasid_state(dev_state, i);
                if (pasid_state == NULL)
                        continue;

                put_pasid_state(pasid_state);
                unbind_pasid(dev_state, i);
        }

        if (dev_state->pasid_levels == 2)
                free_pasid_states_level2(dev_state->states);
        else if (dev_state->pasid_levels == 1)
                free_pasid_states_level1(dev_state->states);
        else if (dev_state->pasid_levels != 0)
                BUG();

        free_page((unsigned long)dev_state->states);
}

static void set_pri_tag_status(struct pasid_state *pasid_state,
                               u16 tag, int status)
{
        unsigned long flags;

        spin_lock_irqsave(&pasid_state->lock, flags);
        pasid_state->pri[tag].status = status;
        spin_unlock_irqrestore(&pasid_state->lock, flags);
}

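/*
 * Drop one in-flight fault for @tag; when the last one completes and the
 * device asked for a response, send the PPR completion with the status
 * collected for this tag and reset the tag state.
 */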
static void finish_pri_tag(struct device_state *dev_state,
                           struct pasid_state *pasid_state,
                           u16 tag)
{
        unsigned long flags;

        spin_lock_irqsave(&pasid_state->lock, flags);
        if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
            pasid_state->pri[tag].finish) {
                amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
                                       pasid_state->pri[tag].status, tag);
                pasid_state->pri[tag].finish = false;
                pasid_state->pri[tag].status = PPR_SUCCESS;
        }
        spin_unlock_irqrestore(&pasid_state->lock, flags);
}

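/*
 * Workqueue handler: resolve the fault by faulting the page in through
 * get_user_pages() on the bound task's mm, then report the outcome for
 * this PRI tag and drop the references taken in ppr_notifier().
 */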
static void do_fault(struct work_struct *work)
{
        struct fault *fault = container_of(work, struct fault, work);
        int npages, write;
        struct page *page;

        write = !!(fault->flags & PPR_FAULT_WRITE);

        npages = get_user_pages(fault->state->task, fault->state->mm,
                                fault->address, 1, write, 0, &page, NULL);

        if (npages == 1)
                put_page(page);
        else
                set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);

        finish_pri_tag(fault->dev_state, fault->state, fault->tag);

        put_pasid_state(fault->state);

        kfree(fault);
}

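/*
 * Called from the PPR notifier chain for every peripheral page request.
 * The low 9 bits of the fault tag carry the PRI tag; bit 9 tells us
 * whether the device expects a completion for this tag. The actual fault
 * handling is deferred to the workqueue; this callback may run in atomic
 * context, hence the GFP_ATOMIC allocation below.
 */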
static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
{
        struct amd_iommu_fault *iommu_fault;
        struct pasid_state *pasid_state;
        struct device_state *dev_state;
        unsigned long flags;
        struct fault *fault;
        bool finish;
        u16 tag;
        int ret;

        iommu_fault = data;
        tag         = iommu_fault->tag & 0x1ff;
        finish      = (iommu_fault->tag >> 9) & 1;

        ret = NOTIFY_DONE;
        dev_state = get_device_state(iommu_fault->device_id);
        if (dev_state == NULL)
                goto out;

        pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
        if (pasid_state == NULL) {
                /* We know the device but not the PASID -> send INVALID */
                amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
                                       PPR_INVALID, tag);
                goto out_drop_state;
        }

        spin_lock_irqsave(&pasid_state->lock, flags);
        atomic_inc(&pasid_state->pri[tag].inflight);
        if (finish)
                pasid_state->pri[tag].finish = true;
        spin_unlock_irqrestore(&pasid_state->lock, flags);

        fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
        if (fault == NULL) {
                /* We are OOM - send success and let the device re-fault */
                finish_pri_tag(dev_state, pasid_state, tag);
                goto out_drop_state;
        }

        fault->dev_state = dev_state;
        fault->address   = iommu_fault->address;
        fault->state     = pasid_state;
        fault->tag       = tag;
        fault->finish    = finish;
        fault->flags     = iommu_fault->flags;
        INIT_WORK(&fault->work, do_fault);

        queue_work(iommu_wq, &fault->work);

        ret = NOTIFY_OK;

out_drop_state:
        put_device_state(dev_state);

out:
        return ret;
}

static struct notifier_block ppr_nb = {
        .notifier_call = ppr_notifier,
};

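/*
 * Bind @pasid on @pdev to the address space of @task: allocate the
 * per-PASID state, take a reference on the task's mm, install the state
 * in the device's PASID table and program the mm's page-table root into
 * the GCR3 table so the IOMMU can resolve faults against it.
 */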
int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
                         struct task_struct *task)
{
        struct pasid_state *pasid_state;
        struct device_state *dev_state;
        u16 devid;
        int ret;

        might_sleep();

        if (!amd_iommu_v2_supported())
                return -ENODEV;

        devid     = device_id(pdev);
        dev_state = get_device_state(devid);

        if (dev_state == NULL)
                return -EINVAL;

        ret = -EINVAL;
        if (pasid < 0 || pasid >= dev_state->max_pasids)
                goto out;

        ret = -ENOMEM;
        pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
        if (pasid_state == NULL)
                goto out;

        atomic_set(&pasid_state->count, 1);
        init_waitqueue_head(&pasid_state->wq);
        spin_lock_init(&pasid_state->lock);
        pasid_state->task         = task;
        pasid_state->mm           = get_task_mm(task);
        pasid_state->device_state = dev_state;
        pasid_state->pasid        = pasid;

        if (pasid_state->mm == NULL)
                goto out_free;

        ret = set_pasid_state(dev_state, pasid_state, pasid);
        if (ret)
                goto out_free;

        ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
                                        __pa(pasid_state->mm->pgd));
        if (ret)
                goto out_clear_state;

        link_pasid_state(pasid_state);

        return 0;

out_clear_state:
        clear_pasid_state(dev_state, pasid);

out_free:
        free_pasid_state(pasid_state);

out:
        put_device_state(dev_state);

        return ret;
}
EXPORT_SYMBOL(amd_iommu_bind_pasid);

void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid)
{
        struct device_state *dev_state;
        u16 devid;

        might_sleep();

        if (!amd_iommu_v2_supported())
                return;

        devid = device_id(pdev);
        dev_state = get_device_state(devid);
        if (dev_state == NULL)
                return;

        if (pasid < 0 || pasid >= dev_state->max_pasids)
                goto out;

        unbind_pasid(dev_state, pasid);

out:
        put_device_state(dev_state);
}
EXPORT_SYMBOL(amd_iommu_unbind_pasid);

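/*
 * Prepare @pdev for PASID use: allocate its device_state, compute how many
 * 9-bit table levels the requested PASID count needs, set up a direct-mapped
 * IOMMUv2 domain with v2 paging enabled for @pasids PASIDs and attach the
 * device to it.
 */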
int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
{
        struct device_state *dev_state;
        unsigned long flags;
        int ret, tmp;
        u16 devid;

        might_sleep();

        if (!amd_iommu_v2_supported())
                return -ENODEV;

        if (pasids <= 0 || pasids > (PASID_MASK + 1))
                return -EINVAL;

        devid = device_id(pdev);

        dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
        if (dev_state == NULL)
                return -ENOMEM;

        spin_lock_init(&dev_state->lock);
        init_waitqueue_head(&dev_state->wq);
        dev_state->pdev = pdev;

        tmp = pasids;
        for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
                dev_state->pasid_levels += 1;

        atomic_set(&dev_state->count, 1);
        dev_state->max_pasids = pasids;

        ret = -ENOMEM;
        dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
        if (dev_state->states == NULL)
                goto out_free_dev_state;

        dev_state->domain = iommu_domain_alloc(&pci_bus_type);
        if (dev_state->domain == NULL)
                goto out_free_states;

        amd_iommu_domain_direct_map(dev_state->domain);

        ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
        if (ret)
                goto out_free_domain;

        ret = iommu_attach_device(dev_state->domain, &pdev->dev);
        if (ret != 0)
                goto out_free_domain;

        spin_lock_irqsave(&state_lock, flags);

        if (state_table[devid] != NULL) {
                spin_unlock_irqrestore(&state_lock, flags);
                ret = -EBUSY;
                goto out_free_domain;
        }

        state_table[devid] = dev_state;

        spin_unlock_irqrestore(&state_lock, flags);

        return 0;

out_free_domain:
        iommu_domain_free(dev_state->domain);

out_free_states:
        free_page((unsigned long)dev_state->states);

out_free_dev_state:
        kfree(dev_state);

        return ret;
}
EXPORT_SYMBOL(amd_iommu_init_device);

void amd_iommu_free_device(struct pci_dev *pdev)
{
        struct device_state *dev_state;
        unsigned long flags;
        u16 devid;

        if (!amd_iommu_v2_supported())
                return;

        devid = device_id(pdev);

        spin_lock_irqsave(&state_lock, flags);

        dev_state = state_table[devid];
        if (dev_state == NULL) {
                spin_unlock_irqrestore(&state_lock, flags);
                return;
        }

        state_table[devid] = NULL;

        spin_unlock_irqrestore(&state_lock, flags);

        /* Get rid of any remaining pasid states */
        free_pasid_states(dev_state);

        put_device_state_wait(dev_state);
}
EXPORT_SYMBOL(amd_iommu_free_device);

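/*
 * Illustrative call sequence for a client driver using the exported API
 * above. This is only a sketch: my_dev, MY_MAX_PASIDS and pasid are
 * hypothetical names, not part of this file.
 *
 *      ret = amd_iommu_init_device(my_dev->pdev, MY_MAX_PASIDS);
 *      if (ret)
 *              return ret;
 *
 *      ret = amd_iommu_bind_pasid(my_dev->pdev, pasid, current);
 *      if (ret) {
 *              amd_iommu_free_device(my_dev->pdev);
 *              return ret;
 *      }
 *
 *      ... device issues PASID-tagged DMA; its page faults are resolved
 *      through the PPR path (ppr_notifier -> do_fault) above ...
 *
 *      amd_iommu_unbind_pasid(my_dev->pdev, pasid);
 *      amd_iommu_free_device(my_dev->pdev);
 */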
static int __init amd_iommu_v2_init(void)
{
        size_t state_table_size;
        int ret;

        pr_info("AMD IOMMUv2 driver by Joerg Roedel <joerg.roedel@amd.com>\n");

        spin_lock_init(&state_lock);

        state_table_size = MAX_DEVICES * sizeof(struct device_state *);
        state_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                               get_order(state_table_size));
        if (state_table == NULL)
                return -ENOMEM;

        ret = -ENOMEM;
        iommu_wq = create_workqueue("amd_iommu_v2");
        if (iommu_wq == NULL)
                goto out_free;

        amd_iommu_register_ppr_notifier(&ppr_nb);

        return 0;

out_free:
        free_pages((unsigned long)state_table, get_order(state_table_size));

        return ret;
}

static void __exit amd_iommu_v2_exit(void)
{
        struct device_state *dev_state;
        size_t state_table_size;
        int i;

        amd_iommu_unregister_ppr_notifier(&ppr_nb);

        flush_workqueue(iommu_wq);

        /*
         * The loop below might call flush_workqueue(), so call
         * destroy_workqueue() after it
         */
        for (i = 0; i < MAX_DEVICES; ++i) {
                dev_state = get_device_state(i);

                if (dev_state == NULL)
                        continue;

                WARN_ON_ONCE(1);

                put_device_state(dev_state);
                amd_iommu_free_device(dev_state->pdev);
        }

        destroy_workqueue(iommu_wq);

        state_table_size = MAX_DEVICES * sizeof(struct device_state *);
        free_pages((unsigned long)state_table, get_order(state_table_size));
}

module_init(amd_iommu_v2_init);
module_exit(amd_iommu_v2_exit);