[karo-tx-linux.git] drivers/pci/intel-iommu.c
intel-iommu: make init_dmars() static
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56
57 static void flush_unmaps_timeout(unsigned long data);
58
59 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
60
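/*
 * Deferred IOTLB flushing: in non-strict mode, unmapped IOVAs are queued in
 * these tables (one per IOMMU; see the g_num_of_iommus-sized allocation in
 * init_dmars()) and released later from the unmap_timer callback instead of
 * flushing the IOTLB on every unmap.
 */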
61 #define HIGH_WATER_MARK 250
62 struct deferred_flush_tables {
63         int next;
64         struct iova *iova[HIGH_WATER_MARK];
65         struct dmar_domain *domain[HIGH_WATER_MARK];
66 };
67
68 static struct deferred_flush_tables *deferred_flush;
69
70 /* bitmap for indexing intel_iommus */
71 static int g_num_of_iommus;
72
73 static DEFINE_SPINLOCK(async_umap_flush_lock);
74 static LIST_HEAD(unmaps_to_do);
75
76 static int timer_on;
77 static long list_size;
78
79 static void domain_remove_dev_info(struct dmar_domain *domain);
80
81 int dmar_disabled;
82 static int __initdata dmar_map_gfx = 1;
83 static int dmar_forcedac;
84 static int intel_iommu_strict;
85
86 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
87 static DEFINE_SPINLOCK(device_domain_lock);
88 static LIST_HEAD(device_domain_list);
89
90 static int __init intel_iommu_setup(char *str)
91 {
92         if (!str)
93                 return -EINVAL;
94         while (*str) {
95                 if (!strncmp(str, "off", 3)) {
96                         dmar_disabled = 1;
97                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
98                 } else if (!strncmp(str, "igfx_off", 8)) {
99                         dmar_map_gfx = 0;
100                         printk(KERN_INFO
101                                 "Intel-IOMMU: disable GFX device mapping\n");
102                 } else if (!strncmp(str, "forcedac", 8)) {
103                         printk(KERN_INFO
104                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
105                         dmar_forcedac = 1;
106                 } else if (!strncmp(str, "strict", 6)) {
107                         printk(KERN_INFO
108                                 "Intel-IOMMU: disable batched IOTLB flush\n");
109                         intel_iommu_strict = 1;
110                 }
111
112                 str += strcspn(str, ",");
113                 while (*str == ',')
114                         str++;
115         }
116         return 0;
117 }
118 __setup("intel_iommu=", intel_iommu_setup);
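/*
 * Example kernel command lines handled by intel_iommu_setup() above:
 *   intel_iommu=off                disable DMA remapping completely
 *   intel_iommu=igfx_off,strict    leave graphics devices unmapped and
 *                                  flush the IOTLB on every unmap
 */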
119
120 static struct kmem_cache *iommu_domain_cache;
121 static struct kmem_cache *iommu_devinfo_cache;
122 static struct kmem_cache *iommu_iova_cache;
123
124 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
125 {
126         unsigned int flags;
127         void *vaddr;
128
129         /* trying to avoid low memory issues */
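        /*
         * 'flags' holds only the PF_MEMALLOC bit of current->flags, so the
         * restore below either clears PF_MEMALLOC again (if it was clear on
         * entry) or leaves current->flags untouched (if the task already had
         * PF_MEMALLOC set).
         */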
130         flags = current->flags & PF_MEMALLOC;
131         current->flags |= PF_MEMALLOC;
132         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
133         current->flags &= (~PF_MEMALLOC | flags);
134         return vaddr;
135 }
136
137
138 static inline void *alloc_pgtable_page(void)
139 {
140         unsigned int flags;
141         void *vaddr;
142
143         /* trying to avoid low memory issues */
144         flags = current->flags & PF_MEMALLOC;
145         current->flags |= PF_MEMALLOC;
146         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
147         current->flags &= (~PF_MEMALLOC | flags);
148         return vaddr;
149 }
150
151 static inline void free_pgtable_page(void *vaddr)
152 {
153         free_page((unsigned long)vaddr);
154 }
155
156 static inline void *alloc_domain_mem(void)
157 {
158         return iommu_kmem_cache_alloc(iommu_domain_cache);
159 }
160
161 static void free_domain_mem(void *vaddr)
162 {
163         kmem_cache_free(iommu_domain_cache, vaddr);
164 }
165
166 static inline void * alloc_devinfo_mem(void)
167 {
168         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
169 }
170
171 static inline void free_devinfo_mem(void *vaddr)
172 {
173         kmem_cache_free(iommu_devinfo_cache, vaddr);
174 }
175
176 struct iova *alloc_iova_mem(void)
177 {
178         return iommu_kmem_cache_alloc(iommu_iova_cache);
179 }
180
181 void free_iova_mem(struct iova *iova)
182 {
183         kmem_cache_free(iommu_iova_cache, iova);
184 }
185
186 /* Gets context entry for a given bus and devfn */
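/*
 * The root table has one root_entry per bus number, each pointing to a
 * context table with one context_entry per devfn.  The context table for a
 * bus is allocated lazily here, the first time a device on that bus is
 * looked up.
 */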
187 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
188                 u8 bus, u8 devfn)
189 {
190         struct root_entry *root;
191         struct context_entry *context;
192         unsigned long phy_addr;
193         unsigned long flags;
194
195         spin_lock_irqsave(&iommu->lock, flags);
196         root = &iommu->root_entry[bus];
197         context = get_context_addr_from_root(root);
198         if (!context) {
199                 context = (struct context_entry *)alloc_pgtable_page();
200                 if (!context) {
201                         spin_unlock_irqrestore(&iommu->lock, flags);
202                         return NULL;
203                 }
204                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
205                 phy_addr = virt_to_phys((void *)context);
206                 set_root_value(root, phy_addr);
207                 set_root_present(root);
208                 __iommu_flush_cache(iommu, root, sizeof(*root));
209         }
210         spin_unlock_irqrestore(&iommu->lock, flags);
211         return &context[devfn];
212 }
213
214 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
215 {
216         struct root_entry *root;
217         struct context_entry *context;
218         int ret;
219         unsigned long flags;
220
221         spin_lock_irqsave(&iommu->lock, flags);
222         root = &iommu->root_entry[bus];
223         context = get_context_addr_from_root(root);
224         if (!context) {
225                 ret = 0;
226                 goto out;
227         }
228         ret = context_present(context[devfn]);
229 out:
230         spin_unlock_irqrestore(&iommu->lock, flags);
231         return ret;
232 }
233
234 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
235 {
236         struct root_entry *root;
237         struct context_entry *context;
238         unsigned long flags;
239
240         spin_lock_irqsave(&iommu->lock, flags);
241         root = &iommu->root_entry[bus];
242         context = get_context_addr_from_root(root);
243         if (context) {
244                 context_clear_entry(context[devfn]);
245                 __iommu_flush_cache(iommu, &context[devfn],
246                         sizeof(*context));
247         }
248         spin_unlock_irqrestore(&iommu->lock, flags);
249 }
250
251 static void free_context_table(struct intel_iommu *iommu)
252 {
253         struct root_entry *root;
254         int i;
255         unsigned long flags;
256         struct context_entry *context;
257
258         spin_lock_irqsave(&iommu->lock, flags);
259         if (!iommu->root_entry) {
260                 goto out;
261         }
262         for (i = 0; i < ROOT_ENTRY_NR; i++) {
263                 root = &iommu->root_entry[i];
264                 context = get_context_addr_from_root(root);
265                 if (context)
266                         free_pgtable_page(context);
267         }
268         free_pgtable_page(iommu->root_entry);
269         iommu->root_entry = NULL;
270 out:
271         spin_unlock_irqrestore(&iommu->lock, flags);
272 }
273
274 /* page table handling */
275 #define LEVEL_STRIDE            (9)
276 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
277
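/*
 * Each page-table level decodes LEVEL_STRIDE (9) address bits and level 1
 * covers bits 12..20, so an adjusted guest address width (agaw) of 0 means a
 * 30-bit, 2-level table.  For example, agaw = 2 gives agaw_to_width(2) =
 * 30 + 2 * 9 = 48 bits and agaw_to_level(2) = 4 levels.
 */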
278 static inline int agaw_to_level(int agaw)
279 {
280         return agaw + 2;
281 }
282
283 static inline int agaw_to_width(int agaw)
284 {
285         return 30 + agaw * LEVEL_STRIDE;
286
287 }
288
289 static inline int width_to_agaw(int width)
290 {
291         return (width - 30) / LEVEL_STRIDE;
292 }
293
294 static inline unsigned int level_to_offset_bits(int level)
295 {
296         return (12 + (level - 1) * LEVEL_STRIDE);
297 }
298
299 static inline int address_level_offset(u64 addr, int level)
300 {
301         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
302 }
303
304 static inline u64 level_mask(int level)
305 {
306         return ((u64)-1 << level_to_offset_bits(level));
307 }
308
309 static inline u64 level_size(int level)
310 {
311         return ((u64)1 << level_to_offset_bits(level));
312 }
313
314 static inline u64 align_to_level(u64 addr, int level)
315 {
316         return ((addr + level_size(level) - 1) & level_mask(level));
317 }
318
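/*
 * Walk (and, where necessary, build) the page table for 'addr': missing
 * intermediate tables are allocated on the way down with read/write set on
 * the non-leaf entries, and a pointer to the level-1 (leaf) pte for the
 * address is returned.
 */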
319 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
320 {
321         int addr_width = agaw_to_width(domain->agaw);
322         struct dma_pte *parent, *pte = NULL;
323         int level = agaw_to_level(domain->agaw);
324         int offset;
325         unsigned long flags;
326
327         BUG_ON(!domain->pgd);
328
329         addr &= (((u64)1) << addr_width) - 1;
330         parent = domain->pgd;
331
332         spin_lock_irqsave(&domain->mapping_lock, flags);
333         while (level > 0) {
334                 void *tmp_page;
335
336                 offset = address_level_offset(addr, level);
337                 pte = &parent[offset];
338                 if (level == 1)
339                         break;
340
341                 if (!dma_pte_present(*pte)) {
342                         tmp_page = alloc_pgtable_page();
343
344                         if (!tmp_page) {
345                                 spin_unlock_irqrestore(&domain->mapping_lock,
346                                         flags);
347                                 return NULL;
348                         }
349                         __iommu_flush_cache(domain->iommu, tmp_page,
350                                         PAGE_SIZE);
351                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
352                         /*
353                          * high level table always sets r/w, last level page
354                          * table control read/write
355                          */
356                         dma_set_pte_readable(*pte);
357                         dma_set_pte_writable(*pte);
358                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
359                 }
360                 parent = phys_to_virt(dma_pte_addr(*pte));
361                 level--;
362         }
363
364         spin_unlock_irqrestore(&domain->mapping_lock, flags);
365         return pte;
366 }
367
368 /* return address's pte at specific level */
369 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
370                 int level)
371 {
372         struct dma_pte *parent, *pte = NULL;
373         int total = agaw_to_level(domain->agaw);
374         int offset;
375
376         parent = domain->pgd;
377         while (level <= total) {
378                 offset = address_level_offset(addr, total);
379                 pte = &parent[offset];
380                 if (level == total)
381                         return pte;
382
383                 if (!dma_pte_present(*pte))
384                         break;
385                 parent = phys_to_virt(dma_pte_addr(*pte));
386                 total--;
387         }
388         return NULL;
389 }
390
391 /* clear one page's page table */
392 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
393 {
394         struct dma_pte *pte = NULL;
395
396         /* get last level pte */
397         pte = dma_addr_level_pte(domain, addr, 1);
398
399         if (pte) {
400                 dma_clear_pte(*pte);
401                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
402         }
403 }
404
405 /* clear last level (leaf) ptes; a TLB flush should follow */
406 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
407 {
408         int addr_width = agaw_to_width(domain->agaw);
409
410         start &= (((u64)1) << addr_width) - 1;
411         end &= (((u64)1) << addr_width) - 1;
412         /* in case it's a partial page */
413         start = PAGE_ALIGN(start);
414         end &= PAGE_MASK;
415
416         /* we don't need lock here, nobody else touches the iova range */
417         while (start < end) {
418                 dma_pte_clear_one(domain, start);
419                 start += VTD_PAGE_SIZE;
420         }
421 }
422
423 /* free page table pages. last level pte should already be cleared */
424 static void dma_pte_free_pagetable(struct dmar_domain *domain,
425         u64 start, u64 end)
426 {
427         int addr_width = agaw_to_width(domain->agaw);
428         struct dma_pte *pte;
429         int total = agaw_to_level(domain->agaw);
430         int level;
431         u64 tmp;
432
433         start &= (((u64)1) << addr_width) - 1;
434         end &= (((u64)1) << addr_width) - 1;
435
436         /* we don't need lock here, nobody else touches the iova range */
437         level = 2;
438         while (level <= total) {
439                 tmp = align_to_level(start, level);
440                 if (tmp >= end || (tmp + level_size(level) > end))
441                         return;
442
443                 while (tmp < end) {
444                         pte = dma_addr_level_pte(domain, tmp, level);
445                         if (pte) {
446                                 free_pgtable_page(
447                                         phys_to_virt(dma_pte_addr(*pte)));
448                                 dma_clear_pte(*pte);
449                                 __iommu_flush_cache(domain->iommu,
450                                                 pte, sizeof(*pte));
451                         }
452                         tmp += level_size(level);
453                 }
454                 level++;
455         }
456         /* free pgd */
457         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
458                 free_pgtable_page(domain->pgd);
459                 domain->pgd = NULL;
460         }
461 }
462
463 /* iommu handling */
464 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
465 {
466         struct root_entry *root;
467         unsigned long flags;
468
469         root = (struct root_entry *)alloc_pgtable_page();
470         if (!root)
471                 return -ENOMEM;
472
473         __iommu_flush_cache(iommu, root, ROOT_SIZE);
474
475         spin_lock_irqsave(&iommu->lock, flags);
476         iommu->root_entry = root;
477         spin_unlock_irqrestore(&iommu->lock, flags);
478
479         return 0;
480 }
481
482 static void iommu_set_root_entry(struct intel_iommu *iommu)
483 {
484         void *addr;
485         u32 cmd, sts;
486         unsigned long flag;
487
488         addr = iommu->root_entry;
489
490         spin_lock_irqsave(&iommu->register_lock, flag);
491         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
492
493         cmd = iommu->gcmd | DMA_GCMD_SRTP;
494         writel(cmd, iommu->reg + DMAR_GCMD_REG);
495
496         /* Make sure hardware completes it */
497         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
498                 readl, (sts & DMA_GSTS_RTPS), sts);
499
500         spin_unlock_irqrestore(&iommu->register_lock, flag);
501 }
502
503 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
504 {
505         u32 val;
506         unsigned long flag;
507
508         if (!cap_rwbf(iommu->cap))
509                 return;
510         val = iommu->gcmd | DMA_GCMD_WBF;
511
512         spin_lock_irqsave(&iommu->register_lock, flag);
513         writel(val, iommu->reg + DMAR_GCMD_REG);
514
515         /* Make sure hardware completes it */
516         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
517                         readl, (!(val & DMA_GSTS_WBFS)), val);
518
519         spin_unlock_irqrestore(&iommu->register_lock, flag);
520 }
521
522 /* return value determines whether a write buffer flush is needed */
523 static int __iommu_flush_context(struct intel_iommu *iommu,
524         u16 did, u16 source_id, u8 function_mask, u64 type,
525         int non_present_entry_flush)
526 {
527         u64 val = 0;
528         unsigned long flag;
529
530         /*
531          * In the non-present entry flush case: if the hardware doesn't cache
532          * non-present entries there is nothing to do; if it does cache them,
533          * we flush the entries of domain 0 (the domain id used to tag any
534          * cached non-present entries).
535          */
536         if (non_present_entry_flush) {
537                 if (!cap_caching_mode(iommu->cap))
538                         return 1;
539                 else
540                         did = 0;
541         }
542
543         switch (type) {
544         case DMA_CCMD_GLOBAL_INVL:
545                 val = DMA_CCMD_GLOBAL_INVL;
546                 break;
547         case DMA_CCMD_DOMAIN_INVL:
548                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
549                 break;
550         case DMA_CCMD_DEVICE_INVL:
551                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
552                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
553                 break;
554         default:
555                 BUG();
556         }
557         val |= DMA_CCMD_ICC;
558
559         spin_lock_irqsave(&iommu->register_lock, flag);
560         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
561
562         /* Make sure hardware completes it */
563         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
564                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
565
566         spin_unlock_irqrestore(&iommu->register_lock, flag);
567
568         /* flushing a context entry will implicitly flush the write buffer */
569         return 0;
570 }
571
572 /* return value determines whether a write buffer flush is needed */
573 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
574         u64 addr, unsigned int size_order, u64 type,
575         int non_present_entry_flush)
576 {
577         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
578         u64 val = 0, val_iva = 0;
579         unsigned long flag;
580
581         /*
582          * In the non-present entry flush case: if the hardware doesn't cache
583          * non-present entries there is nothing to do; if it does cache them,
584          * we flush the entries of domain 0 (the domain id used to tag any
585          * cached non-present entries).
586          */
587         if (non_present_entry_flush) {
588                 if (!cap_caching_mode(iommu->cap))
589                         return 1;
590                 else
591                         did = 0;
592         }
593
594         switch (type) {
595         case DMA_TLB_GLOBAL_FLUSH:
596                 /* a global flush doesn't need to set IVA_REG */
597                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
598                 break;
599         case DMA_TLB_DSI_FLUSH:
600                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
601                 break;
602         case DMA_TLB_PSI_FLUSH:
603                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
604                 /* Note: always flush non-leaf currently */
605                 val_iva = size_order | addr;
606                 break;
607         default:
608                 BUG();
609         }
610         /* Note: set drain read/write */
611 #if 0
612         /*
613          * This is probably only meant to be extra safe; it looks like we
614          * can omit it without any impact.
615          */
616         if (cap_read_drain(iommu->cap))
617                 val |= DMA_TLB_READ_DRAIN;
618 #endif
619         if (cap_write_drain(iommu->cap))
620                 val |= DMA_TLB_WRITE_DRAIN;
621
622         spin_lock_irqsave(&iommu->register_lock, flag);
623         /* Note: Only uses first TLB reg currently */
624         if (val_iva)
625                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
626         dmar_writeq(iommu->reg + tlb_offset + 8, val);
627
628         /* Make sure hardware completes it */
629         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
630                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
631
632         spin_unlock_irqrestore(&iommu->register_lock, flag);
633
634         /* check IOTLB invalidation granularity */
635         if (DMA_TLB_IAIG(val) == 0)
636                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
637         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
638                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
639                         (unsigned long long)DMA_TLB_IIRG(type),
640                         (unsigned long long)DMA_TLB_IAIG(val));
641         /* flushing iotlb entries will implicitly flush the write buffer */
642         return 0;
643 }
644
645 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
646         u64 addr, unsigned int pages, int non_present_entry_flush)
647 {
648         unsigned int mask;
649
650         BUG_ON(addr & (~VTD_PAGE_MASK));
651         BUG_ON(pages == 0);
652
653         /* Fallback to domain selective flush if no PSI support */
654         if (!cap_pgsel_inv(iommu->cap))
655                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
656                                                 DMA_TLB_DSI_FLUSH,
657                                                 non_present_entry_flush);
658
659         /*
660          * PSI requires the size to be 2^x pages, with the base address
661          * naturally aligned to that size
662          */
663         mask = ilog2(__roundup_pow_of_two(pages));
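        /*
         * e.g. pages == 3 is rounded up to 4, giving mask == 2, so the
         * hardware invalidates a naturally aligned block of 2^2 pages.
         */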
664         /* Fallback to domain selective flush if size is too big */
665         if (mask > cap_max_amask_val(iommu->cap))
666                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
667                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
668
669         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
670                                         DMA_TLB_PSI_FLUSH,
671                                         non_present_entry_flush);
672 }
673
674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
675 {
676         u32 pmen;
677         unsigned long flags;
678
679         spin_lock_irqsave(&iommu->register_lock, flags);
680         pmen = readl(iommu->reg + DMAR_PMEN_REG);
681         pmen &= ~DMA_PMEN_EPM;
682         writel(pmen, iommu->reg + DMAR_PMEN_REG);
683
684         /* wait for the protected region status bit to clear */
685         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
686                 readl, !(pmen & DMA_PMEN_PRS), pmen);
687
688         spin_unlock_irqrestore(&iommu->register_lock, flags);
689 }
690
691 static int iommu_enable_translation(struct intel_iommu *iommu)
692 {
693         u32 sts;
694         unsigned long flags;
695
696         spin_lock_irqsave(&iommu->register_lock, flags);
697         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
698
699         /* Make sure hardware completes it */
700         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
701                 readl, (sts & DMA_GSTS_TES), sts);
702
703         iommu->gcmd |= DMA_GCMD_TE;
704         spin_unlock_irqrestore(&iommu->register_lock, flags);
705         return 0;
706 }
707
708 static int iommu_disable_translation(struct intel_iommu *iommu)
709 {
710         u32 sts;
711         unsigned long flag;
712
713         spin_lock_irqsave(&iommu->register_lock, flag);
714         iommu->gcmd &= ~DMA_GCMD_TE;
715         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
716
717         /* Make sure hardware completes it */
718         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
719                 readl, (!(sts & DMA_GSTS_TES)), sts);
720
721         spin_unlock_irqrestore(&iommu->register_lock, flag);
722         return 0;
723 }
724
725 /* iommu interrupt handling. Most of it is MSI-like. */
726
727 static const char *fault_reason_strings[] =
728 {
729         "Software",
730         "Present bit in root entry is clear",
731         "Present bit in context entry is clear",
732         "Invalid context entry",
733         "Access beyond MGAW",
734         "PTE Write access is not set",
735         "PTE Read access is not set",
736         "Next page table ptr is invalid",
737         "Root table address invalid",
738         "Context table ptr is invalid",
739         "non-zero reserved fields in RTP",
740         "non-zero reserved fields in CTP",
741         "non-zero reserved fields in PTE",
742 };
743 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
744
745 const char *dmar_get_fault_reason(u8 fault_reason)
746 {
747         if (fault_reason > MAX_FAULT_REASON_IDX)
748                 return "Unknown";
749         else
750                 return fault_reason_strings[fault_reason];
751 }
752
753 void dmar_msi_unmask(unsigned int irq)
754 {
755         struct intel_iommu *iommu = get_irq_data(irq);
756         unsigned long flag;
757
758         /* unmask it */
759         spin_lock_irqsave(&iommu->register_lock, flag);
760         writel(0, iommu->reg + DMAR_FECTL_REG);
761         /* Read back a register to flush the posted write */
762         readl(iommu->reg + DMAR_FECTL_REG);
763         spin_unlock_irqrestore(&iommu->register_lock, flag);
764 }
765
766 void dmar_msi_mask(unsigned int irq)
767 {
768         unsigned long flag;
769         struct intel_iommu *iommu = get_irq_data(irq);
770
771         /* mask it */
772         spin_lock_irqsave(&iommu->register_lock, flag);
773         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
774         /* Read back a register to flush the posted write */
775         readl(iommu->reg + DMAR_FECTL_REG);
776         spin_unlock_irqrestore(&iommu->register_lock, flag);
777 }
778
779 void dmar_msi_write(int irq, struct msi_msg *msg)
780 {
781         struct intel_iommu *iommu = get_irq_data(irq);
782         unsigned long flag;
783
784         spin_lock_irqsave(&iommu->register_lock, flag);
785         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
786         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
787         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
788         spin_unlock_irqrestore(&iommu->register_lock, flag);
789 }
790
791 void dmar_msi_read(int irq, struct msi_msg *msg)
792 {
793         struct intel_iommu *iommu = get_irq_data(irq);
794         unsigned long flag;
795
796         spin_lock_irqsave(&iommu->register_lock, flag);
797         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
798         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
799         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
800         spin_unlock_irqrestore(&iommu->register_lock, flag);
801 }
802
803 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
804                 u8 fault_reason, u16 source_id, unsigned long long addr)
805 {
806         const char *reason;
807
808         reason = dmar_get_fault_reason(fault_reason);
809
810         printk(KERN_ERR
811                 "DMAR:[%s] Request device [%02x:%02x.%d] "
812                 "fault addr %llx \n"
813                 "DMAR:[fault reason %02d] %s\n",
814                 (type ? "DMA Read" : "DMA Write"),
815                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
816                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
817         return 0;
818 }
819
820 #define PRIMARY_FAULT_REG_LEN (16)
821 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
822 {
823         struct intel_iommu *iommu = dev_id;
824         int reg, fault_index;
825         u32 fault_status;
826         unsigned long flag;
827
828         spin_lock_irqsave(&iommu->register_lock, flag);
829         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
830
831         /* TBD: ignore advanced fault log currently */
832         if (!(fault_status & DMA_FSTS_PPF))
833                 goto clear_overflow;
834
835         fault_index = dma_fsts_fault_record_index(fault_status);
836         reg = cap_fault_reg_offset(iommu->cap);
837         while (1) {
838                 u8 fault_reason;
839                 u16 source_id;
840                 u64 guest_addr;
841                 int type;
842                 u32 data;
843
844                 /* highest 32 bits */
845                 data = readl(iommu->reg + reg +
846                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
847                 if (!(data & DMA_FRCD_F))
848                         break;
849
850                 fault_reason = dma_frcd_fault_reason(data);
851                 type = dma_frcd_type(data);
852
853                 data = readl(iommu->reg + reg +
854                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
855                 source_id = dma_frcd_source_id(data);
856
857                 guest_addr = dmar_readq(iommu->reg + reg +
858                                 fault_index * PRIMARY_FAULT_REG_LEN);
859                 guest_addr = dma_frcd_page_addr(guest_addr);
860                 /* clear the fault */
861                 writel(DMA_FRCD_F, iommu->reg + reg +
862                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
863
864                 spin_unlock_irqrestore(&iommu->register_lock, flag);
865
866                 iommu_page_fault_do_one(iommu, type, fault_reason,
867                                 source_id, guest_addr);
868
869                 fault_index++;
870                 if (fault_index >= cap_num_fault_regs(iommu->cap))
871                         fault_index = 0;
872                 spin_lock_irqsave(&iommu->register_lock, flag);
873         }
874 clear_overflow:
875         /* clear primary fault overflow */
876         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
877         if (fault_status & DMA_FSTS_PFO)
878                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
879
880         spin_unlock_irqrestore(&iommu->register_lock, flag);
881         return IRQ_HANDLED;
882 }
883
884 int dmar_set_interrupt(struct intel_iommu *iommu)
885 {
886         int irq, ret;
887
888         irq = create_irq();
889         if (!irq) {
890                 printk(KERN_ERR "IOMMU: no free vectors\n");
891                 return -EINVAL;
892         }
893
894         set_irq_data(irq, iommu);
895         iommu->irq = irq;
896
897         ret = arch_setup_dmar_msi(irq);
898         if (ret) {
899                 set_irq_data(irq, NULL);
900                 iommu->irq = 0;
901                 destroy_irq(irq);
902                         return ret;
903         }
904
905         /* Make sure any pending fault records are cleared */
906         iommu_page_fault(irq, iommu);
907
908         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
909         if (ret)
910                 printk(KERN_ERR "IOMMU: can't request irq\n");
911         return ret;
912 }
913
914 static int iommu_init_domains(struct intel_iommu *iommu)
915 {
916         unsigned long ndomains;
917         unsigned long nlongs;
918
919         ndomains = cap_ndoms(iommu->cap);
920         pr_debug("Number of Domains supported <%lu>\n", ndomains);
921         nlongs = BITS_TO_LONGS(ndomains);
922
923         /* TBD: there might be 64K domains,
924          * consider other allocation for future chip
925          */
926         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
927         if (!iommu->domain_ids) {
928                 printk(KERN_ERR "Allocating domain id array failed\n");
929                 return -ENOMEM;
930         }
931         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
932                         GFP_KERNEL);
933         if (!iommu->domains) {
934                 printk(KERN_ERR "Allocating domain array failed\n");
935                 kfree(iommu->domain_ids);
936                 return -ENOMEM;
937         }
938
939         spin_lock_init(&iommu->lock);
940
941         /*
942          * if Caching mode is set, then invalid translations are tagged
943          * with domainid 0. Hence we need to pre-allocate it.
944          */
945         if (cap_caching_mode(iommu->cap))
946                 set_bit(0, iommu->domain_ids);
947         return 0;
948 }
949
950
951 static void domain_exit(struct dmar_domain *domain);
952
953 void free_dmar_iommu(struct intel_iommu *iommu)
954 {
955         struct dmar_domain *domain;
956         int i;
957
958         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
959         for (; i < cap_ndoms(iommu->cap); ) {
960                 domain = iommu->domains[i];
961                 clear_bit(i, iommu->domain_ids);
962                 domain_exit(domain);
963                 i = find_next_bit(iommu->domain_ids,
964                         cap_ndoms(iommu->cap), i+1);
965         }
966
967         if (iommu->gcmd & DMA_GCMD_TE)
968                 iommu_disable_translation(iommu);
969
970         if (iommu->irq) {
971                 set_irq_data(iommu->irq, NULL);
972                 /* This will mask the irq */
973                 free_irq(iommu->irq, iommu);
974                 destroy_irq(iommu->irq);
975         }
976
977         kfree(iommu->domains);
978         kfree(iommu->domain_ids);
979
980         /* free context mapping */
981         free_context_table(iommu);
982 }
983
984 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
985 {
986         unsigned long num;
987         unsigned long ndomains;
988         struct dmar_domain *domain;
989         unsigned long flags;
990
991         domain = alloc_domain_mem();
992         if (!domain)
993                 return NULL;
994
995         ndomains = cap_ndoms(iommu->cap);
996
997         spin_lock_irqsave(&iommu->lock, flags);
998         num = find_first_zero_bit(iommu->domain_ids, ndomains);
999         if (num >= ndomains) {
1000                 spin_unlock_irqrestore(&iommu->lock, flags);
1001                 free_domain_mem(domain);
1002                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1003                 return NULL;
1004         }
1005
1006         set_bit(num, iommu->domain_ids);
1007         domain->id = num;
1008         domain->iommu = iommu;
1009         iommu->domains[num] = domain;
1010         spin_unlock_irqrestore(&iommu->lock, flags);
1011
1012         return domain;
1013 }
1014
1015 static void iommu_free_domain(struct dmar_domain *domain)
1016 {
1017         unsigned long flags;
1018
1019         spin_lock_irqsave(&domain->iommu->lock, flags);
1020         clear_bit(domain->id, domain->iommu->domain_ids);
1021         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1022 }
1023
1024 static struct iova_domain reserved_iova_list;
1025 static struct lock_class_key reserved_alloc_key;
1026 static struct lock_class_key reserved_rbtree_key;
1027
1028 static void dmar_init_reserved_ranges(void)
1029 {
1030         struct pci_dev *pdev = NULL;
1031         struct iova *iova;
1032         int i;
1033         u64 addr, size;
1034
1035         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1036
1037         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1038                 &reserved_alloc_key);
1039         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1040                 &reserved_rbtree_key);
1041
1042         /* IOAPIC ranges shouldn't be accessed by DMA */
1043         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1044                 IOVA_PFN(IOAPIC_RANGE_END));
1045         if (!iova)
1046                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1047
1048         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1049         for_each_pci_dev(pdev) {
1050                 struct resource *r;
1051
1052                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1053                         r = &pdev->resource[i];
1054                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1055                                 continue;
1056                         addr = r->start;
1057                         addr &= PAGE_MASK;
1058                         size = r->end - addr;
1059                         size = PAGE_ALIGN(size);
1060                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1061                                 IOVA_PFN(size + addr) - 1);
1062                         if (!iova)
1063                                 printk(KERN_ERR "Reserve iova failed\n");
1064                 }
1065         }
1066
1067 }
1068
1069 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1070 {
1071         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1072 }
1073
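/*
 * Round the guest address width up so that the part above the 12 page-offset
 * bits is a multiple of the 9-bit level stride (e.g. a 36-bit guest width is
 * adjusted to 39 bits), capped at 64.
 */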
1074 static inline int guestwidth_to_adjustwidth(int gaw)
1075 {
1076         int agaw;
1077         int r = (gaw - 12) % 9;
1078
1079         if (r == 0)
1080                 agaw = gaw;
1081         else
1082                 agaw = gaw + 9 - r;
1083         if (agaw > 64)
1084                 agaw = 64;
1085         return agaw;
1086 }
1087
1088 static int domain_init(struct dmar_domain *domain, int guest_width)
1089 {
1090         struct intel_iommu *iommu;
1091         int adjust_width, agaw;
1092         unsigned long sagaw;
1093
1094         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1095         spin_lock_init(&domain->mapping_lock);
1096
1097         domain_reserve_special_ranges(domain);
1098
1099         /* calculate AGAW */
1100         iommu = domain->iommu;
1101         if (guest_width > cap_mgaw(iommu->cap))
1102                 guest_width = cap_mgaw(iommu->cap);
1103         domain->gaw = guest_width;
1104         adjust_width = guestwidth_to_adjustwidth(guest_width);
1105         agaw = width_to_agaw(adjust_width);
1106         sagaw = cap_sagaw(iommu->cap);
1107         if (!test_bit(agaw, &sagaw)) {
1108                 /* hardware doesn't support it, choose a bigger one */
1109                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1110                 agaw = find_next_bit(&sagaw, 5, agaw);
1111                 if (agaw >= 5)
1112                         return -ENODEV;
1113         }
1114         domain->agaw = agaw;
1115         INIT_LIST_HEAD(&domain->devices);
1116
1117         /* always allocate the top pgd */
1118         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1119         if (!domain->pgd)
1120                 return -ENOMEM;
1121         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1122         return 0;
1123 }
1124
1125 static void domain_exit(struct dmar_domain *domain)
1126 {
1127         u64 end;
1128
1129         /* Domain 0 is reserved, so don't process it */
1130         if (!domain)
1131                 return;
1132
1133         domain_remove_dev_info(domain);
1134         /* destroy iovas */
1135         put_iova_domain(&domain->iovad);
1136         end = DOMAIN_MAX_ADDR(domain->gaw);
1137         end = end & (~PAGE_MASK);
1138
1139         /* clear ptes */
1140         dma_pte_clear_range(domain, 0, end);
1141
1142         /* free page tables */
1143         dma_pte_free_pagetable(domain, 0, end);
1144
1145         iommu_free_domain(domain);
1146         free_domain_mem(domain);
1147 }
1148
1149 static int domain_context_mapping_one(struct dmar_domain *domain,
1150                 u8 bus, u8 devfn)
1151 {
1152         struct context_entry *context;
1153         struct intel_iommu *iommu = domain->iommu;
1154         unsigned long flags;
1155
1156         pr_debug("Set context mapping for %02x:%02x.%d\n",
1157                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1158         BUG_ON(!domain->pgd);
1159         context = device_to_context_entry(iommu, bus, devfn);
1160         if (!context)
1161                 return -ENOMEM;
1162         spin_lock_irqsave(&iommu->lock, flags);
1163         if (context_present(*context)) {
1164                 spin_unlock_irqrestore(&iommu->lock, flags);
1165                 return 0;
1166         }
1167
1168         context_set_domain_id(*context, domain->id);
1169         context_set_address_width(*context, domain->agaw);
1170         context_set_address_root(*context, virt_to_phys(domain->pgd));
1171         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1172         context_set_fault_enable(*context);
1173         context_set_present(*context);
1174         __iommu_flush_cache(iommu, context, sizeof(*context));
1175
1176         /* it's a non-present to present mapping */
1177         if (iommu->flush.flush_context(iommu, domain->id,
1178                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1179                 DMA_CCMD_DEVICE_INVL, 1))
1180                 iommu_flush_write_buffer(iommu);
1181         else
1182                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1183
1184         spin_unlock_irqrestore(&iommu->lock, flags);
1185         return 0;
1186 }
1187
1188 static int
1189 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1190 {
1191         int ret;
1192         struct pci_dev *tmp, *parent;
1193
1194         ret = domain_context_mapping_one(domain, pdev->bus->number,
1195                 pdev->devfn);
1196         if (ret)
1197                 return ret;
1198
1199         /* dependent device mapping */
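        /*
         * DMA from a device behind a PCIe-to-PCI(-X) bridge is tagged with
         * the bridge's source-id, so every bridge on the path up to the
         * topmost PCIe bridge needs a context entry for this domain too.
         */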
1200         tmp = pci_find_upstream_pcie_bridge(pdev);
1201         if (!tmp)
1202                 return 0;
1203         /* Secondary interface's bus number and devfn 0 */
1204         parent = pdev->bus->self;
1205         while (parent != tmp) {
1206                 ret = domain_context_mapping_one(domain, parent->bus->number,
1207                         parent->devfn);
1208                 if (ret)
1209                         return ret;
1210                 parent = parent->bus->self;
1211         }
1212         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1213                 return domain_context_mapping_one(domain,
1214                         tmp->subordinate->number, 0);
1215         else /* this is a legacy PCI bridge */
1216                 return domain_context_mapping_one(domain,
1217                         tmp->bus->number, tmp->devfn);
1218 }
1219
1220 static int domain_context_mapped(struct dmar_domain *domain,
1221         struct pci_dev *pdev)
1222 {
1223         int ret;
1224         struct pci_dev *tmp, *parent;
1225
1226         ret = device_context_mapped(domain->iommu,
1227                 pdev->bus->number, pdev->devfn);
1228         if (!ret)
1229                 return ret;
1230         /* dependent device mapping */
1231         tmp = pci_find_upstream_pcie_bridge(pdev);
1232         if (!tmp)
1233                 return ret;
1234         /* Secondary interface's bus number and devfn 0 */
1235         parent = pdev->bus->self;
1236         while (parent != tmp) {
1237                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1238                         parent->devfn);
1239                 if (!ret)
1240                         return ret;
1241                 parent = parent->bus->self;
1242         }
1243         if (tmp->is_pcie)
1244                 return device_context_mapped(domain->iommu,
1245                         tmp->subordinate->number, 0);
1246         else
1247                 return device_context_mapped(domain->iommu,
1248                         tmp->bus->number, tmp->devfn);
1249 }
1250
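/*
 * Map the physical range [hpa, hpa + size) at IO virtual address 'iova' with
 * one leaf pte per VTD_PAGE_SIZE page; the iova is truncated to a page
 * boundary and the size rounded up to whole pages.
 */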
1251 static int
1252 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1253                         u64 hpa, size_t size, int prot)
1254 {
1255         u64 start_pfn, end_pfn;
1256         struct dma_pte *pte;
1257         int index;
1258         int addr_width = agaw_to_width(domain->agaw);
1259
1260         hpa &= (((u64)1) << addr_width) - 1;
1261
1262         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1263                 return -EINVAL;
1264         iova &= PAGE_MASK;
1265         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1266         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1267         index = 0;
1268         while (start_pfn < end_pfn) {
1269                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1270                 if (!pte)
1271                         return -ENOMEM;
1272                 /* We don't need lock here, nobody else
1273                  * touches the iova range
1274                  */
1275                 BUG_ON(dma_pte_addr(*pte));
1276                 dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
1277                 dma_set_pte_prot(*pte, prot);
1278                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1279                 start_pfn++;
1280                 index++;
1281         }
1282         return 0;
1283 }
1284
1285 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1286 {
1287         clear_context_table(domain->iommu, bus, devfn);
1288         domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1289                                            DMA_CCMD_GLOBAL_INVL, 0);
1290         domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1291                                          DMA_TLB_GLOBAL_FLUSH, 0);
1292 }
1293
1294 static void domain_remove_dev_info(struct dmar_domain *domain)
1295 {
1296         struct device_domain_info *info;
1297         unsigned long flags;
1298
1299         spin_lock_irqsave(&device_domain_lock, flags);
1300         while (!list_empty(&domain->devices)) {
1301                 info = list_entry(domain->devices.next,
1302                         struct device_domain_info, link);
1303                 list_del(&info->link);
1304                 list_del(&info->global);
1305                 if (info->dev)
1306                         info->dev->dev.archdata.iommu = NULL;
1307                 spin_unlock_irqrestore(&device_domain_lock, flags);
1308
1309                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1310                 free_devinfo_mem(info);
1311
1312                 spin_lock_irqsave(&device_domain_lock, flags);
1313         }
1314         spin_unlock_irqrestore(&device_domain_lock, flags);
1315 }
1316
1317 /*
1318  * find_domain
1319  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1320  */
1321 static struct dmar_domain *
1322 find_domain(struct pci_dev *pdev)
1323 {
1324         struct device_domain_info *info;
1325
1326         /* No lock here, assumes no domain exit in normal case */
1327         info = pdev->dev.archdata.iommu;
1328         if (info)
1329                 return info->domain;
1330         return NULL;
1331 }
1332
1333 /* domain is initialized */
1334 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1335 {
1336         struct dmar_domain *domain, *found = NULL;
1337         struct intel_iommu *iommu;
1338         struct dmar_drhd_unit *drhd;
1339         struct device_domain_info *info, *tmp;
1340         struct pci_dev *dev_tmp;
1341         unsigned long flags;
1342         int bus = 0, devfn = 0;
1343
1344         domain = find_domain(pdev);
1345         if (domain)
1346                 return domain;
1347
1348         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1349         if (dev_tmp) {
1350                 if (dev_tmp->is_pcie) {
1351                         bus = dev_tmp->subordinate->number;
1352                         devfn = 0;
1353                 } else {
1354                         bus = dev_tmp->bus->number;
1355                         devfn = dev_tmp->devfn;
1356                 }
1357                 spin_lock_irqsave(&device_domain_lock, flags);
1358                 list_for_each_entry(info, &device_domain_list, global) {
1359                         if (info->bus == bus && info->devfn == devfn) {
1360                                 found = info->domain;
1361                                 break;
1362                         }
1363                 }
1364                 spin_unlock_irqrestore(&device_domain_lock, flags);
1365                 /* pcie-pci bridge already has a domain, use it */
1366                 if (found) {
1367                         domain = found;
1368                         goto found_domain;
1369                 }
1370         }
1371
1372         /* Allocate new domain for the device */
1373         drhd = dmar_find_matched_drhd_unit(pdev);
1374         if (!drhd) {
1375                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1376                         pci_name(pdev));
1377                 return NULL;
1378         }
1379         iommu = drhd->iommu;
1380
1381         domain = iommu_alloc_domain(iommu);
1382         if (!domain)
1383                 goto error;
1384
1385         if (domain_init(domain, gaw)) {
1386                 domain_exit(domain);
1387                 goto error;
1388         }
1389
1390         /* register pcie-to-pci device */
1391         if (dev_tmp) {
1392                 info = alloc_devinfo_mem();
1393                 if (!info) {
1394                         domain_exit(domain);
1395                         goto error;
1396                 }
1397                 info->bus = bus;
1398                 info->devfn = devfn;
1399                 info->dev = NULL;
1400                 info->domain = domain;
1401                 /* This domain is shared by devices under p2p bridge */
1402                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1403
1404                 /* pcie-to-pci bridge already has a domain, use it */
1405                 found = NULL;
1406                 spin_lock_irqsave(&device_domain_lock, flags);
1407                 list_for_each_entry(tmp, &device_domain_list, global) {
1408                         if (tmp->bus == bus && tmp->devfn == devfn) {
1409                                 found = tmp->domain;
1410                                 break;
1411                         }
1412                 }
1413                 if (found) {
1414                         free_devinfo_mem(info);
1415                         domain_exit(domain);
1416                         domain = found;
1417                 } else {
1418                         list_add(&info->link, &domain->devices);
1419                         list_add(&info->global, &device_domain_list);
1420                 }
1421                 spin_unlock_irqrestore(&device_domain_lock, flags);
1422         }
1423
1424 found_domain:
1425         info = alloc_devinfo_mem();
1426         if (!info)
1427                 goto error;
1428         info->bus = pdev->bus->number;
1429         info->devfn = pdev->devfn;
1430         info->dev = pdev;
1431         info->domain = domain;
1432         spin_lock_irqsave(&device_domain_lock, flags);
1433         /* somebody else may have beaten us to it */
1434         found = find_domain(pdev);
1435         if (found != NULL) {
1436                 spin_unlock_irqrestore(&device_domain_lock, flags);
1437                 if (found != domain) {
1438                         domain_exit(domain);
1439                         domain = found;
1440                 }
1441                 free_devinfo_mem(info);
1442                 return domain;
1443         }
1444         list_add(&info->link, &domain->devices);
1445         list_add(&info->global, &device_domain_list);
1446         pdev->dev.archdata.iommu = info;
1447         spin_unlock_irqrestore(&device_domain_lock, flags);
1448         return domain;
1449 error:
1450         /* recheck here; somebody else may have set it meanwhile */
1451         return find_domain(pdev);
1452 }
1453
1454 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1455                                       unsigned long long start,
1456                                       unsigned long long end)
1457 {
1458         struct dmar_domain *domain;
1459         unsigned long size;
1460         unsigned long long base;
1461         int ret;
1462
1463         printk(KERN_INFO
1464                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1465                 pci_name(pdev), start, end);
1466         /* page table init */
1467         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1468         if (!domain)
1469                 return -ENOMEM;
1470
1471         /* The address might not be aligned */
1472         base = start & PAGE_MASK;
1473         size = end - base;
1474         size = PAGE_ALIGN(size);
1475         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1476                         IOVA_PFN(base + size) - 1)) {
1477                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1478                 ret = -ENOMEM;
1479                 goto error;
1480         }
1481
1482         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1483                 size, base, pci_name(pdev));
1484         /*
1485          * The RMRR range might overlap with the physical memory range;
1486          * clear it first
1487          */
1488         dma_pte_clear_range(domain, base, base + size);
1489
1490         ret = domain_page_mapping(domain, base, base, size,
1491                 DMA_PTE_READ|DMA_PTE_WRITE);
1492         if (ret)
1493                 goto error;
1494
1495         /* context entry init */
1496         ret = domain_context_mapping(domain, pdev);
1497         if (!ret)
1498                 return 0;
1499 error:
1500         domain_exit(domain);
1501         return ret;
1502
1503 }
1504
1505 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1506         struct pci_dev *pdev)
1507 {
1508         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1509                 return 0;
1510         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1511                 rmrr->end_address + 1);
1512 }
1513
1514 #ifdef CONFIG_DMAR_GFX_WA
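/*
 * Graphics workaround: when CONFIG_DMAR_GFX_WA is set,
 * iommu_prepare_gfx_mapping() below gives every graphics device a 1:1
 * identity mapping of all active memory regions, so graphics drivers that
 * are not IOMMU-aware keep working.
 */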
1515 struct iommu_prepare_data {
1516         struct pci_dev *pdev;
1517         int ret;
1518 };
1519
1520 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1521                                          unsigned long end_pfn, void *datax)
1522 {
1523         struct iommu_prepare_data *data;
1524
1525         data = (struct iommu_prepare_data *)datax;
1526
1527         data->ret = iommu_prepare_identity_map(data->pdev,
1528                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1529         return data->ret;
1530
1531 }
1532
1533 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1534 {
1535         int nid;
1536         struct iommu_prepare_data data;
1537
1538         data.pdev = pdev;
1539         data.ret = 0;
1540
1541         for_each_online_node(nid) {
1542                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1543                 if (data.ret)
1544                         return data.ret;
1545         }
1546         return data.ret;
1547 }
1548
1549 static void __init iommu_prepare_gfx_mapping(void)
1550 {
1551         struct pci_dev *pdev = NULL;
1552         int ret;
1553
1554         for_each_pci_dev(pdev) {
1555                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1556                                 !IS_GFX_DEVICE(pdev))
1557                         continue;
1558                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1559                         pci_name(pdev));
1560                 ret = iommu_prepare_with_active_regions(pdev);
1561                 if (ret)
1562                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1563         }
1564 }
1565 #endif
1566
1567 #ifdef CONFIG_DMAR_FLOPPY_WA
1568 static inline void iommu_prepare_isa(void)
1569 {
1570         struct pci_dev *pdev;
1571         int ret;
1572
1573         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1574         if (!pdev)
1575                 return;
1576
1577         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1578         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1579
1580         if (ret)
1581                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1582                         "floppy might not work\n");
1583
1584 }
1585 #else
1586 static inline void iommu_prepare_isa(void)
1587 {
1588         return;
1589 }
1590 #endif /* CONFIG_DMAR_FLOPPY_WA */
1591
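/*
 * init_dmars() brings up DMA remapping on every DRHD unit.  In outline:
 * count the IOMMUs and allocate the per-IOMMU deferred-flush tables,
 * initialize each unit's domain bookkeeping and root entry, pick queued
 * vs. register-based invalidation, build the RMRR/gfx/ISA identity maps,
 * then program the root entries, flush the caches and enable translation.
 */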
1592 static int __init init_dmars(void)
1593 {
1594         struct dmar_drhd_unit *drhd;
1595         struct dmar_rmrr_unit *rmrr;
1596         struct pci_dev *pdev;
1597         struct intel_iommu *iommu;
1598         int i, ret, unit = 0;
1599
1600         /*
1601          * for each drhd
1602          *    allocate root
1603          *    initialize and program root entry to not present
1604          * endfor
1605          */
1606         for_each_drhd_unit(drhd) {
1607                 g_num_of_iommus++;
1608                 /*
1609                  * Lock not needed: this is only incremented in the
1610                  * single-threaded kernel __init code path; all other
1611                  * accesses are read-only.
1612                  */
1613         }
1614
1615         deferred_flush = kzalloc(g_num_of_iommus *
1616                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1617         if (!deferred_flush) {
1618                 ret = -ENOMEM;
1619                 goto error;
1620         }
1621
1622         for_each_drhd_unit(drhd) {
1623                 if (drhd->ignored)
1624                         continue;
1625
1626                 iommu = drhd->iommu;
1627
1628                 ret = iommu_init_domains(iommu);
1629                 if (ret)
1630                         goto error;
1631
1632                 /*
1633                  * TBD:
1634                  * we could share the same root & context tables
1635                  * among all IOMMUs; need to split it later.
1636                  */
1637                 ret = iommu_alloc_root_entry(iommu);
1638                 if (ret) {
1639                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1640                         goto error;
1641                 }
1642         }
1643
1644         for_each_drhd_unit(drhd) {
1645                 if (drhd->ignored)
1646                         continue;
1647
1648                 iommu = drhd->iommu;
1649                 if (dmar_enable_qi(iommu)) {
1650                         /*
1651                          * Queued Invalidate not enabled, use Register Based
1652                          * Invalidate
1653                          */
1654                         iommu->flush.flush_context = __iommu_flush_context;
1655                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1656                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1657                                "invalidation\n",
1658                                (unsigned long long)drhd->reg_base_addr);
1659                 } else {
1660                         iommu->flush.flush_context = qi_flush_context;
1661                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1662                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1663                                "invalidation\n",
1664                                (unsigned long long)drhd->reg_base_addr);
1665                 }
1666         }
1667
1668         /*
1669          * For each rmrr
1670          *   for each dev attached to rmrr
1671          *   do
1672          *     locate drhd for dev, alloc domain for dev
1673          *     allocate free domain
1674          *     allocate page table entries for rmrr
1675          *     if context not allocated for bus
1676          *           allocate and init context
1677          *           set present in root table for this bus
1678          *     init context with domain, translation etc
1679          *    endfor
1680          * endfor
1681          */
1682         for_each_rmrr_units(rmrr) {
1683                 for (i = 0; i < rmrr->devices_cnt; i++) {
1684                         pdev = rmrr->devices[i];
1685                         /* some BIOSes list non-existent devices in the DMAR table */
1686                         if (!pdev)
1687                                 continue;
1688                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1689                         if (ret)
1690                                 printk(KERN_ERR
1691                                  "IOMMU: mapping reserved region failed\n");
1692                 }
1693         }
1694
1695         iommu_prepare_gfx_mapping();
1696
1697         iommu_prepare_isa();
1698
1699         /*
1700          * for each drhd
1701          *   enable fault log
1702          *   global invalidate context cache
1703          *   global invalidate iotlb
1704          *   enable translation
1705          */
1706         for_each_drhd_unit(drhd) {
1707                 if (drhd->ignored)
1708                         continue;
1709                 iommu = drhd->iommu;
1710                 sprintf(iommu->name, "dmar%d", unit++);
1711
1712                 iommu_flush_write_buffer(iommu);
1713
1714                 ret = dmar_set_interrupt(iommu);
1715                 if (ret)
1716                         goto error;
1717
1718                 iommu_set_root_entry(iommu);
1719
1720                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1721                                            0);
1722                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1723                                          0);
1724                 iommu_disable_protect_mem_regions(iommu);
1725
1726                 ret = iommu_enable_translation(iommu);
1727                 if (ret)
1728                         goto error;
1729         }
1730
1731         return 0;
1732 error:
1733         for_each_drhd_unit(drhd) {
1734                 if (drhd->ignored)
1735                         continue;
1736                 iommu = drhd->iommu;
1737                 free_iommu(iommu);
1738         }
1739         return ret;
1740 }
1741
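/*
 * aligned_size() rounds a request up to whole pages, counting the offset of
 * host_addr within its first page.  A worked example (assuming 4 KiB pages):
 * a 0x20-byte buffer starting at page offset 0xff0 straddles a page boundary,
 * so aligned_size(addr, 0x20) = PAGE_ALIGN(0xff0 + 0x20) = 0x2000, two pages.
 */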
1742 static inline u64 aligned_size(u64 host_addr, size_t size)
1743 {
1744         u64 addr;
1745         addr = (host_addr & (~PAGE_MASK)) + size;
1746         return PAGE_ALIGN(addr);
1747 }
1748
1749 struct iova *
1750 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1751 {
1752         struct iova *piova;
1753
1754         /* Make sure it's in range */
1755         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1756         if (!size || (IOVA_START_ADDR + size > end))
1757                 return NULL;
1758
1759         piova = alloc_iova(&domain->iovad,
1760                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1761         return piova;
1762 }
1763
1764 static struct iova *
1765 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1766                    size_t size, u64 dma_mask)
1767 {
1768         struct pci_dev *pdev = to_pci_dev(dev);
1769         struct iova *iova = NULL;
1770
1771         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1772                 iova = iommu_alloc_iova(domain, size, dma_mask);
1773         else {
1774                 /*
1775                  * First try to allocate an io virtual address in
1776                  * DMA_32BIT_MASK and if that fails then try allocating
1777                  * from higher range
1778                  */
1779                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1780                 if (!iova)
1781                         iova = iommu_alloc_iova(domain, size, dma_mask);
1782         }
1783
1784         if (!iova) {
1785                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1786                 return NULL;
1787         }
1788
1789         return iova;
1790 }
1791
1792 static struct dmar_domain *
1793 get_valid_domain_for_dev(struct pci_dev *pdev)
1794 {
1795         struct dmar_domain *domain;
1796         int ret;
1797
1798         domain = get_domain_for_dev(pdev,
1799                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1800         if (!domain) {
1801                 printk(KERN_ERR
1802                         "Allocating domain for %s failed\n", pci_name(pdev));
1803                 return NULL;
1804         }
1805
1806         /* make sure context mapping is ok */
1807         if (unlikely(!domain_context_mapped(domain, pdev))) {
1808                 ret = domain_context_mapping(domain, pdev);
1809                 if (ret) {
1810                         printk(KERN_ERR
1811                                 "Domain context map for %s failed\n",
1812                                 pci_name(pdev));
1813                         return NULL;
1814                 }
1815         }
1816
1817         return domain;
1818 }
1819
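/*
 * __intel_map_single() is the core map path.  Devices marked
 * DUMMY_DEVICE_DOMAIN_INFO bypass translation and get the physical address
 * back unchanged.  Otherwise: find (or create) the device's domain, round
 * the request up to whole pages, allocate an IOVA range (trying the 32-bit
 * space first for 64-bit capable devices unless dmar_forcedac is set),
 * install read/write PTEs, and flush the IOTLB (or the write buffer) so the
 * new mapping becomes visible.
 */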
1820 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
1821                                      size_t size, int dir, u64 dma_mask)
1822 {
1823         struct pci_dev *pdev = to_pci_dev(hwdev);
1824         struct dmar_domain *domain;
1825         phys_addr_t start_paddr;
1826         struct iova *iova;
1827         int prot = 0;
1828         int ret;
1829
1830         BUG_ON(dir == DMA_NONE);
1831         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1832                 return paddr;
1833
1834         domain = get_valid_domain_for_dev(pdev);
1835         if (!domain)
1836                 return 0;
1837
1838         size = aligned_size((u64)paddr, size);
1839
1840         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
1841         if (!iova)
1842                 goto error;
1843
1844         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
1845
1846         /*
1847          * Check if DMAR supports zero-length reads on write-only
1848          * mappings.
1849          */
1850         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1851                         !cap_zlr(domain->iommu->cap))
1852                 prot |= DMA_PTE_READ;
1853         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1854                 prot |= DMA_PTE_WRITE;
1855         /*
1856          * paddr ~ paddr + size might cover only part of a page, so map the
1857          * whole page.  Note: if two parts of one page are mapped separately,
1858          * two guest addresses may map to the same host paddr, but this
1859          * is not a big problem.
1860          */
1861         ret = domain_page_mapping(domain, start_paddr,
1862                 ((u64)paddr) & PAGE_MASK, size, prot);
1863         if (ret)
1864                 goto error;
1865
1866         /* it's a non-present to present mapping */
1867         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1868                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
1869         if (ret)
1870                 iommu_flush_write_buffer(domain->iommu);
1871
1872         return start_paddr + ((u64)paddr & (~PAGE_MASK));
1873
1874 error:
1875         if (iova)
1876                 __free_iova(&domain->iovad, iova);
1877         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1878                 pci_name(pdev), size, (unsigned long long)paddr, dir);
1879         return 0;
1880 }
1881
1882 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
1883                             size_t size, int dir)
1884 {
1885         return __intel_map_single(hwdev, paddr, size, dir,
1886                                   to_pci_dev(hwdev)->dma_mask);
1887 }
1888
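/*
 * Deferred unmap machinery: when intel_iommu_strict is not set,
 * intel_unmap_single() does not flush the IOTLB synchronously.  Instead
 * add_unmap() parks the IOVA in the owning IOMMU's deferred_flush table;
 * flush_unmaps() then does one global IOTLB flush per IOMMU and frees the
 * queued IOVAs, triggered either by a 10 ms timer or as soon as
 * HIGH_WATER_MARK entries have accumulated.
 */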
1889 static void flush_unmaps(void)
1890 {
1891         int i, j;
1892
1893         timer_on = 0;
1894
1895         /* just flush them all */
1896         for (i = 0; i < g_num_of_iommus; i++) {
1897                 if (deferred_flush[i].next) {
1898                         struct intel_iommu *iommu =
1899                                 deferred_flush[i].domain[0]->iommu;
1900
1901                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1902                                                  DMA_TLB_GLOBAL_FLUSH, 0);
1903                         for (j = 0; j < deferred_flush[i].next; j++) {
1904                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
1905                                                 deferred_flush[i].iova[j]);
1906                         }
1907                         deferred_flush[i].next = 0;
1908                 }
1909         }
1910
1911         list_size = 0;
1912 }
1913
1914 static void flush_unmaps_timeout(unsigned long data)
1915 {
1916         unsigned long flags;
1917
1918         spin_lock_irqsave(&async_umap_flush_lock, flags);
1919         flush_unmaps();
1920         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1921 }
1922
1923 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1924 {
1925         unsigned long flags;
1926         int next, iommu_id;
1927
1928         spin_lock_irqsave(&async_umap_flush_lock, flags);
1929         if (list_size == HIGH_WATER_MARK)
1930                 flush_unmaps();
1931
1932         iommu_id = dom->iommu->seq_id;
1933
1934         next = deferred_flush[iommu_id].next;
1935         deferred_flush[iommu_id].domain[next] = dom;
1936         deferred_flush[iommu_id].iova[next] = iova;
1937         deferred_flush[iommu_id].next++;
1938
1939         if (!timer_on) {
1940                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1941                 timer_on = 1;
1942         }
1943         list_size++;
1944         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1945 }
1946
1947 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
1948                         int dir)
1949 {
1950         struct pci_dev *pdev = to_pci_dev(dev);
1951         struct dmar_domain *domain;
1952         unsigned long start_addr;
1953         struct iova *iova;
1954
1955         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1956                 return;
1957         domain = find_domain(pdev);
1958         BUG_ON(!domain);
1959
1960         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1961         if (!iova)
1962                 return;
1963
1964         start_addr = iova->pfn_lo << PAGE_SHIFT;
1965         size = aligned_size((u64)dev_addr, size);
1966
1967         pr_debug("Device %s unmapping: %lx@%llx\n",
1968                 pci_name(pdev), size, (unsigned long long)start_addr);
1969
1970         /*  clear the whole page */
1971         dma_pte_clear_range(domain, start_addr, start_addr + size);
1972         /* free page tables */
1973         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1974         if (intel_iommu_strict) {
1975                 if (iommu_flush_iotlb_psi(domain->iommu,
1976                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
1977                         iommu_flush_write_buffer(domain->iommu);
1978                 /* free iova */
1979                 __free_iova(&domain->iovad, iova);
1980         } else {
1981                 add_unmap(domain, iova);
1982                 /*
1983                  * Queue up the release of the unmap to save the 1/6th of the
1984                  * CPU time otherwise used by the iotlb flush operation...
1985                  */
1986         }
1987 }
1988
1989 void *intel_alloc_coherent(struct device *hwdev, size_t size,
1990                            dma_addr_t *dma_handle, gfp_t flags)
1991 {
1992         void *vaddr;
1993         int order;
1994
1995         size = PAGE_ALIGN(size);
1996         order = get_order(size);
1997         flags &= ~(GFP_DMA | GFP_DMA32);
1998
1999         vaddr = (void *)__get_free_pages(flags, order);
2000         if (!vaddr)
2001                 return NULL;
2002         memset(vaddr, 0, size);
2003
2004         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2005                                          DMA_BIDIRECTIONAL,
2006                                          hwdev->coherent_dma_mask);
2007         if (*dma_handle)
2008                 return vaddr;
2009         free_pages((unsigned long)vaddr, order);
2010         return NULL;
2011 }
2012
2013 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2014                          dma_addr_t dma_handle)
2015 {
2016         int order;
2017
2018         size = PAGE_ALIGN(size);
2019         order = get_order(size);
2020
2021         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2022         free_pages((unsigned long)vaddr, order);
2023 }
2024
2025 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2026
2027 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2028                     int nelems, int dir)
2029 {
2030         int i;
2031         struct pci_dev *pdev = to_pci_dev(hwdev);
2032         struct dmar_domain *domain;
2033         unsigned long start_addr;
2034         struct iova *iova;
2035         size_t size = 0;
2036         void *addr;
2037         struct scatterlist *sg;
2038
2039         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2040                 return;
2041
2042         domain = find_domain(pdev);
2043
2044         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2045         if (!iova)
2046                 return;
2047         for_each_sg(sglist, sg, nelems, i) {
2048                 addr = SG_ENT_VIRT_ADDRESS(sg);
2049                 size += aligned_size((u64)addr, sg->length);
2050         }
2051
2052         start_addr = iova->pfn_lo << PAGE_SHIFT;
2053
2054         /*  clear the whole page */
2055         dma_pte_clear_range(domain, start_addr, start_addr + size);
2056         /* free page tables */
2057         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2058
2059         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2060                         size >> VTD_PAGE_SHIFT, 0))
2061                 iommu_flush_write_buffer(domain->iommu);
2062
2063         /* free iova */
2064         __free_iova(&domain->iovad, iova);
2065 }
2066
2067 static int intel_nontranslate_map_sg(struct device *hddev,
2068         struct scatterlist *sglist, int nelems, int dir)
2069 {
2070         int i;
2071         struct scatterlist *sg;
2072
2073         for_each_sg(sglist, sg, nelems, i) {
2074                 BUG_ON(!sg_page(sg));
2075                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2076                 sg->dma_length = sg->length;
2077         }
2078         return nelems;
2079 }
2080
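/*
 * intel_map_sg() sizes one contiguous IOVA region large enough for all
 * segments (each rounded to whole pages by aligned_size()), allocates it
 * with a single __intel_alloc_iova() call, and maps the segments back to
 * back inside it; each sg->dma_address becomes the region start plus the
 * running offset plus the segment's offset within its first page.
 */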
2081 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2082                  int dir)
2083 {
2084         void *addr;
2085         int i;
2086         struct pci_dev *pdev = to_pci_dev(hwdev);
2087         struct dmar_domain *domain;
2088         size_t size = 0;
2089         int prot = 0;
2090         size_t offset = 0;
2091         struct iova *iova = NULL;
2092         int ret;
2093         struct scatterlist *sg;
2094         unsigned long start_addr;
2095
2096         BUG_ON(dir == DMA_NONE);
2097         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2098                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2099
2100         domain = get_valid_domain_for_dev(pdev);
2101         if (!domain)
2102                 return 0;
2103
2104         for_each_sg(sglist, sg, nelems, i) {
2105                 addr = SG_ENT_VIRT_ADDRESS(sg);
2106                 addr = (void *)virt_to_phys(addr);
2107                 size += aligned_size((u64)addr, sg->length);
2108         }
2109
2110         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2111         if (!iova) {
2112                 sglist->dma_length = 0;
2113                 return 0;
2114         }
2115
2116         /*
2117          * Check if DMAR supports zero-length reads on write-only
2118          * mappings.
2119          */
2120         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2121                         !cap_zlr(domain->iommu->cap))
2122                 prot |= DMA_PTE_READ;
2123         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2124                 prot |= DMA_PTE_WRITE;
2125
2126         start_addr = iova->pfn_lo << PAGE_SHIFT;
2127         offset = 0;
2128         for_each_sg(sglist, sg, nelems, i) {
2129                 addr = SG_ENT_VIRT_ADDRESS(sg);
2130                 addr = (void *)virt_to_phys(addr);
2131                 size = aligned_size((u64)addr, sg->length);
2132                 ret = domain_page_mapping(domain, start_addr + offset,
2133                         ((u64)addr) & PAGE_MASK,
2134                         size, prot);
2135                 if (ret) {
2136                         /*  clear the page */
2137                         dma_pte_clear_range(domain, start_addr,
2138                                   start_addr + offset);
2139                         /* free page tables */
2140                         dma_pte_free_pagetable(domain, start_addr,
2141                                   start_addr + offset);
2142                         /* free iova */
2143                         __free_iova(&domain->iovad, iova);
2144                         return 0;
2145                 }
2146                 sg->dma_address = start_addr + offset +
2147                                 ((u64)addr & (~PAGE_MASK));
2148                 sg->dma_length = sg->length;
2149                 offset += size;
2150         }
2151
2152         /* it's a non-present to present mapping */
2153         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2154                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2155                 iommu_flush_write_buffer(domain->iommu);
2156         return nelems;
2157 }
2158
2159 static struct dma_mapping_ops intel_dma_ops = {
2160         .alloc_coherent = intel_alloc_coherent,
2161         .free_coherent = intel_free_coherent,
2162         .map_single = intel_map_single,
2163         .unmap_single = intel_unmap_single,
2164         .map_sg = intel_map_sg,
2165         .unmap_sg = intel_unmap_sg,
2166 };
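
/*
 * Drivers never call these ops directly: once intel_iommu_init() installs
 * intel_dma_ops as the global dma_ops, ordinary DMA API calls dispatch
 * through it.  A rough, hypothetical-driver sketch (buf, len and pdev are
 * placeholders, not names from this file):
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	... check for errors, let the device DMA using "handle" ...
 *	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 *
 * dma_map_single() ends up in intel_map_single() above and
 * dma_unmap_single() in intel_unmap_single().
 */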
2167
2168 static inline int iommu_domain_cache_init(void)
2169 {
2170         int ret = 0;
2171
2172         iommu_domain_cache = kmem_cache_create("iommu_domain",
2173                                          sizeof(struct dmar_domain),
2174                                          0,
2175                                          SLAB_HWCACHE_ALIGN,
2176                                          NULL);
2177
2178         if (!iommu_domain_cache) {
2179                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2180                 ret = -ENOMEM;
2181         }
2182
2183         return ret;
2184 }
2185
2186 static inline int iommu_devinfo_cache_init(void)
2187 {
2188         int ret = 0;
2189
2190         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2191                                          sizeof(struct device_domain_info),
2192                                          0,
2193                                          SLAB_HWCACHE_ALIGN,
2194                                          NULL);
2195         if (!iommu_devinfo_cache) {
2196                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2197                 ret = -ENOMEM;
2198         }
2199
2200         return ret;
2201 }
2202
2203 static inline int iommu_iova_cache_init(void)
2204 {
2205         int ret = 0;
2206
2207         iommu_iova_cache = kmem_cache_create("iommu_iova",
2208                                          sizeof(struct iova),
2209                                          0,
2210                                          SLAB_HWCACHE_ALIGN,
2211                                          NULL);
2212         if (!iommu_iova_cache) {
2213                 printk(KERN_ERR "Couldn't create iova cache\n");
2214                 ret = -ENOMEM;
2215         }
2216
2217         return ret;
2218 }
2219
2220 static int __init iommu_init_mempool(void)
2221 {
2222         int ret;
2223         ret = iommu_iova_cache_init();
2224         if (ret)
2225                 return ret;
2226
2227         ret = iommu_domain_cache_init();
2228         if (ret)
2229                 goto domain_error;
2230
2231         ret = iommu_devinfo_cache_init();
2232         if (!ret)
2233                 return ret;
2234
2235         kmem_cache_destroy(iommu_domain_cache);
2236 domain_error:
2237         kmem_cache_destroy(iommu_iova_cache);
2238
2239         return -ENOMEM;
2240 }
2241
2242 static void __init iommu_exit_mempool(void)
2243 {
2244         kmem_cache_destroy(iommu_devinfo_cache);
2245         kmem_cache_destroy(iommu_domain_cache);
2246         kmem_cache_destroy(iommu_iova_cache);
2247
2248 }
2249
2250 static void __init init_no_remapping_devices(void)
2251 {
2252         struct dmar_drhd_unit *drhd;
2253
2254         for_each_drhd_unit(drhd) {
2255                 if (!drhd->include_all) {
2256                         int i;
2257                         for (i = 0; i < drhd->devices_cnt; i++)
2258                                 if (drhd->devices[i] != NULL)
2259                                         break;
2260                         /* ignore DMAR unit if no pci devices exist */
2261                         if (i == drhd->devices_cnt)
2262                                 drhd->ignored = 1;
2263                 }
2264         }
2265
2266         if (dmar_map_gfx)
2267                 return;
2268
2269         for_each_drhd_unit(drhd) {
2270                 int i;
2271                 if (drhd->ignored || drhd->include_all)
2272                         continue;
2273
2274                 for (i = 0; i < drhd->devices_cnt; i++)
2275                         if (drhd->devices[i] &&
2276                                 !IS_GFX_DEVICE(drhd->devices[i]))
2277                                 break;
2278
2279                 if (i < drhd->devices_cnt)
2280                         continue;
2281
2282                 /* bypass IOMMU if it is just for gfx devices */
2283                 drhd->ignored = 1;
2284                 for (i = 0; i < drhd->devices_cnt; i++) {
2285                         if (!drhd->devices[i])
2286                                 continue;
2287                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2288                 }
2289         }
2290 }
2291
2292 int __init intel_iommu_init(void)
2293 {
2294         int ret = 0;
2295
2296         if (dmar_table_init())
2297                 return  -ENODEV;
2298
2299         if (dmar_dev_scope_init())
2300                 return  -ENODEV;
2301
2302         /*
2303          * Check the need for DMA-remapping initialization now.
2304          * Above initialization will also be used by Interrupt-remapping.
2305          */
2306         if (no_iommu || swiotlb || dmar_disabled)
2307                 return -ENODEV;
2308
2309         iommu_init_mempool();
2310         dmar_init_reserved_ranges();
2311
2312         init_no_remapping_devices();
2313
2314         ret = init_dmars();
2315         if (ret) {
2316                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2317                 put_iova_domain(&reserved_iova_list);
2318                 iommu_exit_mempool();
2319                 return ret;
2320         }
2321         printk(KERN_INFO
2322         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2323
2324         init_timer(&unmap_timer);
2325         force_iommu = 1;
2326         dma_ops = &intel_dma_ops;
2327         return 0;
2328 }
2329
2330 void intel_iommu_domain_exit(struct dmar_domain *domain)
2331 {
2332         u64 end;
2333
2334         /* Domain 0 is reserved, so don't process it */
2335         if (!domain)
2336                 return;
2337
2338         end = DOMAIN_MAX_ADDR(domain->gaw);
2339         end = end & (~VTD_PAGE_MASK);
2340
2341         /* clear ptes */
2342         dma_pte_clear_range(domain, 0, end);
2343
2344         /* free page tables */
2345         dma_pte_free_pagetable(domain, 0, end);
2346
2347         iommu_free_domain(domain);
2348         free_domain_mem(domain);
2349 }
2350 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2351
2352 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2353 {
2354         struct dmar_drhd_unit *drhd;
2355         struct dmar_domain *domain;
2356         struct intel_iommu *iommu;
2357
2358         drhd = dmar_find_matched_drhd_unit(pdev);
2359         if (!drhd) {
2360                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2361                 return NULL;
2362         }
2363
2364         iommu = drhd->iommu;
2365         if (!iommu) {
2366                 printk(KERN_ERR
2367                         "intel_iommu_domain_alloc: iommu == NULL\n");
2368                 return NULL;
2369         }
2370         domain = iommu_alloc_domain(iommu);
2371         if (!domain) {
2372                 printk(KERN_ERR
2373                         "intel_iommu_domain_alloc: domain == NULL\n");
2374                 return NULL;
2375         }
2376         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2377                 printk(KERN_ERR
2378                         "intel_iommu_domain_alloc: domain_init() failed\n");
2379                 intel_iommu_domain_exit(domain);
2380                 return NULL;
2381         }
2382         return domain;
2383 }
2384 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2385
2386 int intel_iommu_context_mapping(
2387         struct dmar_domain *domain, struct pci_dev *pdev)
2388 {
2389         int rc;
2390         rc = domain_context_mapping(domain, pdev);
2391         return rc;
2392 }
2393 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2394
2395 int intel_iommu_page_mapping(
2396         struct dmar_domain *domain, dma_addr_t iova,
2397         u64 hpa, size_t size, int prot)
2398 {
2399         int rc;
2400         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2401         return rc;
2402 }
2403 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2404
2405 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2406 {
2407         detach_domain_for_dev(domain, bus, devfn);
2408 }
2409 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2410
2411 struct dmar_domain *
2412 intel_iommu_find_domain(struct pci_dev *pdev)
2413 {
2414         return find_domain(pdev);
2415 }
2416 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2417
2418 int intel_iommu_found(void)
2419 {
2420         return g_num_of_iommus;
2421 }
2422 EXPORT_SYMBOL_GPL(intel_iommu_found);
2423
2424 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2425 {
2426         struct dma_pte *pte;
2427         u64 pfn;
2428
2429         pfn = 0;
2430         pte = addr_to_dma_pte(domain, iova);
2431
2432         if (pte)
2433                 pfn = dma_pte_addr(*pte);
2434
2435         return pfn >> VTD_PAGE_SHIFT;
2436 }
2437 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
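
/*
 * The intel_iommu_* exports above form a small external interface for other
 * kernel code (such as device-assignment support).  A hedged sketch of the
 * intended calling sequence; iova, hpa and size are caller-chosen values:
 *
 *	struct dmar_domain *domain;
 *
 *	domain = intel_iommu_domain_alloc(pdev);
 *	if (!domain)
 *		return -ENOMEM;
 *	intel_iommu_context_mapping(domain, pdev);
 *	intel_iommu_page_mapping(domain, iova, hpa, size,
 *				 DMA_PTE_READ | DMA_PTE_WRITE);
 *	...
 *	intel_iommu_detach_dev(domain, pdev->bus->number, pdev->devfn);
 *	intel_iommu_domain_exit(domain);
 */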