drivers/pci/intel-iommu.c (karo-tx-linux.git)
intel-iommu: trivially inline context entry macros
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /*
61  * 0: Present
62  * 1-11: Reserved
63  * 12-63: Context Ptr (12 - (haw-1))
64  * 64-127: Reserved
65  */
66 struct root_entry {
67         u64     val;
68         u64     rsvd1;
69 };
70 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
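/*
 * With 4KiB pages a root_entry is 16 bytes, so ROOT_ENTRY_NR == 256:
 * one root entry per PCI bus number.  Each present root entry points to
 * that bus's 4KiB context table, indexed by devfn (see
 * device_to_context_entry() below).
 */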
71 static inline bool root_present(struct root_entry *root)
72 {
73         return (root->val & 1);
74 }
75 static inline void set_root_present(struct root_entry *root)
76 {
77         root->val |= 1;
78 }
79 static inline void set_root_value(struct root_entry *root, unsigned long value)
80 {
81         root->val |= value & VTD_PAGE_MASK;
82 }
83
84 static inline struct context_entry *
85 get_context_addr_from_root(struct root_entry *root)
86 {
87         return (struct context_entry *)
88                 (root_present(root)?phys_to_virt(
89                 root->val & VTD_PAGE_MASK) :
90                 NULL);
91 }
92
93 /*
94  * low 64 bits:
95  * 0: present
96  * 1: fault processing disable
97  * 2-3: translation type
98  * 12-63: address space root
99  * high 64 bits:
100  * 0-2: address width
101  * 3-6: available
102  * 8-23: domain id
103  */
104 struct context_entry {
105         u64 lo;
106         u64 hi;
107 };
108
109 static inline bool context_present(struct context_entry *context)
110 {
111         return (context->lo & 1);
112 }
113 static inline void context_set_present(struct context_entry *context)
114 {
115         context->lo |= 1;
116 }
117
118 static inline void context_set_fault_enable(struct context_entry *context)
119 {
120         context->lo &= (((u64)-1) << 2) | 1;
121 }
122
123 #define CONTEXT_TT_MULTI_LEVEL 0
124
125 static inline void context_set_translation_type(struct context_entry *context,
126                                                 unsigned long value)
127 {
128         context->lo &= (((u64)-1) << 4) | 3;
129         context->lo |= (value & 3) << 2;
130 }
131
132 static inline void context_set_address_root(struct context_entry *context,
133                                             unsigned long value)
134 {
135         context->lo |= value & VTD_PAGE_MASK;
136 }
137
138 static inline void context_set_address_width(struct context_entry *context,
139                                              unsigned long value)
140 {
141         context->hi |= value & 7;
142 }
143
144 static inline void context_set_domain_id(struct context_entry *context,
145                                          unsigned long value)
146 {
147         context->hi |= (value & ((1 << 16) - 1)) << 8;
148 }
149
150 static inline void context_clear_entry(struct context_entry *context)
151 {
152         context->lo = 0;
153         context->hi = 0;
154 }
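#if 0
/*
 * Illustration only (not compiled): how the helpers above combine into a
 * complete context entry, using arbitrary example values.  This mirrors
 * what domain_context_mapping_one() does further down.
 */
	context_set_domain_id(context, 5);
	context_set_address_width(context, 2);	/* agaw 2: 4-level, 48-bit */
	context_set_address_root(context, 0x3c0ff000); /* pgd physical addr */
	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(context);
	context_set_present(context);
	/* result: context->hi == 0x502, context->lo == 0x3c0ff001 */
#endif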
155
156 /*
157  * 0: readable
158  * 1: writable
159  * 2-6: reserved
160  * 7: super page
161  * 8-11: available
162  * 12-63: Host physical address
163  */
164 struct dma_pte {
165         u64 val;
166 };
167 #define dma_clear_pte(p)        do {(p).val = 0;} while (0)
168
169 #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
170 #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
171 #define dma_set_pte_prot(p, prot) \
172                 do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
173 #define dma_pte_addr(p) ((p).val & VTD_PAGE_MASK)
174 #define dma_set_pte_addr(p, addr) do {\
175                 (p).val |= ((addr) & VTD_PAGE_MASK); } while (0)
176 #define dma_pte_present(p) (((p).val & 3) != 0)
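#if 0
/*
 * Illustration only (not compiled): encoding and decoding a PTE with the
 * macros above, using an arbitrary example page.
 */
	struct dma_pte pte = { .val = 0 };

	dma_set_pte_addr(pte, 0x3c100000ULL);	/* host physical page */
	dma_set_pte_prot(pte, DMA_PTE_READ | DMA_PTE_WRITE);
	/* pte.val == 0x3c100003; dma_pte_addr(pte) == 0x3c100000 and
	 * dma_pte_present(pte) is true. */
#endif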
177
178 struct dmar_domain {
179         int     id;                     /* domain id */
180         struct intel_iommu *iommu;      /* back pointer to owning iommu */
181
182         struct list_head devices;       /* all devices' list */
183         struct iova_domain iovad;       /* iova's that belong to this domain */
184
185         struct dma_pte  *pgd;           /* virtual address */
186         spinlock_t      mapping_lock;   /* page table lock */
187         int             gaw;            /* max guest address width */
188
189         /* adjusted guest address width, 0 is level 2 30-bit */
190         int             agaw;
191
192 #define DOMAIN_FLAG_MULTIPLE_DEVICES 1
193         int             flags;
194 };
195
196 /* PCI domain-device relationship */
197 struct device_domain_info {
198         struct list_head link;  /* link to domain siblings */
199         struct list_head global; /* link to global list */
200         u8 bus;                 /* PCI bus number */
201         u8 devfn;               /* PCI devfn number */
202         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
203         struct dmar_domain *domain; /* pointer to domain */
204 };
205
206 static void flush_unmaps_timeout(unsigned long data);
207
208 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
209
210 #define HIGH_WATER_MARK 250
211 struct deferred_flush_tables {
212         int next;
213         struct iova *iova[HIGH_WATER_MARK];
214         struct dmar_domain *domain[HIGH_WATER_MARK];
215 };
216
217 static struct deferred_flush_tables *deferred_flush;
218
219 /* number of intel_iommus in the system */
220 static int g_num_of_iommus;
221
222 static DEFINE_SPINLOCK(async_umap_flush_lock);
223 static LIST_HEAD(unmaps_to_do);
224
225 static int timer_on;
226 static long list_size;
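/*
 * IOTLB flushing of unmapped IOVAs is batched: entries are queued in the
 * deferred_flush tables (HIGH_WATER_MARK slots each) and released later by
 * flush_unmaps_timeout() via unmap_timer.  Booting with intel_iommu=strict
 * (see intel_iommu_setup() below) disables this batching.
 */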
227
228 static void domain_remove_dev_info(struct dmar_domain *domain);
229
230 int dmar_disabled;
231 static int __initdata dmar_map_gfx = 1;
232 static int dmar_forcedac;
233 static int intel_iommu_strict;
234
235 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
236 static DEFINE_SPINLOCK(device_domain_lock);
237 static LIST_HEAD(device_domain_list);
238
239 static int __init intel_iommu_setup(char *str)
240 {
241         if (!str)
242                 return -EINVAL;
243         while (*str) {
244                 if (!strncmp(str, "off", 3)) {
245                         dmar_disabled = 1;
246                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
247                 } else if (!strncmp(str, "igfx_off", 8)) {
248                         dmar_map_gfx = 0;
249                         printk(KERN_INFO
250                                 "Intel-IOMMU: disable GFX device mapping\n");
251                 } else if (!strncmp(str, "forcedac", 8)) {
252                         printk(KERN_INFO
253                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
254                         dmar_forcedac = 1;
255                 } else if (!strncmp(str, "strict", 6)) {
256                         printk(KERN_INFO
257                                 "Intel-IOMMU: disable batched IOTLB flush\n");
258                         intel_iommu_strict = 1;
259                 }
260
261                 str += strcspn(str, ",");
262                 while (*str == ',')
263                         str++;
264         }
265         return 0;
266 }
267 __setup("intel_iommu=", intel_iommu_setup);
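/*
 * Example kernel command lines; options may be combined, separated by
 * commas, e.g.:
 *	intel_iommu=off
 *	intel_iommu=igfx_off,strict
 *	intel_iommu=forcedac
 */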
268
269 static struct kmem_cache *iommu_domain_cache;
270 static struct kmem_cache *iommu_devinfo_cache;
271 static struct kmem_cache *iommu_iova_cache;
272
273 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
274 {
275         unsigned int flags;
276         void *vaddr;
277
278         /* trying to avoid low memory issues */
279         flags = current->flags & PF_MEMALLOC;
280         current->flags |= PF_MEMALLOC;
281         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
282         current->flags &= (~PF_MEMALLOC | flags);
283         return vaddr;
284 }
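/*
 * Note on the PF_MEMALLOC dance above: the caller's PF_MEMALLOC bit is
 * saved, set for the allocation so it may dip into reserves, and then
 * restored.  (~PF_MEMALLOC | flags) is all ones when PF_MEMALLOC was
 * already set and ~PF_MEMALLOC when it was not, so the bit is cleared
 * only if we set it ourselves.  alloc_pgtable_page() uses the same
 * pattern.
 */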
285
286
287 static inline void *alloc_pgtable_page(void)
288 {
289         unsigned int flags;
290         void *vaddr;
291
292         /* trying to avoid low memory issues */
293         flags = current->flags & PF_MEMALLOC;
294         current->flags |= PF_MEMALLOC;
295         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
296         current->flags &= (~PF_MEMALLOC | flags);
297         return vaddr;
298 }
299
300 static inline void free_pgtable_page(void *vaddr)
301 {
302         free_page((unsigned long)vaddr);
303 }
304
305 static inline void *alloc_domain_mem(void)
306 {
307         return iommu_kmem_cache_alloc(iommu_domain_cache);
308 }
309
310 static void free_domain_mem(void *vaddr)
311 {
312         kmem_cache_free(iommu_domain_cache, vaddr);
313 }
314
315 static inline void * alloc_devinfo_mem(void)
316 {
317         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
318 }
319
320 static inline void free_devinfo_mem(void *vaddr)
321 {
322         kmem_cache_free(iommu_devinfo_cache, vaddr);
323 }
324
325 struct iova *alloc_iova_mem(void)
326 {
327         return iommu_kmem_cache_alloc(iommu_iova_cache);
328 }
329
330 void free_iova_mem(struct iova *iova)
331 {
332         kmem_cache_free(iommu_iova_cache, iova);
333 }
334
335 /* Gets context entry for a given bus and devfn */
336 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
337                 u8 bus, u8 devfn)
338 {
339         struct root_entry *root;
340         struct context_entry *context;
341         unsigned long phy_addr;
342         unsigned long flags;
343
344         spin_lock_irqsave(&iommu->lock, flags);
345         root = &iommu->root_entry[bus];
346         context = get_context_addr_from_root(root);
347         if (!context) {
348                 context = (struct context_entry *)alloc_pgtable_page();
349                 if (!context) {
350                         spin_unlock_irqrestore(&iommu->lock, flags);
351                         return NULL;
352                 }
353                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
354                 phy_addr = virt_to_phys((void *)context);
355                 set_root_value(root, phy_addr);
356                 set_root_present(root);
357                 __iommu_flush_cache(iommu, root, sizeof(*root));
358         }
359         spin_unlock_irqrestore(&iommu->lock, flags);
360         return &context[devfn];
361 }
362
363 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
364 {
365         struct root_entry *root;
366         struct context_entry *context;
367         int ret;
368         unsigned long flags;
369
370         spin_lock_irqsave(&iommu->lock, flags);
371         root = &iommu->root_entry[bus];
372         context = get_context_addr_from_root(root);
373         if (!context) {
374                 ret = 0;
375                 goto out;
376         }
377         ret = context_present(&context[devfn]);
378 out:
379         spin_unlock_irqrestore(&iommu->lock, flags);
380         return ret;
381 }
382
383 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
384 {
385         struct root_entry *root;
386         struct context_entry *context;
387         unsigned long flags;
388
389         spin_lock_irqsave(&iommu->lock, flags);
390         root = &iommu->root_entry[bus];
391         context = get_context_addr_from_root(root);
392         if (context) {
393                 context_clear_entry(&context[devfn]);
394                 __iommu_flush_cache(iommu, &context[devfn], \
395                         sizeof(*context));
396         }
397         spin_unlock_irqrestore(&iommu->lock, flags);
398 }
399
400 static void free_context_table(struct intel_iommu *iommu)
401 {
402         struct root_entry *root;
403         int i;
404         unsigned long flags;
405         struct context_entry *context;
406
407         spin_lock_irqsave(&iommu->lock, flags);
408         if (!iommu->root_entry) {
409                 goto out;
410         }
411         for (i = 0; i < ROOT_ENTRY_NR; i++) {
412                 root = &iommu->root_entry[i];
413                 context = get_context_addr_from_root(root);
414                 if (context)
415                         free_pgtable_page(context);
416         }
417         free_pgtable_page(iommu->root_entry);
418         iommu->root_entry = NULL;
419 out:
420         spin_unlock_irqrestore(&iommu->lock, flags);
421 }
422
423 /* page table handling */
424 #define LEVEL_STRIDE            (9)
425 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
426
427 static inline int agaw_to_level(int agaw)
428 {
429         return agaw + 2;
430 }
431
432 static inline int agaw_to_width(int agaw)
433 {
434         return 30 + agaw * LEVEL_STRIDE;
435
436 }
437
438 static inline int width_to_agaw(int width)
439 {
440         return (width - 30) / LEVEL_STRIDE;
441 }
442
443 static inline unsigned int level_to_offset_bits(int level)
444 {
445         return (12 + (level - 1) * LEVEL_STRIDE);
446 }
447
448 static inline int address_level_offset(u64 addr, int level)
449 {
450         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
451 }
452
453 static inline u64 level_mask(int level)
454 {
455         return ((u64)-1 << level_to_offset_bits(level));
456 }
457
458 static inline u64 level_size(int level)
459 {
460         return ((u64)1 << level_to_offset_bits(level));
461 }
462
463 static inline u64 align_to_level(u64 addr, int level)
464 {
465         return ((addr + level_size(level) - 1) & level_mask(level));
466 }
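/*
 * Worked example for the helpers above: with agaw == 1 the page table has
 * agaw_to_level(1) == 3 levels covering agaw_to_width(1) == 39 bits, and
 * for addr == 0x12345000:
 *	level 3 index = (addr >> 30) & 0x1ff = 0x000
 *	level 2 index = (addr >> 21) & 0x1ff = 0x091
 *	level 1 index = (addr >> 12) & 0x1ff = 0x145
 * level_size(2) is 2MiB, so align_to_level(0x12345000, 2) == 0x12400000.
 */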
467
468 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
469 {
470         int addr_width = agaw_to_width(domain->agaw);
471         struct dma_pte *parent, *pte = NULL;
472         int level = agaw_to_level(domain->agaw);
473         int offset;
474         unsigned long flags;
475
476         BUG_ON(!domain->pgd);
477
478         addr &= (((u64)1) << addr_width) - 1;
479         parent = domain->pgd;
480
481         spin_lock_irqsave(&domain->mapping_lock, flags);
482         while (level > 0) {
483                 void *tmp_page;
484
485                 offset = address_level_offset(addr, level);
486                 pte = &parent[offset];
487                 if (level == 1)
488                         break;
489
490                 if (!dma_pte_present(*pte)) {
491                         tmp_page = alloc_pgtable_page();
492
493                         if (!tmp_page) {
494                                 spin_unlock_irqrestore(&domain->mapping_lock,
495                                         flags);
496                                 return NULL;
497                         }
498                         __iommu_flush_cache(domain->iommu, tmp_page,
499                                         PAGE_SIZE);
500                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
501                         /*
502                          * high level table always sets r/w, last level page
503                          * table control read/write
504                          */
505                         dma_set_pte_readable(*pte);
506                         dma_set_pte_writable(*pte);
507                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
508                 }
509                 parent = phys_to_virt(dma_pte_addr(*pte));
510                 level--;
511         }
512
513         spin_unlock_irqrestore(&domain->mapping_lock, flags);
514         return pte;
515 }
516
517 /* return address's pte at specific level */
518 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
519                 int level)
520 {
521         struct dma_pte *parent, *pte = NULL;
522         int total = agaw_to_level(domain->agaw);
523         int offset;
524
525         parent = domain->pgd;
526         while (level <= total) {
527                 offset = address_level_offset(addr, total);
528                 pte = &parent[offset];
529                 if (level == total)
530                         return pte;
531
532                 if (!dma_pte_present(*pte))
533                         break;
534                 parent = phys_to_virt(dma_pte_addr(*pte));
535                 total--;
536         }
537         return NULL;
538 }
539
540 /* clear one page's page table */
541 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
542 {
543         struct dma_pte *pte = NULL;
544
545         /* get last level pte */
546         pte = dma_addr_level_pte(domain, addr, 1);
547
548         if (pte) {
549                 dma_clear_pte(*pte);
550                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
551         }
552 }
553
554 /* clear last level pte, a tlb flush should be followed */
555 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
556 {
557         int addr_width = agaw_to_width(domain->agaw);
558
559         start &= (((u64)1) << addr_width) - 1;
560         end &= (((u64)1) << addr_width) - 1;
561         /* in case it's partial page */
562         start = PAGE_ALIGN(start);
563         end &= PAGE_MASK;
564
565         /* we don't need lock here, nobody else touches the iova range */
566         while (start < end) {
567                 dma_pte_clear_one(domain, start);
568                 start += VTD_PAGE_SIZE;
569         }
570 }
571
572 /* free page table pages. last level pte should already be cleared */
573 static void dma_pte_free_pagetable(struct dmar_domain *domain,
574         u64 start, u64 end)
575 {
576         int addr_width = agaw_to_width(domain->agaw);
577         struct dma_pte *pte;
578         int total = agaw_to_level(domain->agaw);
579         int level;
580         u64 tmp;
581
582         start &= (((u64)1) << addr_width) - 1;
583         end &= (((u64)1) << addr_width) - 1;
584
585         /* we don't need lock here, nobody else touches the iova range */
586         level = 2;
587         while (level <= total) {
588                 tmp = align_to_level(start, level);
589                 if (tmp >= end || (tmp + level_size(level) > end))
590                         return;
591
592                 while (tmp < end) {
593                         pte = dma_addr_level_pte(domain, tmp, level);
594                         if (pte) {
595                                 free_pgtable_page(
596                                         phys_to_virt(dma_pte_addr(*pte)));
597                                 dma_clear_pte(*pte);
598                                 __iommu_flush_cache(domain->iommu,
599                                                 pte, sizeof(*pte));
600                         }
601                         tmp += level_size(level);
602                 }
603                 level++;
604         }
605         /* free pgd */
606         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
607                 free_pgtable_page(domain->pgd);
608                 domain->pgd = NULL;
609         }
610 }
611
612 /* iommu handling */
613 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
614 {
615         struct root_entry *root;
616         unsigned long flags;
617
618         root = (struct root_entry *)alloc_pgtable_page();
619         if (!root)
620                 return -ENOMEM;
621
622         __iommu_flush_cache(iommu, root, ROOT_SIZE);
623
624         spin_lock_irqsave(&iommu->lock, flags);
625         iommu->root_entry = root;
626         spin_unlock_irqrestore(&iommu->lock, flags);
627
628         return 0;
629 }
630
631 static void iommu_set_root_entry(struct intel_iommu *iommu)
632 {
633         void *addr;
634         u32 cmd, sts;
635         unsigned long flag;
636
637         addr = iommu->root_entry;
638
639         spin_lock_irqsave(&iommu->register_lock, flag);
640         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
641
642         cmd = iommu->gcmd | DMA_GCMD_SRTP;
643         writel(cmd, iommu->reg + DMAR_GCMD_REG);
644
645         /* Make sure hardware completes it */
646         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
647                 readl, (sts & DMA_GSTS_RTPS), sts);
648
649         spin_unlock_irqrestore(&iommu->register_lock, flag);
650 }
651
652 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
653 {
654         u32 val;
655         unsigned long flag;
656
657         if (!cap_rwbf(iommu->cap))
658                 return;
659         val = iommu->gcmd | DMA_GCMD_WBF;
660
661         spin_lock_irqsave(&iommu->register_lock, flag);
662         writel(val, iommu->reg + DMAR_GCMD_REG);
663
664         /* Make sure hardware completes it */
665         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
666                         readl, (!(val & DMA_GSTS_WBFS)), val);
667
668         spin_unlock_irqrestore(&iommu->register_lock, flag);
669 }
670
671 /* return value determines whether we need a write buffer flush */
672 static int __iommu_flush_context(struct intel_iommu *iommu,
673         u16 did, u16 source_id, u8 function_mask, u64 type,
674         int non_present_entry_flush)
675 {
676         u64 val = 0;
677         unsigned long flag;
678
679         /*
680          * In the non-present entry flush case, if the hardware doesn't cache
681          * non-present entries we do nothing; if it does cache them, we flush
682          * the entries of domain 0 (the domain id used to cache any
683          * non-present entries)
684          */
685         if (non_present_entry_flush) {
686                 if (!cap_caching_mode(iommu->cap))
687                         return 1;
688                 else
689                         did = 0;
690         }
691
692         switch (type) {
693         case DMA_CCMD_GLOBAL_INVL:
694                 val = DMA_CCMD_GLOBAL_INVL;
695                 break;
696         case DMA_CCMD_DOMAIN_INVL:
697                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
698                 break;
699         case DMA_CCMD_DEVICE_INVL:
700                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
701                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
702                 break;
703         default:
704                 BUG();
705         }
706         val |= DMA_CCMD_ICC;
707
708         spin_lock_irqsave(&iommu->register_lock, flag);
709         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
710
711         /* Make sure hardware completes it */
712         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
713                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
714
715         spin_unlock_irqrestore(&iommu->register_lock, flag);
716
717         /* flush context entry will implicitly flush write buffer */
718         return 0;
719 }
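/*
 * For DMA_CCMD_DEVICE_INVL the source_id is (bus << 8) | devfn, e.g.
 * device 02:01.3 gives 0x020b; see domain_context_mapping_one() below.
 */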
720
721 /* return value determines whether we need a write buffer flush */
722 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
723         u64 addr, unsigned int size_order, u64 type,
724         int non_present_entry_flush)
725 {
726         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
727         u64 val = 0, val_iva = 0;
728         unsigned long flag;
729
730         /*
731          * In the non-present entry flush case, if the hardware doesn't cache
732          * non-present entries we do nothing; if it does cache them, we flush
733          * the entries of domain 0 (the domain id used to cache any
734          * non-present entries)
735          */
736         if (non_present_entry_flush) {
737                 if (!cap_caching_mode(iommu->cap))
738                         return 1;
739                 else
740                         did = 0;
741         }
742
743         switch (type) {
744         case DMA_TLB_GLOBAL_FLUSH:
745                 /* global flush doesn't need to set IVA_REG */
746                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
747                 break;
748         case DMA_TLB_DSI_FLUSH:
749                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
750                 break;
751         case DMA_TLB_PSI_FLUSH:
752                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
753                 /* Note: always flush non-leaf currently */
754                 val_iva = size_order | addr;
755                 break;
756         default:
757                 BUG();
758         }
759         /* Note: set drain read/write */
760 #if 0
761         /*
762          * This is probably just to be extra safe.  It looks like we can
763          * ignore it without any impact.
764          */
765         if (cap_read_drain(iommu->cap))
766                 val |= DMA_TLB_READ_DRAIN;
767 #endif
768         if (cap_write_drain(iommu->cap))
769                 val |= DMA_TLB_WRITE_DRAIN;
770
771         spin_lock_irqsave(&iommu->register_lock, flag);
772         /* Note: Only uses first TLB reg currently */
773         if (val_iva)
774                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
775         dmar_writeq(iommu->reg + tlb_offset + 8, val);
776
777         /* Make sure hardware completes it */
778         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
779                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
780
781         spin_unlock_irqrestore(&iommu->register_lock, flag);
782
783         /* check IOTLB invalidation granularity */
784         if (DMA_TLB_IAIG(val) == 0)
785                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
786         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
787                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
788                         (unsigned long long)DMA_TLB_IIRG(type),
789                         (unsigned long long)DMA_TLB_IAIG(val));
790         /* flush iotlb entry will implicitly flush write buffer */
791         return 0;
792 }
793
794 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
795         u64 addr, unsigned int pages, int non_present_entry_flush)
796 {
797         unsigned int mask;
798
799         BUG_ON(addr & (~VTD_PAGE_MASK));
800         BUG_ON(pages == 0);
801
802         /* Fallback to domain selective flush if no PSI support */
803         if (!cap_pgsel_inv(iommu->cap))
804                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
805                                                 DMA_TLB_DSI_FLUSH,
806                                                 non_present_entry_flush);
807
808         /*
809          * PSI requires page size to be 2 ^ x, and the base address is naturally
810          * aligned to the size
811          */
812         mask = ilog2(__roundup_pow_of_two(pages));
813         /* Fallback to domain selective flush if size is too big */
814         if (mask > cap_max_amask_val(iommu->cap))
815                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
816                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
817
818         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
819                                         DMA_TLB_PSI_FLUSH,
820                                         non_present_entry_flush);
821 }
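/*
 * Example: pages == 5 rounds up to 8, so mask == ilog2(8) == 3 and the
 * PSI invalidation covers 2^3 == 8 pages starting at addr.
 */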
822
823 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
824 {
825         u32 pmen;
826         unsigned long flags;
827
828         spin_lock_irqsave(&iommu->register_lock, flags);
829         pmen = readl(iommu->reg + DMAR_PMEN_REG);
830         pmen &= ~DMA_PMEN_EPM;
831         writel(pmen, iommu->reg + DMAR_PMEN_REG);
832
833         /* wait for the protected region status bit to clear */
834         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
835                 readl, !(pmen & DMA_PMEN_PRS), pmen);
836
837         spin_unlock_irqrestore(&iommu->register_lock, flags);
838 }
839
840 static int iommu_enable_translation(struct intel_iommu *iommu)
841 {
842         u32 sts;
843         unsigned long flags;
844
845         spin_lock_irqsave(&iommu->register_lock, flags);
846         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
847
848         /* Make sure hardware completes it */
849         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
850                 readl, (sts & DMA_GSTS_TES), sts);
851
852         iommu->gcmd |= DMA_GCMD_TE;
853         spin_unlock_irqrestore(&iommu->register_lock, flags);
854         return 0;
855 }
856
857 static int iommu_disable_translation(struct intel_iommu *iommu)
858 {
859         u32 sts;
860         unsigned long flag;
861
862         spin_lock_irqsave(&iommu->register_lock, flag);
863         iommu->gcmd &= ~DMA_GCMD_TE;
864         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
865
866         /* Make sure hardware completes it */
867         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
868                 readl, (!(sts & DMA_GSTS_TES)), sts);
869
870         spin_unlock_irqrestore(&iommu->register_lock, flag);
871         return 0;
872 }
873
874 /* iommu interrupt handling. Most of it is MSI-like. */
875
876 static const char *fault_reason_strings[] =
877 {
878         "Software",
879         "Present bit in root entry is clear",
880         "Present bit in context entry is clear",
881         "Invalid context entry",
882         "Access beyond MGAW",
883         "PTE Write access is not set",
884         "PTE Read access is not set",
885         "Next page table ptr is invalid",
886         "Root table address invalid",
887         "Context table ptr is invalid",
888         "non-zero reserved fields in RTP",
889         "non-zero reserved fields in CTP",
890         "non-zero reserved fields in PTE",
891 };
892 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
893
894 const char *dmar_get_fault_reason(u8 fault_reason)
895 {
896         if (fault_reason > MAX_FAULT_REASON_IDX)
897                 return "Unknown";
898         else
899                 return fault_reason_strings[fault_reason];
900 }
901
902 void dmar_msi_unmask(unsigned int irq)
903 {
904         struct intel_iommu *iommu = get_irq_data(irq);
905         unsigned long flag;
906
907         /* unmask it */
908         spin_lock_irqsave(&iommu->register_lock, flag);
909         writel(0, iommu->reg + DMAR_FECTL_REG);
910         /* Read a reg to force-flush the posted write */
911         readl(iommu->reg + DMAR_FECTL_REG);
912         spin_unlock_irqrestore(&iommu->register_lock, flag);
913 }
914
915 void dmar_msi_mask(unsigned int irq)
916 {
917         unsigned long flag;
918         struct intel_iommu *iommu = get_irq_data(irq);
919
920         /* mask it */
921         spin_lock_irqsave(&iommu->register_lock, flag);
922         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
923         /* Read a reg to force-flush the posted write */
924         readl(iommu->reg + DMAR_FECTL_REG);
925         spin_unlock_irqrestore(&iommu->register_lock, flag);
926 }
927
928 void dmar_msi_write(int irq, struct msi_msg *msg)
929 {
930         struct intel_iommu *iommu = get_irq_data(irq);
931         unsigned long flag;
932
933         spin_lock_irqsave(&iommu->register_lock, flag);
934         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
935         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
936         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
937         spin_unlock_irqrestore(&iommu->register_lock, flag);
938 }
939
940 void dmar_msi_read(int irq, struct msi_msg *msg)
941 {
942         struct intel_iommu *iommu = get_irq_data(irq);
943         unsigned long flag;
944
945         spin_lock_irqsave(&iommu->register_lock, flag);
946         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
947         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
948         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
949         spin_unlock_irqrestore(&iommu->register_lock, flag);
950 }
951
952 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
953                 u8 fault_reason, u16 source_id, unsigned long long addr)
954 {
955         const char *reason;
956
957         reason = dmar_get_fault_reason(fault_reason);
958
959         printk(KERN_ERR
960                 "DMAR:[%s] Request device [%02x:%02x.%d] "
961                 "fault addr %llx \n"
962                 "DMAR:[fault reason %02d] %s\n",
963                 (type ? "DMA Read" : "DMA Write"),
964                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
965                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
966         return 0;
967 }
968
969 #define PRIMARY_FAULT_REG_LEN (16)
970 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
971 {
972         struct intel_iommu *iommu = dev_id;
973         int reg, fault_index;
974         u32 fault_status;
975         unsigned long flag;
976
977         spin_lock_irqsave(&iommu->register_lock, flag);
978         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
979
980         /* TBD: ignore advanced fault log currently */
981         if (!(fault_status & DMA_FSTS_PPF))
982                 goto clear_overflow;
983
984         fault_index = dma_fsts_fault_record_index(fault_status);
985         reg = cap_fault_reg_offset(iommu->cap);
986         while (1) {
987                 u8 fault_reason;
988                 u16 source_id;
989                 u64 guest_addr;
990                 int type;
991                 u32 data;
992
993                 /* highest 32 bits */
994                 data = readl(iommu->reg + reg +
995                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
996                 if (!(data & DMA_FRCD_F))
997                         break;
998
999                 fault_reason = dma_frcd_fault_reason(data);
1000                 type = dma_frcd_type(data);
1001
1002                 data = readl(iommu->reg + reg +
1003                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1004                 source_id = dma_frcd_source_id(data);
1005
1006                 guest_addr = dmar_readq(iommu->reg + reg +
1007                                 fault_index * PRIMARY_FAULT_REG_LEN);
1008                 guest_addr = dma_frcd_page_addr(guest_addr);
1009                 /* clear the fault */
1010                 writel(DMA_FRCD_F, iommu->reg + reg +
1011                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1012
1013                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1014
1015                 iommu_page_fault_do_one(iommu, type, fault_reason,
1016                                 source_id, guest_addr);
1017
1018                 fault_index++;
1019                 if (fault_index >= cap_num_fault_regs(iommu->cap))
1020                         fault_index = 0;
1021                 spin_lock_irqsave(&iommu->register_lock, flag);
1022         }
1023 clear_overflow:
1024         /* clear primary fault overflow */
1025         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1026         if (fault_status & DMA_FSTS_PFO)
1027                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1028
1029         spin_unlock_irqrestore(&iommu->register_lock, flag);
1030         return IRQ_HANDLED;
1031 }
1032
1033 int dmar_set_interrupt(struct intel_iommu *iommu)
1034 {
1035         int irq, ret;
1036
1037         irq = create_irq();
1038         if (!irq) {
1039                 printk(KERN_ERR "IOMMU: no free vectors\n");
1040                 return -EINVAL;
1041         }
1042
1043         set_irq_data(irq, iommu);
1044         iommu->irq = irq;
1045
1046         ret = arch_setup_dmar_msi(irq);
1047         if (ret) {
1048                 set_irq_data(irq, NULL);
1049                 iommu->irq = 0;
1050                 destroy_irq(irq);
1051                 return ret;
1052         }
1053
1054         /* Make sure the fault registers are cleared */
1055         iommu_page_fault(irq, iommu);
1056
1057         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1058         if (ret)
1059                 printk(KERN_ERR "IOMMU: can't request irq\n");
1060         return ret;
1061 }
1062
1063 static int iommu_init_domains(struct intel_iommu *iommu)
1064 {
1065         unsigned long ndomains;
1066         unsigned long nlongs;
1067
1068         ndomains = cap_ndoms(iommu->cap);
1069         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1070         nlongs = BITS_TO_LONGS(ndomains);
1071
1072         /* TBD: there might be 64K domains,
1073          * consider a different allocation scheme for future chips
1074          */
1075         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1076         if (!iommu->domain_ids) {
1077                 printk(KERN_ERR "Allocating domain id array failed\n");
1078                 return -ENOMEM;
1079         }
1080         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1081                         GFP_KERNEL);
1082         if (!iommu->domains) {
1083                 printk(KERN_ERR "Allocating domain array failed\n");
1084                 kfree(iommu->domain_ids);
1085                 return -ENOMEM;
1086         }
1087
1088         spin_lock_init(&iommu->lock);
1089
1090         /*
1091          * if Caching mode is set, then invalid translations are tagged
1092          * with domain id 0. Hence we need to pre-allocate it.
1093          */
1094         if (cap_caching_mode(iommu->cap))
1095                 set_bit(0, iommu->domain_ids);
1096         return 0;
1097 }
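/*
 * Example: for hardware reporting cap_ndoms() == 256, nlongs is
 * BITS_TO_LONGS(256) == 4 on 64-bit, so domain_ids is a 4-word bitmap
 * and domains[] holds 256 pointers.
 */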
1098
1099
1100 static void domain_exit(struct dmar_domain *domain);
1101
1102 void free_dmar_iommu(struct intel_iommu *iommu)
1103 {
1104         struct dmar_domain *domain;
1105         int i;
1106
1107         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1108         for (; i < cap_ndoms(iommu->cap); ) {
1109                 domain = iommu->domains[i];
1110                 clear_bit(i, iommu->domain_ids);
1111                 domain_exit(domain);
1112                 i = find_next_bit(iommu->domain_ids,
1113                         cap_ndoms(iommu->cap), i+1);
1114         }
1115
1116         if (iommu->gcmd & DMA_GCMD_TE)
1117                 iommu_disable_translation(iommu);
1118
1119         if (iommu->irq) {
1120                 set_irq_data(iommu->irq, NULL);
1121                 /* This will mask the irq */
1122                 free_irq(iommu->irq, iommu);
1123                 destroy_irq(iommu->irq);
1124         }
1125
1126         kfree(iommu->domains);
1127         kfree(iommu->domain_ids);
1128
1129         /* free context mapping */
1130         free_context_table(iommu);
1131 }
1132
1133 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1134 {
1135         unsigned long num;
1136         unsigned long ndomains;
1137         struct dmar_domain *domain;
1138         unsigned long flags;
1139
1140         domain = alloc_domain_mem();
1141         if (!domain)
1142                 return NULL;
1143
1144         ndomains = cap_ndoms(iommu->cap);
1145
1146         spin_lock_irqsave(&iommu->lock, flags);
1147         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1148         if (num >= ndomains) {
1149                 spin_unlock_irqrestore(&iommu->lock, flags);
1150                 free_domain_mem(domain);
1151                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1152                 return NULL;
1153         }
1154
1155         set_bit(num, iommu->domain_ids);
1156         domain->id = num;
1157         domain->iommu = iommu;
1158         iommu->domains[num] = domain;
1159         spin_unlock_irqrestore(&iommu->lock, flags);
1160
1161         return domain;
1162 }
1163
1164 static void iommu_free_domain(struct dmar_domain *domain)
1165 {
1166         unsigned long flags;
1167
1168         spin_lock_irqsave(&domain->iommu->lock, flags);
1169         clear_bit(domain->id, domain->iommu->domain_ids);
1170         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1171 }
1172
1173 static struct iova_domain reserved_iova_list;
1174 static struct lock_class_key reserved_alloc_key;
1175 static struct lock_class_key reserved_rbtree_key;
1176
1177 static void dmar_init_reserved_ranges(void)
1178 {
1179         struct pci_dev *pdev = NULL;
1180         struct iova *iova;
1181         int i;
1182         u64 addr, size;
1183
1184         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1185
1186         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1187                 &reserved_alloc_key);
1188         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1189                 &reserved_rbtree_key);
1190
1191         /* IOAPIC ranges shouldn't be accessed by DMA */
1192         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1193                 IOVA_PFN(IOAPIC_RANGE_END));
1194         if (!iova)
1195                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1196
1197         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1198         for_each_pci_dev(pdev) {
1199                 struct resource *r;
1200
1201                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1202                         r = &pdev->resource[i];
1203                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1204                                 continue;
1205                         addr = r->start;
1206                         addr &= PAGE_MASK;
1207                         size = r->end - addr;
1208                         size = PAGE_ALIGN(size);
1209                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1210                                 IOVA_PFN(size + addr) - 1);
1211                         if (!iova)
1212                                 printk(KERN_ERR "Reserve iova failed\n");
1213                 }
1214         }
1215
1216 }
1217
1218 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1219 {
1220         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1221 }
1222
1223 static inline int guestwidth_to_adjustwidth(int gaw)
1224 {
1225         int agaw;
1226         int r = (gaw - 12) % 9;
1227
1228         if (r == 0)
1229                 agaw = gaw;
1230         else
1231                 agaw = gaw + 9 - r;
1232         if (agaw > 64)
1233                 agaw = 64;
1234         return agaw;
1235 }
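/*
 * Worked examples: gaw == 48 gives r == 0, so the adjusted width stays 48
 * (width_to_agaw(48) == 2, i.e. a 4-level table); gaw == 36 gives r == 6,
 * so the width is rounded up to 39 (width_to_agaw(39) == 1, 3 levels).
 */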
1236
1237 static int domain_init(struct dmar_domain *domain, int guest_width)
1238 {
1239         struct intel_iommu *iommu;
1240         int adjust_width, agaw;
1241         unsigned long sagaw;
1242
1243         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1244         spin_lock_init(&domain->mapping_lock);
1245
1246         domain_reserve_special_ranges(domain);
1247
1248         /* calculate AGAW */
1249         iommu = domain->iommu;
1250         if (guest_width > cap_mgaw(iommu->cap))
1251                 guest_width = cap_mgaw(iommu->cap);
1252         domain->gaw = guest_width;
1253         adjust_width = guestwidth_to_adjustwidth(guest_width);
1254         agaw = width_to_agaw(adjust_width);
1255         sagaw = cap_sagaw(iommu->cap);
1256         if (!test_bit(agaw, &sagaw)) {
1257                 /* hardware doesn't support it, choose a bigger one */
1258                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1259                 agaw = find_next_bit(&sagaw, 5, agaw);
1260                 if (agaw >= 5)
1261                         return -ENODEV;
1262         }
1263         domain->agaw = agaw;
1264         INIT_LIST_HEAD(&domain->devices);
1265
1266         /* always allocate the top pgd */
1267         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1268         if (!domain->pgd)
1269                 return -ENOMEM;
1270         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1271         return 0;
1272 }
1273
1274 static void domain_exit(struct dmar_domain *domain)
1275 {
1276         u64 end;
1277
1278         /* Domain 0 is reserved, so don't process it */
1279         if (!domain)
1280                 return;
1281
1282         domain_remove_dev_info(domain);
1283         /* destroy iovas */
1284         put_iova_domain(&domain->iovad);
1285         end = DOMAIN_MAX_ADDR(domain->gaw);
1286         end = end & (~PAGE_MASK);
1287
1288         /* clear ptes */
1289         dma_pte_clear_range(domain, 0, end);
1290
1291         /* free page tables */
1292         dma_pte_free_pagetable(domain, 0, end);
1293
1294         iommu_free_domain(domain);
1295         free_domain_mem(domain);
1296 }
1297
1298 static int domain_context_mapping_one(struct dmar_domain *domain,
1299                 u8 bus, u8 devfn)
1300 {
1301         struct context_entry *context;
1302         struct intel_iommu *iommu = domain->iommu;
1303         unsigned long flags;
1304
1305         pr_debug("Set context mapping for %02x:%02x.%d\n",
1306                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1307         BUG_ON(!domain->pgd);
1308         context = device_to_context_entry(iommu, bus, devfn);
1309         if (!context)
1310                 return -ENOMEM;
1311         spin_lock_irqsave(&iommu->lock, flags);
1312         if (context_present(context)) {
1313                 spin_unlock_irqrestore(&iommu->lock, flags);
1314                 return 0;
1315         }
1316
1317         context_set_domain_id(context, domain->id);
1318         context_set_address_width(context, domain->agaw);
1319         context_set_address_root(context, virt_to_phys(domain->pgd));
1320         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1321         context_set_fault_enable(context);
1322         context_set_present(context);
1323         __iommu_flush_cache(iommu, context, sizeof(*context));
1324
1325         /* it's a non-present to present mapping */
1326         if (iommu->flush.flush_context(iommu, domain->id,
1327                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1328                 DMA_CCMD_DEVICE_INVL, 1))
1329                 iommu_flush_write_buffer(iommu);
1330         else
1331                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1332
1333         spin_unlock_irqrestore(&iommu->lock, flags);
1334         return 0;
1335 }
1336
1337 static int
1338 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1339 {
1340         int ret;
1341         struct pci_dev *tmp, *parent;
1342
1343         ret = domain_context_mapping_one(domain, pdev->bus->number,
1344                 pdev->devfn);
1345         if (ret)
1346                 return ret;
1347
1348         /* dependent device mapping */
1349         tmp = pci_find_upstream_pcie_bridge(pdev);
1350         if (!tmp)
1351                 return 0;
1352         /* Secondary interface's bus number and devfn 0 */
1353         parent = pdev->bus->self;
1354         while (parent != tmp) {
1355                 ret = domain_context_mapping_one(domain, parent->bus->number,
1356                         parent->devfn);
1357                 if (ret)
1358                         return ret;
1359                 parent = parent->bus->self;
1360         }
1361         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1362                 return domain_context_mapping_one(domain,
1363                         tmp->subordinate->number, 0);
1364         else /* this is a legacy PCI bridge */
1365                 return domain_context_mapping_one(domain,
1366                         tmp->bus->number, tmp->devfn);
1367 }
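/*
 * Note on the walk above: DMA requests from devices behind a conventional
 * PCI bridge carry the bridge's requester ID, so beyond the device itself
 * we also set up context entries for every bridge on the path and finally
 * for the PCIe-to-PCI bridge's secondary bus with devfn 0 (or the bridge's
 * own bus/devfn for a legacy PCI bridge).
 */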
1368
1369 static int domain_context_mapped(struct dmar_domain *domain,
1370         struct pci_dev *pdev)
1371 {
1372         int ret;
1373         struct pci_dev *tmp, *parent;
1374
1375         ret = device_context_mapped(domain->iommu,
1376                 pdev->bus->number, pdev->devfn);
1377         if (!ret)
1378                 return ret;
1379         /* dependent device mapping */
1380         tmp = pci_find_upstream_pcie_bridge(pdev);
1381         if (!tmp)
1382                 return ret;
1383         /* Secondary interface's bus number and devfn 0 */
1384         parent = pdev->bus->self;
1385         while (parent != tmp) {
1386                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1387                         parent->devfn);
1388                 if (!ret)
1389                         return ret;
1390                 parent = parent->bus->self;
1391         }
1392         if (tmp->is_pcie)
1393                 return device_context_mapped(domain->iommu,
1394                         tmp->subordinate->number, 0);
1395         else
1396                 return device_context_mapped(domain->iommu,
1397                         tmp->bus->number, tmp->devfn);
1398 }
1399
1400 static int
1401 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1402                         u64 hpa, size_t size, int prot)
1403 {
1404         u64 start_pfn, end_pfn;
1405         struct dma_pte *pte;
1406         int index;
1407         int addr_width = agaw_to_width(domain->agaw);
1408
1409         hpa &= (((u64)1) << addr_width) - 1;
1410
1411         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1412                 return -EINVAL;
1413         iova &= PAGE_MASK;
1414         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1415         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1416         index = 0;
1417         while (start_pfn < end_pfn) {
1418                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1419                 if (!pte)
1420                         return -ENOMEM;
1421                 /* We don't need lock here, nobody else
1422                  * touches the iova range
1423                  */
1424                 BUG_ON(dma_pte_addr(*pte));
1425                 dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
1426                 dma_set_pte_prot(*pte, prot);
1427                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1428                 start_pfn++;
1429                 index++;
1430         }
1431         return 0;
1432 }
1433
1434 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1435 {
1436         clear_context_table(domain->iommu, bus, devfn);
1437         domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1438                                            DMA_CCMD_GLOBAL_INVL, 0);
1439         domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1440                                          DMA_TLB_GLOBAL_FLUSH, 0);
1441 }
1442
1443 static void domain_remove_dev_info(struct dmar_domain *domain)
1444 {
1445         struct device_domain_info *info;
1446         unsigned long flags;
1447
1448         spin_lock_irqsave(&device_domain_lock, flags);
1449         while (!list_empty(&domain->devices)) {
1450                 info = list_entry(domain->devices.next,
1451                         struct device_domain_info, link);
1452                 list_del(&info->link);
1453                 list_del(&info->global);
1454                 if (info->dev)
1455                         info->dev->dev.archdata.iommu = NULL;
1456                 spin_unlock_irqrestore(&device_domain_lock, flags);
1457
1458                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1459                 free_devinfo_mem(info);
1460
1461                 spin_lock_irqsave(&device_domain_lock, flags);
1462         }
1463         spin_unlock_irqrestore(&device_domain_lock, flags);
1464 }
1465
1466 /*
1467  * find_domain
1468  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1469  */
1470 static struct dmar_domain *
1471 find_domain(struct pci_dev *pdev)
1472 {
1473         struct device_domain_info *info;
1474
1475         /* No lock here, assumes no domain exit in normal case */
1476         info = pdev->dev.archdata.iommu;
1477         if (info)
1478                 return info->domain;
1479         return NULL;
1480 }
1481
1482 /* domain is initialized */
1483 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1484 {
1485         struct dmar_domain *domain, *found = NULL;
1486         struct intel_iommu *iommu;
1487         struct dmar_drhd_unit *drhd;
1488         struct device_domain_info *info, *tmp;
1489         struct pci_dev *dev_tmp;
1490         unsigned long flags;
1491         int bus = 0, devfn = 0;
1492
1493         domain = find_domain(pdev);
1494         if (domain)
1495                 return domain;
1496
1497         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1498         if (dev_tmp) {
1499                 if (dev_tmp->is_pcie) {
1500                         bus = dev_tmp->subordinate->number;
1501                         devfn = 0;
1502                 } else {
1503                         bus = dev_tmp->bus->number;
1504                         devfn = dev_tmp->devfn;
1505                 }
1506                 spin_lock_irqsave(&device_domain_lock, flags);
1507                 list_for_each_entry(info, &device_domain_list, global) {
1508                         if (info->bus == bus && info->devfn == devfn) {
1509                                 found = info->domain;
1510                                 break;
1511                         }
1512                 }
1513                 spin_unlock_irqrestore(&device_domain_lock, flags);
1514                 /* pcie-pci bridge already has a domain, use it */
1515                 if (found) {
1516                         domain = found;
1517                         goto found_domain;
1518                 }
1519         }
1520
1521         /* Allocate new domain for the device */
1522         drhd = dmar_find_matched_drhd_unit(pdev);
1523         if (!drhd) {
1524                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1525                         pci_name(pdev));
1526                 return NULL;
1527         }
1528         iommu = drhd->iommu;
1529
1530         domain = iommu_alloc_domain(iommu);
1531         if (!domain)
1532                 goto error;
1533
1534         if (domain_init(domain, gaw)) {
1535                 domain_exit(domain);
1536                 goto error;
1537         }
1538
1539         /* register pcie-to-pci device */
1540         if (dev_tmp) {
1541                 info = alloc_devinfo_mem();
1542                 if (!info) {
1543                         domain_exit(domain);
1544                         goto error;
1545                 }
1546                 info->bus = bus;
1547                 info->devfn = devfn;
1548                 info->dev = NULL;
1549                 info->domain = domain;
1550                 /* This domain is shared by devices under p2p bridge */
1551                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1552
1553                 /* pcie-to-pci bridge already has a domain, use it */
1554                 found = NULL;
1555                 spin_lock_irqsave(&device_domain_lock, flags);
1556                 list_for_each_entry(tmp, &device_domain_list, global) {
1557                         if (tmp->bus == bus && tmp->devfn == devfn) {
1558                                 found = tmp->domain;
1559                                 break;
1560                         }
1561                 }
1562                 if (found) {
1563                         free_devinfo_mem(info);
1564                         domain_exit(domain);
1565                         domain = found;
1566                 } else {
1567                         list_add(&info->link, &domain->devices);
1568                         list_add(&info->global, &device_domain_list);
1569                 }
1570                 spin_unlock_irqrestore(&device_domain_lock, flags);
1571         }
1572
1573 found_domain:
1574         info = alloc_devinfo_mem();
1575         if (!info)
1576                 goto error;
1577         info->bus = pdev->bus->number;
1578         info->devfn = pdev->devfn;
1579         info->dev = pdev;
1580         info->domain = domain;
1581         spin_lock_irqsave(&device_domain_lock, flags);
1582         /* did somebody else set it up while we were allocating? */
1583         found = find_domain(pdev);
1584         if (found != NULL) {
1585                 spin_unlock_irqrestore(&device_domain_lock, flags);
1586                 if (found != domain) {
1587                         domain_exit(domain);
1588                         domain = found;
1589                 }
1590                 free_devinfo_mem(info);
1591                 return domain;
1592         }
1593         list_add(&info->link, &domain->devices);
1594         list_add(&info->global, &device_domain_list);
1595         pdev->dev.archdata.iommu = info;
1596         spin_unlock_irqrestore(&device_domain_lock, flags);
1597         return domain;
1598 error:
1599         /* re-check here: another thread may have set up the domain already */
1600         return find_domain(pdev);
1601 }
1602
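     /*
      * Worked example for iommu_prepare_identity_map() below (values are
      * hypothetical, assuming 4KiB pages): for start = 0xcf001 and
      * end = 0xd0801, base = start & PAGE_MASK = 0xcf000 and
      * size = PAGE_ALIGN(end - base) = 0x2000, so IOVA pfns 0xcf-0xd0 are
      * reserved in the domain and then identity-mapped with
      * DMA_PTE_READ|DMA_PTE_WRITE.
      */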
1603 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1604                                       unsigned long long start,
1605                                       unsigned long long end)
1606 {
1607         struct dmar_domain *domain;
1608         unsigned long size;
1609         unsigned long long base;
1610         int ret;
1611
1612         printk(KERN_INFO
1613                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1614                 pci_name(pdev), start, end);
1615         /* page table init */
1616         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1617         if (!domain)
1618                 return -ENOMEM;
1619
1620         /* The address might not be aligned */
1621         base = start & PAGE_MASK;
1622         size = end - base;
1623         size = PAGE_ALIGN(size);
1624         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1625                         IOVA_PFN(base + size) - 1)) {
1626                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1627                 ret = -ENOMEM;
1628                 goto error;
1629         }
1630
1631         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1632                 size, base, pci_name(pdev));
1633         /*
1634          * RMRR range might have overlap with physical memory range,
1635          * clear it first
1636          */
1637         dma_pte_clear_range(domain, base, base + size);
1638
1639         ret = domain_page_mapping(domain, base, base, size,
1640                 DMA_PTE_READ|DMA_PTE_WRITE);
1641         if (ret)
1642                 goto error;
1643
1644         /* context entry init */
1645         ret = domain_context_mapping(domain, pdev);
1646         if (!ret)
1647                 return 0;
1648 error:
1649         domain_exit(domain);
1650         return ret;
1651
1652 }
1653
1654 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1655         struct pci_dev *pdev)
1656 {
1657         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1658                 return 0;
1659         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1660                 rmrr->end_address + 1);
1661 }
1662
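     /*
      * Graphics work-around: when CONFIG_DMAR_GFX_WA is set, every graphics
      * device that is not already bypassing the IOMMU gets a 1:1 mapping of
      * all active memory regions on every online node, presumably so that
      * graphics drivers which hand physical addresses directly to the
      * hardware keep working with translation enabled.
      */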
1663 #ifdef CONFIG_DMAR_GFX_WA
1664 struct iommu_prepare_data {
1665         struct pci_dev *pdev;
1666         int ret;
1667 };
1668
1669 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1670                                          unsigned long end_pfn, void *datax)
1671 {
1672         struct iommu_prepare_data *data;
1673
1674         data = (struct iommu_prepare_data *)datax;
1675
1676         data->ret = iommu_prepare_identity_map(data->pdev,
1677                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1678         return data->ret;
1679
1680 }
1681
1682 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1683 {
1684         int nid;
1685         struct iommu_prepare_data data;
1686
1687         data.pdev = pdev;
1688         data.ret = 0;
1689
1690         for_each_online_node(nid) {
1691                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1692                 if (data.ret)
1693                         return data.ret;
1694         }
1695         return data.ret;
1696 }
1697
1698 static void __init iommu_prepare_gfx_mapping(void)
1699 {
1700         struct pci_dev *pdev = NULL;
1701         int ret;
1702
1703         for_each_pci_dev(pdev) {
1704                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1705                                 !IS_GFX_DEVICE(pdev))
1706                         continue;
1707                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1708                         pci_name(pdev));
1709                 ret = iommu_prepare_with_active_regions(pdev);
1710                 if (ret)
1711                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1712         }
1713 }
1714 #else /* !CONFIG_DMAR_GFX_WA */
1715 static inline void iommu_prepare_gfx_mapping(void)
1716 {
1717         return;
1718 }
1719 #endif
1720
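     /*
      * Floppy work-around: when CONFIG_DMAR_FLOPPY_WA is set, the first 16MB
      * of memory is identity-mapped for the ISA/LPC bridge so that legacy
      * floppy DMA, which targets the low 16MB directly, keeps working.
      */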
1721 #ifdef CONFIG_DMAR_FLOPPY_WA
1722 static inline void iommu_prepare_isa(void)
1723 {
1724         struct pci_dev *pdev;
1725         int ret;
1726
1727         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1728         if (!pdev)
1729                 return;
1730
1731         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1732         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1733
1734         if (ret)
1735                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1736                         "floppy might not work\n");
1737
1738 }
1739 #else
1740 static inline void iommu_prepare_isa(void)
1741 {
1742         return;
1743 }
1744 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1745
1746 static int __init init_dmars(void)
1747 {
1748         struct dmar_drhd_unit *drhd;
1749         struct dmar_rmrr_unit *rmrr;
1750         struct pci_dev *pdev;
1751         struct intel_iommu *iommu;
1752         int i, ret, unit = 0;
1753
1754         /*
1755          * for each drhd
1756          *    allocate root
1757          *    initialize and program root entry to not present
1758          * endfor
1759          */
1760         for_each_drhd_unit(drhd) {
1761                 g_num_of_iommus++;
1762                 /*
1763                  * no lock needed: this is only incremented in the single-
1764                  * threaded kernel __init code path; all other accesses are
1765                  * read-only
1766                  */
1767         }
1768
1769         deferred_flush = kzalloc(g_num_of_iommus *
1770                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1771         if (!deferred_flush) {
1772                 ret = -ENOMEM;
1773                 goto error;
1774         }
1775
1776         for_each_drhd_unit(drhd) {
1777                 if (drhd->ignored)
1778                         continue;
1779
1780                 iommu = drhd->iommu;
1781
1782                 ret = iommu_init_domains(iommu);
1783                 if (ret)
1784                         goto error;
1785
1786                 /*
1787                  * TBD:
1788                  * we could share the same root & context tables
1789                  * among all IOMMUs; revisit this later.
1790                  */
1791                 ret = iommu_alloc_root_entry(iommu);
1792                 if (ret) {
1793                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1794                         goto error;
1795                 }
1796         }
1797
1798         for_each_drhd_unit(drhd) {
1799                 if (drhd->ignored)
1800                         continue;
1801
1802                 iommu = drhd->iommu;
1803                 if (dmar_enable_qi(iommu)) {
1804                         /*
1805                          * Queued Invalidate not enabled, use Register Based
1806                          * Invalidate
1807                          */
1808                         iommu->flush.flush_context = __iommu_flush_context;
1809                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1810                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1811                                "invalidation\n",
1812                                (unsigned long long)drhd->reg_base_addr);
1813                 } else {
1814                         iommu->flush.flush_context = qi_flush_context;
1815                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1816                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1817                                "invalidation\n",
1818                                (unsigned long long)drhd->reg_base_addr);
1819                 }
1820         }
1821
1822         /*
1823          * For each rmrr
1824          *   for each dev attached to rmrr
1825          *   do
1826          *     locate drhd for dev, alloc domain for dev
1827          *     allocate free domain
1828          *     allocate page table entries for rmrr
1829          *     if context not allocated for bus
1830          *           allocate and init context
1831          *           set present in root table for this bus
1832          *     init context with domain, translation etc
1833          *    endfor
1834          * endfor
1835          */
1836         for_each_rmrr_units(rmrr) {
1837                 for (i = 0; i < rmrr->devices_cnt; i++) {
1838                         pdev = rmrr->devices[i];
1839                         /* some BIOSes list nonexistent devices in the DMAR table */
1840                         if (!pdev)
1841                                 continue;
1842                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1843                         if (ret)
1844                                 printk(KERN_ERR
1845                                  "IOMMU: mapping reserved region failed\n");
1846                 }
1847         }
1848
1849         iommu_prepare_gfx_mapping();
1850
1851         iommu_prepare_isa();
1852
1853         /*
1854          * for each drhd
1855          *   enable fault log
1856          *   global invalidate context cache
1857          *   global invalidate iotlb
1858          *   enable translation
1859          */
1860         for_each_drhd_unit(drhd) {
1861                 if (drhd->ignored)
1862                         continue;
1863                 iommu = drhd->iommu;
1864                 sprintf(iommu->name, "dmar%d", unit++);
1865
1866                 iommu_flush_write_buffer(iommu);
1867
1868                 ret = dmar_set_interrupt(iommu);
1869                 if (ret)
1870                         goto error;
1871
1872                 iommu_set_root_entry(iommu);
1873
1874                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1875                                            0);
1876                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1877                                          0);
1878                 iommu_disable_protect_mem_regions(iommu);
1879
1880                 ret = iommu_enable_translation(iommu);
1881                 if (ret)
1882                         goto error;
1883         }
1884
1885         return 0;
1886 error:
1887         for_each_drhd_unit(drhd) {
1888                 if (drhd->ignored)
1889                         continue;
1890                 iommu = drhd->iommu;
1891                 free_iommu(iommu);
1892         }
1893         return ret;
1894 }
1895
1896 static inline u64 aligned_size(u64 host_addr, size_t size)
1897 {
1898         u64 addr;
1899         addr = (host_addr & (~PAGE_MASK)) + size;
1900         return PAGE_ALIGN(addr);
1901 }
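     /*
      * Example (illustrative, assuming 4KiB pages): aligned_size(0x1ffe, 0x10)
      * is PAGE_ALIGN(0xffe + 0x10) = 0x2000, i.e. a 16-byte transfer that
      * straddles a page boundary needs two whole pages mapped.
      */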
1902
1903 struct iova *
1904 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1905 {
1906         struct iova *piova;
1907
1908         /* Make sure it's in range */
1909         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1910         if (!size || (IOVA_START_ADDR + size > end))
1911                 return NULL;
1912
1913         piova = alloc_iova(&domain->iovad,
1914                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1915         return piova;
1916 }
1917
1918 static struct iova *
1919 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1920                    size_t size, u64 dma_mask)
1921 {
1922         struct pci_dev *pdev = to_pci_dev(dev);
1923         struct iova *iova = NULL;
1924
1925         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1926                 iova = iommu_alloc_iova(domain, size, dma_mask);
1927         else {
1928                 /*
1929                  * First try to allocate an io virtual address in
1930                  * DMA_32BIT_MASK and if that fails then try allocating
1931                  * from higher range
1932                  */
1933                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1934                 if (!iova)
1935                         iova = iommu_alloc_iova(domain, size, dma_mask);
1936         }
1937
1938         if (!iova) {
1939                 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1940                 return NULL;
1941         }
1942
1943         return iova;
1944 }
1945
1946 static struct dmar_domain *
1947 get_valid_domain_for_dev(struct pci_dev *pdev)
1948 {
1949         struct dmar_domain *domain;
1950         int ret;
1951
1952         domain = get_domain_for_dev(pdev,
1953                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1954         if (!domain) {
1955                 printk(KERN_ERR
1956                         "Allocating domain for %s failed", pci_name(pdev));
1957                 return NULL;
1958         }
1959
1960         /* make sure context mapping is ok */
1961         if (unlikely(!domain_context_mapped(domain, pdev))) {
1962                 ret = domain_context_mapping(domain, pdev);
1963                 if (ret) {
1964                         printk(KERN_ERR
1965                                 "Domain context map for %s failed",
1966                                 pci_name(pdev));
1967                         return NULL;
1968                 }
1969         }
1970
1971         return domain;
1972 }
1973
1974 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
1975                                      size_t size, int dir, u64 dma_mask)
1976 {
1977         struct pci_dev *pdev = to_pci_dev(hwdev);
1978         struct dmar_domain *domain;
1979         phys_addr_t start_paddr;
1980         struct iova *iova;
1981         int prot = 0;
1982         int ret;
1983
1984         BUG_ON(dir == DMA_NONE);
1985         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1986                 return paddr;
1987
1988         domain = get_valid_domain_for_dev(pdev);
1989         if (!domain)
1990                 return 0;
1991
1992         size = aligned_size((u64)paddr, size);
1993
1994         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
1995         if (!iova)
1996                 goto error;
1997
1998         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
1999
2000         /*
2001          * Check whether the DMAR supports zero-length reads on write-only
2002          * mappings.
2003          */
2004         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2005                         !cap_zlr(domain->iommu->cap))
2006                 prot |= DMA_PTE_READ;
2007         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2008                 prot |= DMA_PTE_WRITE;
2009         /*
2010          * paddr through (paddr + size) might cover only part of a page, so we
2011          * map the whole page.  Note: if two parts of one page are mapped
2012          * separately, we might have two guest addresses mapping to the same
2013          * host paddr, but this is not a big problem.
2014          */
2015         ret = domain_page_mapping(domain, start_paddr,
2016                 ((u64)paddr) & PAGE_MASK, size, prot);
2017         if (ret)
2018                 goto error;
2019
2020         /* it's a non-present to present mapping */
2021         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
2022                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2023         if (ret)
2024                 iommu_flush_write_buffer(domain->iommu);
2025
2026         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2027
2028 error:
2029         if (iova)
2030                 __free_iova(&domain->iovad, iova);
2031         printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
2032                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2033         return 0;
2034 }
2035
2036 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2037                             size_t size, int dir)
2038 {
2039         return __intel_map_single(hwdev, paddr, size, dir,
2040                                   to_pci_dev(hwdev)->dma_mask);
2041 }
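     /*
      * Drivers do not call intel_map_single() directly; it is reached through
      * the generic DMA API once intel_dma_ops has been installed (see
      * intel_iommu_init() below).  A minimal sketch of a caller, with
      * hypothetical buf/len variables:
      *
      *      dma_addr_t dma = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
      *      if (!dma)
      *              ... handle the mapping failure ...
      *      ...
      *      dma_unmap_single(&pdev->dev, dma, len, DMA_TO_DEVICE);
      *
      * A zero return indicates failure here, since __intel_map_single()
      * returns 0 on error.
      */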
2042
2043 static void flush_unmaps(void)
2044 {
2045         int i, j;
2046
2047         timer_on = 0;
2048
2049         /* just flush them all */
2050         for (i = 0; i < g_num_of_iommus; i++) {
2051                 if (deferred_flush[i].next) {
2052                         struct intel_iommu *iommu =
2053                                 deferred_flush[i].domain[0]->iommu;
2054
2055                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2056                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2057                         for (j = 0; j < deferred_flush[i].next; j++) {
2058                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2059                                                 deferred_flush[i].iova[j]);
2060                         }
2061                         deferred_flush[i].next = 0;
2062                 }
2063         }
2064
2065         list_size = 0;
2066 }
2067
2068 static void flush_unmaps_timeout(unsigned long data)
2069 {
2070         unsigned long flags;
2071
2072         spin_lock_irqsave(&async_umap_flush_lock, flags);
2073         flush_unmaps();
2074         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2075 }
2076
2077 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2078 {
2079         unsigned long flags;
2080         int next, iommu_id;
2081
2082         spin_lock_irqsave(&async_umap_flush_lock, flags);
2083         if (list_size == HIGH_WATER_MARK)
2084                 flush_unmaps();
2085
2086         iommu_id = dom->iommu->seq_id;
2087
2088         next = deferred_flush[iommu_id].next;
2089         deferred_flush[iommu_id].domain[next] = dom;
2090         deferred_flush[iommu_id].iova[next] = iova;
2091         deferred_flush[iommu_id].next++;
2092
2093         if (!timer_on) {
2094                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2095                 timer_on = 1;
2096         }
2097         list_size++;
2098         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2099 }
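     /*
      * Design note: unmapped IOVAs are batched per IOMMU in deferred_flush[]
      * rather than flushed one at a time.  flush_unmaps() runs either when
      * HIGH_WATER_MARK entries have accumulated or when unmap_timer fires
      * about 10ms after the first deferred entry, and it issues a single
      * global IOTLB flush per IOMMU before freeing the queued IOVAs.
      */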
2100
2101 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2102                         int dir)
2103 {
2104         struct pci_dev *pdev = to_pci_dev(dev);
2105         struct dmar_domain *domain;
2106         unsigned long start_addr;
2107         struct iova *iova;
2108
2109         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2110                 return;
2111         domain = find_domain(pdev);
2112         BUG_ON(!domain);
2113
2114         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2115         if (!iova)
2116                 return;
2117
2118         start_addr = iova->pfn_lo << PAGE_SHIFT;
2119         size = aligned_size((u64)dev_addr, size);
2120
2121         pr_debug("Device %s unmapping: %lx@%llx\n",
2122                 pci_name(pdev), size, (unsigned long long)start_addr);
2123
2124         /*  clear the whole page */
2125         dma_pte_clear_range(domain, start_addr, start_addr + size);
2126         /* free page tables */
2127         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2128         if (intel_iommu_strict) {
2129                 if (iommu_flush_iotlb_psi(domain->iommu,
2130                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2131                         iommu_flush_write_buffer(domain->iommu);
2132                 /* free iova */
2133                 __free_iova(&domain->iovad, iova);
2134         } else {
2135                 add_unmap(domain, iova);
2136                 /*
2137                  * queue up the release of the mapping to save the roughly 1/6th
2138                  * of the CPU time otherwise consumed by the IOTLB flush operation
2139                  */
2140         }
2141 }
2142
2143 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2144                            dma_addr_t *dma_handle, gfp_t flags)
2145 {
2146         void *vaddr;
2147         int order;
2148
2149         size = PAGE_ALIGN(size);
2150         order = get_order(size);
2151         flags &= ~(GFP_DMA | GFP_DMA32);
2152
2153         vaddr = (void *)__get_free_pages(flags, order);
2154         if (!vaddr)
2155                 return NULL;
2156         memset(vaddr, 0, size);
2157
2158         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2159                                          DMA_BIDIRECTIONAL,
2160                                          hwdev->coherent_dma_mask);
2161         if (*dma_handle)
2162                 return vaddr;
2163         free_pages((unsigned long)vaddr, order);
2164         return NULL;
2165 }
2166
2167 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2168                          dma_addr_t dma_handle)
2169 {
2170         int order;
2171
2172         size = PAGE_ALIGN(size);
2173         order = get_order(size);
2174
2175         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2176         free_pages((unsigned long)vaddr, order);
2177 }
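     /*
      * As with intel_map_single(), these are reached through the generic DMA
      * API.  A minimal sketch of a caller (hypothetical size/handle names):
      *
      *      dma_addr_t handle;
      *      void *vaddr = dma_alloc_coherent(&pdev->dev, size, &handle, GFP_KERNEL);
      *      ...
      *      dma_free_coherent(&pdev->dev, size, vaddr, handle);
      *
      * intel_alloc_coherent() zeroes the pages and maps them bidirectionally
      * against the device's coherent_dma_mask.
      */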
2178
2179 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2180
2181 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2182                     int nelems, int dir)
2183 {
2184         int i;
2185         struct pci_dev *pdev = to_pci_dev(hwdev);
2186         struct dmar_domain *domain;
2187         unsigned long start_addr;
2188         struct iova *iova;
2189         size_t size = 0;
2190         void *addr;
2191         struct scatterlist *sg;
2192
2193         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2194                 return;
2195
2196         domain = find_domain(pdev);
2197
2198         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2199         if (!iova)
2200                 return;
2201         for_each_sg(sglist, sg, nelems, i) {
2202                 addr = SG_ENT_VIRT_ADDRESS(sg);
2203                 size += aligned_size((u64)addr, sg->length);
2204         }
2205
2206         start_addr = iova->pfn_lo << PAGE_SHIFT;
2207
2208         /*  clear the whole page */
2209         dma_pte_clear_range(domain, start_addr, start_addr + size);
2210         /* free page tables */
2211         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2212
2213         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2214                         size >> VTD_PAGE_SHIFT, 0))
2215                 iommu_flush_write_buffer(domain->iommu);
2216
2217         /* free iova */
2218         __free_iova(&domain->iovad, iova);
2219 }
2220
2221 static int intel_nontranslate_map_sg(struct device *hwdev,
2222         struct scatterlist *sglist, int nelems, int dir)
2223 {
2224         int i;
2225         struct scatterlist *sg;
2226
2227         for_each_sg(sglist, sg, nelems, i) {
2228                 BUG_ON(!sg_page(sg));
2229                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2230                 sg->dma_length = sg->length;
2231         }
2232         return nelems;
2233 }
2234
2235 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2236                  int dir)
2237 {
2238         void *addr;
2239         int i;
2240         struct pci_dev *pdev = to_pci_dev(hwdev);
2241         struct dmar_domain *domain;
2242         size_t size = 0;
2243         int prot = 0;
2244         size_t offset = 0;
2245         struct iova *iova = NULL;
2246         int ret;
2247         struct scatterlist *sg;
2248         unsigned long start_addr;
2249
2250         BUG_ON(dir == DMA_NONE);
2251         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2252                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2253
2254         domain = get_valid_domain_for_dev(pdev);
2255         if (!domain)
2256                 return 0;
2257
2258         for_each_sg(sglist, sg, nelems, i) {
2259                 addr = SG_ENT_VIRT_ADDRESS(sg);
2260                 addr = (void *)virt_to_phys(addr);
2261                 size += aligned_size((u64)addr, sg->length);
2262         }
2263
2264         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2265         if (!iova) {
2266                 sglist->dma_length = 0;
2267                 return 0;
2268         }
2269
2270         /*
2271          * Check whether the DMAR supports zero-length reads on write-only
2272          * mappings.
2273          */
2274         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2275                         !cap_zlr(domain->iommu->cap))
2276                 prot |= DMA_PTE_READ;
2277         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2278                 prot |= DMA_PTE_WRITE;
2279
2280         start_addr = iova->pfn_lo << PAGE_SHIFT;
2281         offset = 0;
2282         for_each_sg(sglist, sg, nelems, i) {
2283                 addr = SG_ENT_VIRT_ADDRESS(sg);
2284                 addr = (void *)virt_to_phys(addr);
2285                 size = aligned_size((u64)addr, sg->length);
2286                 ret = domain_page_mapping(domain, start_addr + offset,
2287                         ((u64)addr) & PAGE_MASK,
2288                         size, prot);
2289                 if (ret) {
2290                         /*  clear the page */
2291                         dma_pte_clear_range(domain, start_addr,
2292                                   start_addr + offset);
2293                         /* free page tables */
2294                         dma_pte_free_pagetable(domain, start_addr,
2295                                   start_addr + offset);
2296                         /* free iova */
2297                         __free_iova(&domain->iovad, iova);
2298                         return 0;
2299                 }
2300                 sg->dma_address = start_addr + offset +
2301                                 ((u64)addr & (~PAGE_MASK));
2302                 sg->dma_length = sg->length;
2303                 offset += size;
2304         }
2305
2306         /* it's a non-present to present mapping */
2307         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2308                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2309                 iommu_flush_write_buffer(domain->iommu);
2310         return nelems;
2311 }
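     /*
      * intel_map_sg() places all entries of the scatterlist into one
      * contiguous IOVA range and flushes the IOTLB once.  A minimal sketch of
      * a caller (hypothetical sglist/nents):
      *
      *      int mapped = dma_map_sg(&pdev->dev, sglist, nents, DMA_FROM_DEVICE);
      *      if (!mapped)
      *              ... handle the mapping failure ...
      *      ...
      *      dma_unmap_sg(&pdev->dev, sglist, nents, DMA_FROM_DEVICE);
      */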
2312
2313 static struct dma_mapping_ops intel_dma_ops = {
2314         .alloc_coherent = intel_alloc_coherent,
2315         .free_coherent = intel_free_coherent,
2316         .map_single = intel_map_single,
2317         .unmap_single = intel_unmap_single,
2318         .map_sg = intel_map_sg,
2319         .unmap_sg = intel_unmap_sg,
2320 };
2321
2322 static inline int iommu_domain_cache_init(void)
2323 {
2324         int ret = 0;
2325
2326         iommu_domain_cache = kmem_cache_create("iommu_domain",
2327                                          sizeof(struct dmar_domain),
2328                                          0,
2329                                          SLAB_HWCACHE_ALIGN,
2330                                          NULL);
2331
2332         if (!iommu_domain_cache) {
2333                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2334                 ret = -ENOMEM;
2335         }
2336
2337         return ret;
2338 }
2339
2340 static inline int iommu_devinfo_cache_init(void)
2341 {
2342         int ret = 0;
2343
2344         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2345                                          sizeof(struct device_domain_info),
2346                                          0,
2347                                          SLAB_HWCACHE_ALIGN,
2348                                          NULL);
2349         if (!iommu_devinfo_cache) {
2350                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2351                 ret = -ENOMEM;
2352         }
2353
2354         return ret;
2355 }
2356
2357 static inline int iommu_iova_cache_init(void)
2358 {
2359         int ret = 0;
2360
2361         iommu_iova_cache = kmem_cache_create("iommu_iova",
2362                                          sizeof(struct iova),
2363                                          0,
2364                                          SLAB_HWCACHE_ALIGN,
2365                                          NULL);
2366         if (!iommu_iova_cache) {
2367                 printk(KERN_ERR "Couldn't create iova cache\n");
2368                 ret = -ENOMEM;
2369         }
2370
2371         return ret;
2372 }
2373
2374 static int __init iommu_init_mempool(void)
2375 {
2376         int ret;
2377         ret = iommu_iova_cache_init();
2378         if (ret)
2379                 return ret;
2380
2381         ret = iommu_domain_cache_init();
2382         if (ret)
2383                 goto domain_error;
2384
2385         ret = iommu_devinfo_cache_init();
2386         if (!ret)
2387                 return ret;
2388
2389         kmem_cache_destroy(iommu_domain_cache);
2390 domain_error:
2391         kmem_cache_destroy(iommu_iova_cache);
2392
2393         return -ENOMEM;
2394 }
2395
2396 static void __init iommu_exit_mempool(void)
2397 {
2398         kmem_cache_destroy(iommu_devinfo_cache);
2399         kmem_cache_destroy(iommu_domain_cache);
2400         kmem_cache_destroy(iommu_iova_cache);
2401
2402 }
2403
2404 static void __init init_no_remapping_devices(void)
2405 {
2406         struct dmar_drhd_unit *drhd;
2407
2408         for_each_drhd_unit(drhd) {
2409                 if (!drhd->include_all) {
2410                         int i;
2411                         for (i = 0; i < drhd->devices_cnt; i++)
2412                                 if (drhd->devices[i] != NULL)
2413                                         break;
2414                         /* ignore this DMAR unit if no PCI devices exist under it */
2415                         if (i == drhd->devices_cnt)
2416                                 drhd->ignored = 1;
2417                 }
2418         }
2419
2420         if (dmar_map_gfx)
2421                 return;
2422
2423         for_each_drhd_unit(drhd) {
2424                 int i;
2425                 if (drhd->ignored || drhd->include_all)
2426                         continue;
2427
2428                 for (i = 0; i < drhd->devices_cnt; i++)
2429                         if (drhd->devices[i] &&
2430                                 !IS_GFX_DEVICE(drhd->devices[i]))
2431                                 break;
2432
2433                 if (i < drhd->devices_cnt)
2434                         continue;
2435
2436                 /* bypass IOMMU if it is just for gfx devices */
2437                 drhd->ignored = 1;
2438                 for (i = 0; i < drhd->devices_cnt; i++) {
2439                         if (!drhd->devices[i])
2440                                 continue;
2441                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2442                 }
2443         }
2444 }
2445
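     /*
      * intel_iommu_init() - entry point for DMA remapping setup, typically
      * called from the architecture's PCI/DMA initialization.  It parses the
      * DMAR table and device scopes, bails out if the IOMMU is disabled or
      * swiotlb is in use, sets up the mempools and reserved IOVA ranges,
      * programs the hardware via init_dmars(), and finally installs
      * intel_dma_ops as the global dma_ops with force_iommu set.
      */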
2446 int __init intel_iommu_init(void)
2447 {
2448         int ret = 0;
2449
2450         if (dmar_table_init())
2451                 return  -ENODEV;
2452
2453         if (dmar_dev_scope_init())
2454                 return  -ENODEV;
2455
2456         /*
2457          * Check the need for DMA-remapping initialization now.
2458          * The initialization above is also used by interrupt remapping.
2459          */
2460         if (no_iommu || swiotlb || dmar_disabled)
2461                 return -ENODEV;
2462
2463         iommu_init_mempool();
2464         dmar_init_reserved_ranges();
2465
2466         init_no_remapping_devices();
2467
2468         ret = init_dmars();
2469         if (ret) {
2470                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2471                 put_iova_domain(&reserved_iova_list);
2472                 iommu_exit_mempool();
2473                 return ret;
2474         }
2475         printk(KERN_INFO
2476         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2477
2478         init_timer(&unmap_timer);
2479         force_iommu = 1;
2480         dma_ops = &intel_dma_ops;
2481         return 0;
2482 }
2483
2484 void intel_iommu_domain_exit(struct dmar_domain *domain)
2485 {
2486         u64 end;
2487
2488         /* Domain 0 is reserved, so don't process it */
2489         if (!domain)
2490                 return;
2491
2492         end = DOMAIN_MAX_ADDR(domain->gaw);
2493         end = end & (~VTD_PAGE_MASK);
2494
2495         /* clear ptes */
2496         dma_pte_clear_range(domain, 0, end);
2497
2498         /* free page tables */
2499         dma_pte_free_pagetable(domain, 0, end);
2500
2501         iommu_free_domain(domain);
2502         free_domain_mem(domain);
2503 }
2504 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2505
2506 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2507 {
2508         struct dmar_drhd_unit *drhd;
2509         struct dmar_domain *domain;
2510         struct intel_iommu *iommu;
2511
2512         drhd = dmar_find_matched_drhd_unit(pdev);
2513         if (!drhd) {
2514                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2515                 return NULL;
2516         }
2517
2518         iommu = drhd->iommu;
2519         if (!iommu) {
2520                 printk(KERN_ERR
2521                         "intel_iommu_domain_alloc: iommu == NULL\n");
2522                 return NULL;
2523         }
2524         domain = iommu_alloc_domain(iommu);
2525         if (!domain) {
2526                 printk(KERN_ERR
2527                         "intel_iommu_domain_alloc: domain == NULL\n");
2528                 return NULL;
2529         }
2530         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2531                 printk(KERN_ERR
2532                         "intel_iommu_domain_alloc: domain_init() failed\n");
2533                 intel_iommu_domain_exit(domain);
2534                 return NULL;
2535         }
2536         return domain;
2537 }
2538 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2539
2540 int intel_iommu_context_mapping(
2541         struct dmar_domain *domain, struct pci_dev *pdev)
2542 {
2543         int rc;
2544         rc = domain_context_mapping(domain, pdev);
2545         return rc;
2546 }
2547 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2548
2549 int intel_iommu_page_mapping(
2550         struct dmar_domain *domain, dma_addr_t iova,
2551         u64 hpa, size_t size, int prot)
2552 {
2553         int rc;
2554         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2555         return rc;
2556 }
2557 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
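     /*
      * The intel_iommu_* exports above give an external user (e.g. device
      * assignment code) direct control over a domain.  A minimal sketch, with
      * hypothetical iova/hpa/size values:
      *
      *      struct dmar_domain *dom = intel_iommu_domain_alloc(pdev);
      *      if (dom && !intel_iommu_context_mapping(dom, pdev))
      *              intel_iommu_page_mapping(dom, iova, hpa, size,
      *                                       DMA_PTE_READ | DMA_PTE_WRITE);
      *      ...
      *      intel_iommu_domain_exit(dom);
      */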
2558
2559 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2560 {
2561         detach_domain_for_dev(domain, bus, devfn);
2562 }
2563 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2564
2565 struct dmar_domain *
2566 intel_iommu_find_domain(struct pci_dev *pdev)
2567 {
2568         return find_domain(pdev);
2569 }
2570 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2571
2572 int intel_iommu_found(void)
2573 {
2574         return g_num_of_iommus;
2575 }
2576 EXPORT_SYMBOL_GPL(intel_iommu_found);
2577
2578 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2579 {
2580         struct dma_pte *pte;
2581         u64 pfn;
2582
2583         pfn = 0;
2584         pte = addr_to_dma_pte(domain, iova);
2585
2586         if (pte)
2587                 pfn = dma_pte_addr(*pte);
2588
2589         return pfn >> VTD_PAGE_SHIFT;
2590 }
2591 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
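     /*
      * Example (illustrative): after a successful
      * intel_iommu_page_mapping(dom, iova, hpa, size, prot), a lookup with
      * intel_iommu_iova_to_pfn(dom, iova) returns hpa >> VTD_PAGE_SHIFT, or 0
      * if nothing is mapped at that IOVA.
      */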