1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
62
63 /*
64  * 0: Present
65  * 1-11: Reserved
66  * 12-63: Context Ptr (12 - (haw-1))
67  * 64-127: Reserved
68  */
69 struct root_entry {
70         u64     val;
71         u64     rsvd1;
72 };
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
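/*
 * For illustration: each root_entry is 16 bytes, so one 4KiB root table
 * holds 4096 / 16 = 256 entries -- one per PCI bus number, which is why
 * device_to_context_entry() below indexes root_entry[bus].
 */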
74 static inline bool root_present(struct root_entry *root)
75 {
76         return (root->val & 1);
77 }
78 static inline void set_root_present(struct root_entry *root)
79 {
80         root->val |= 1;
81 }
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
83 {
84         root->val |= value & VTD_PAGE_MASK;
85 }
86
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
89 {
90         return (struct context_entry *)
91                 (root_present(root)?phys_to_virt(
92                 root->val & VTD_PAGE_MASK) :
93                 NULL);
94 }
95
96 /*
97  * low 64 bits:
98  * 0: present
99  * 1: fault processing disable
100  * 2-3: translation type
101  * 12-63: address space root
102  * high 64 bits:
103  * 0-2: address width
104  * 3-6: avail (available for software use)
105  * 8-23: domain id
106  */
107 struct context_entry {
108         u64 lo;
109         u64 hi;
110 };
111
112 static inline bool context_present(struct context_entry *context)
113 {
114         return (context->lo & 1);
115 }
116 static inline void context_set_present(struct context_entry *context)
117 {
118         context->lo |= 1;
119 }
120
121 static inline void context_set_fault_enable(struct context_entry *context)
122 {
123         context->lo &= (((u64)-1) << 2) | 1;
124 }
125
126 #define CONTEXT_TT_MULTI_LEVEL 0
127
128 static inline void context_set_translation_type(struct context_entry *context,
129                                                 unsigned long value)
130 {
131         context->lo &= (((u64)-1) << 4) | 3;
132         context->lo |= (value & 3) << 2;
133 }
134
135 static inline void context_set_address_root(struct context_entry *context,
136                                             unsigned long value)
137 {
138         context->lo |= value & VTD_PAGE_MASK;
139 }
140
141 static inline void context_set_address_width(struct context_entry *context,
142                                              unsigned long value)
143 {
144         context->hi |= value & 7;
145 }
146
147 static inline void context_set_domain_id(struct context_entry *context,
148                                          unsigned long value)
149 {
150         context->hi |= (value & ((1 << 16) - 1)) << 8;
151 }
152
153 static inline void context_clear_entry(struct context_entry *context)
154 {
155         context->lo = 0;
156         context->hi = 0;
157 }
158
159 /*
160  * 0: readable
161  * 1: writable
162  * 2-6: reserved
163  * 7: super page
164  * 8-11: available
165  * 12-63: Host physical address
166  */
167 struct dma_pte {
168         u64 val;
169 };
170
171 static inline void dma_clear_pte(struct dma_pte *pte)
172 {
173         pte->val = 0;
174 }
175
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
177 {
178         pte->val |= DMA_PTE_READ;
179 }
180
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_WRITE;
184 }
185
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
187 {
188         pte->val = (pte->val & ~3) | (prot & 3);
189 }
190
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
192 {
193         return (pte->val & VTD_PAGE_MASK);
194 }
195
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
197 {
198         pte->val |= (addr & VTD_PAGE_MASK);
199 }
200
201 static inline bool dma_pte_present(struct dma_pte *pte)
202 {
203         return (pte->val & 3) != 0;
204 }
205
206 /* devices under the same p2p bridge are owned in one domain */
207 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
208
209 struct dmar_domain {
210         int     id;                     /* domain id */
211         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
212
213         struct list_head devices;       /* all devices' list */
214         struct iova_domain iovad;       /* iova's that belong to this domain */
215
216         struct dma_pte  *pgd;           /* virtual address */
217         spinlock_t      mapping_lock;   /* page table lock */
218         int             gaw;            /* max guest address width */
219
220         /* adjusted guest address width, 0 is level 2 30-bit */
221         int             agaw;
222
223         int             flags;          /* flags to find out type of domain */
224 };
225
226 /* PCI domain-device relationship */
227 struct device_domain_info {
228         struct list_head link;  /* link to domain siblings */
229         struct list_head global; /* link to global list */
230         u8 bus;                 /* PCI bus number */
231         u8 devfn;               /* PCI devfn number */
232         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
233         struct dmar_domain *domain; /* pointer to domain */
234 };
235
236 static void flush_unmaps_timeout(unsigned long data);
237
238 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
239
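/*
 * Deferred-unmap bookkeeping: rather than flushing the IOTLB on every
 * unmap, freed IOVAs can be queued (up to HIGH_WATER_MARK per table) and
 * released in batches from flush_unmaps_timeout(), unless intel_iommu=strict
 * forces synchronous flushing.
 */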
240 #define HIGH_WATER_MARK 250
241 struct deferred_flush_tables {
242         int next;
243         struct iova *iova[HIGH_WATER_MARK];
244         struct dmar_domain *domain[HIGH_WATER_MARK];
245 };
246
247 static struct deferred_flush_tables *deferred_flush;
248
249 /* bitmap for indexing intel_iommus */
250 static int g_num_of_iommus;
251
252 static DEFINE_SPINLOCK(async_umap_flush_lock);
253 static LIST_HEAD(unmaps_to_do);
254
255 static int timer_on;
256 static long list_size;
257
258 static void domain_remove_dev_info(struct dmar_domain *domain);
259
260 int dmar_disabled;
261 static int __initdata dmar_map_gfx = 1;
262 static int dmar_forcedac;
263 static int intel_iommu_strict;
264
265 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
266 static DEFINE_SPINLOCK(device_domain_lock);
267 static LIST_HEAD(device_domain_list);
268
269 static int __init intel_iommu_setup(char *str)
270 {
271         if (!str)
272                 return -EINVAL;
273         while (*str) {
274                 if (!strncmp(str, "off", 3)) {
275                         dmar_disabled = 1;
276                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
277                 } else if (!strncmp(str, "igfx_off", 8)) {
278                         dmar_map_gfx = 0;
279                         printk(KERN_INFO
280                                 "Intel-IOMMU: disable GFX device mapping\n");
281                 } else if (!strncmp(str, "forcedac", 8)) {
282                         printk(KERN_INFO
283                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
284                         dmar_forcedac = 1;
285                 } else if (!strncmp(str, "strict", 6)) {
286                         printk(KERN_INFO
287                                 "Intel-IOMMU: disable batched IOTLB flush\n");
288                         intel_iommu_strict = 1;
289                 }
290
291                 str += strcspn(str, ",");
292                 while (*str == ',')
293                         str++;
294         }
295         return 0;
296 }
297 __setup("intel_iommu=", intel_iommu_setup);
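/*
 * For example: booting with "intel_iommu=igfx_off,strict" is parsed above
 * as two comma-separated options, disabling GFX device mapping and the
 * batched IOTLB flush.
 */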
298
299 static struct kmem_cache *iommu_domain_cache;
300 static struct kmem_cache *iommu_devinfo_cache;
301 static struct kmem_cache *iommu_iova_cache;
302
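/*
 * Allocation helpers: PF_MEMALLOC is set around the GFP_ATOMIC allocations
 * below so they may dip into memory reserves; the saved bit is then
 * restored (the final mask is a no-op when the flag was already set).
 */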
303 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
304 {
305         unsigned int flags;
306         void *vaddr;
307
308         /* trying to avoid low memory issues */
309         flags = current->flags & PF_MEMALLOC;
310         current->flags |= PF_MEMALLOC;
311         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
312         current->flags &= (~PF_MEMALLOC | flags);
313         return vaddr;
314 }
315
316
317 static inline void *alloc_pgtable_page(void)
318 {
319         unsigned int flags;
320         void *vaddr;
321
322         /* trying to avoid low memory issues */
323         flags = current->flags & PF_MEMALLOC;
324         current->flags |= PF_MEMALLOC;
325         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
326         current->flags &= (~PF_MEMALLOC | flags);
327         return vaddr;
328 }
329
330 static inline void free_pgtable_page(void *vaddr)
331 {
332         free_page((unsigned long)vaddr);
333 }
334
335 static inline void *alloc_domain_mem(void)
336 {
337         return iommu_kmem_cache_alloc(iommu_domain_cache);
338 }
339
340 static void free_domain_mem(void *vaddr)
341 {
342         kmem_cache_free(iommu_domain_cache, vaddr);
343 }
344
345 static inline void * alloc_devinfo_mem(void)
346 {
347         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
348 }
349
350 static inline void free_devinfo_mem(void *vaddr)
351 {
352         kmem_cache_free(iommu_devinfo_cache, vaddr);
353 }
354
355 struct iova *alloc_iova_mem(void)
356 {
357         return iommu_kmem_cache_alloc(iommu_iova_cache);
358 }
359
360 void free_iova_mem(struct iova *iova)
361 {
362         kmem_cache_free(iommu_iova_cache, iova);
363 }
364
365
366 static inline int width_to_agaw(int width);
367
368 /* Calculate agaw for each iommu.
369  * "SAGAW" may differ across iommus; use a default agaw, and fall back
370  * to the largest smaller supported agaw for iommus lacking the default.
371  */
372 int iommu_calculate_agaw(struct intel_iommu *iommu)
373 {
374         unsigned long sagaw;
375         int agaw = -1;
376
377         sagaw = cap_sagaw(iommu->cap);
378         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
379              agaw >= 0; agaw--) {
380                 if (test_bit(agaw, &sagaw))
381                         break;
382         }
383
384         return agaw;
385 }
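/*
 * For illustration: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48 the loop starts
 * at agaw 2 (a 4-level table), falls back to smaller agaws that SAGAW does
 * advertise, and returns -1 if none of them is supported.
 */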
386
387 /* in native case, each domain is related to only one iommu */
388 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
389 {
390         int iommu_id;
391
392         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
393         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
394                 return NULL;
395
396         return g_iommus[iommu_id];
397 }
398
399 /* Gets context entry for a given bus and devfn */
400 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
401                 u8 bus, u8 devfn)
402 {
403         struct root_entry *root;
404         struct context_entry *context;
405         unsigned long phy_addr;
406         unsigned long flags;
407
408         spin_lock_irqsave(&iommu->lock, flags);
409         root = &iommu->root_entry[bus];
410         context = get_context_addr_from_root(root);
411         if (!context) {
412                 context = (struct context_entry *)alloc_pgtable_page();
413                 if (!context) {
414                         spin_unlock_irqrestore(&iommu->lock, flags);
415                         return NULL;
416                 }
417                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
418                 phy_addr = virt_to_phys((void *)context);
419                 set_root_value(root, phy_addr);
420                 set_root_present(root);
421                 __iommu_flush_cache(iommu, root, sizeof(*root));
422         }
423         spin_unlock_irqrestore(&iommu->lock, flags);
424         return &context[devfn];
425 }
426
427 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
428 {
429         struct root_entry *root;
430         struct context_entry *context;
431         int ret;
432         unsigned long flags;
433
434         spin_lock_irqsave(&iommu->lock, flags);
435         root = &iommu->root_entry[bus];
436         context = get_context_addr_from_root(root);
437         if (!context) {
438                 ret = 0;
439                 goto out;
440         }
441         ret = context_present(&context[devfn]);
442 out:
443         spin_unlock_irqrestore(&iommu->lock, flags);
444         return ret;
445 }
446
447 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
448 {
449         struct root_entry *root;
450         struct context_entry *context;
451         unsigned long flags;
452
453         spin_lock_irqsave(&iommu->lock, flags);
454         root = &iommu->root_entry[bus];
455         context = get_context_addr_from_root(root);
456         if (context) {
457                 context_clear_entry(&context[devfn]);
458                 __iommu_flush_cache(iommu, &context[devfn], \
459                         sizeof(*context));
460         }
461         spin_unlock_irqrestore(&iommu->lock, flags);
462 }
463
464 static void free_context_table(struct intel_iommu *iommu)
465 {
466         struct root_entry *root;
467         int i;
468         unsigned long flags;
469         struct context_entry *context;
470
471         spin_lock_irqsave(&iommu->lock, flags);
472         if (!iommu->root_entry) {
473                 goto out;
474         }
475         for (i = 0; i < ROOT_ENTRY_NR; i++) {
476                 root = &iommu->root_entry[i];
477                 context = get_context_addr_from_root(root);
478                 if (context)
479                         free_pgtable_page(context);
480         }
481         free_pgtable_page(iommu->root_entry);
482         iommu->root_entry = NULL;
483 out:
484         spin_unlock_irqrestore(&iommu->lock, flags);
485 }
486
487 /* page table handling */
488 #define LEVEL_STRIDE            (9)
489 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
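/*
 * For illustration: a dma_pte is 8 bytes, so one 4KiB page-table page holds
 * 512 entries; each level therefore decodes 9 address bits, hence
 * LEVEL_STRIDE == 9 and LEVEL_MASK == 0x1ff.
 */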
490
491 static inline int agaw_to_level(int agaw)
492 {
493         return agaw + 2;
494 }
495
496 static inline int agaw_to_width(int agaw)
497 {
498         return 30 + agaw * LEVEL_STRIDE;
499
500 }
501
502 static inline int width_to_agaw(int width)
503 {
504         return (width - 30) / LEVEL_STRIDE;
505 }
506
507 static inline unsigned int level_to_offset_bits(int level)
508 {
509         return (12 + (level - 1) * LEVEL_STRIDE);
510 }
511
512 static inline int address_level_offset(u64 addr, int level)
513 {
514         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
515 }
516
517 static inline u64 level_mask(int level)
518 {
519         return ((u64)-1 << level_to_offset_bits(level));
520 }
521
522 static inline u64 level_size(int level)
523 {
524         return ((u64)1 << level_to_offset_bits(level));
525 }
526
527 static inline u64 align_to_level(u64 addr, int level)
528 {
529         return ((addr + level_size(level) - 1) & level_mask(level));
530 }
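/*
 * For reference, the helpers above give:
 *   agaw 0 -> 2-level table -> 30-bit address width
 *   agaw 1 -> 3-level table -> 39-bit address width
 *   agaw 2 -> 4-level table -> 48-bit address width
 *   agaw 3 -> 5-level table -> 57-bit address width
 */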
531
532 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
533 {
534         int addr_width = agaw_to_width(domain->agaw);
535         struct dma_pte *parent, *pte = NULL;
536         int level = agaw_to_level(domain->agaw);
537         int offset;
538         unsigned long flags;
539         struct intel_iommu *iommu = domain_get_iommu(domain);
540
541         BUG_ON(!domain->pgd);
542
543         addr &= (((u64)1) << addr_width) - 1;
544         parent = domain->pgd;
545
546         spin_lock_irqsave(&domain->mapping_lock, flags);
547         while (level > 0) {
548                 void *tmp_page;
549
550                 offset = address_level_offset(addr, level);
551                 pte = &parent[offset];
552                 if (level == 1)
553                         break;
554
555                 if (!dma_pte_present(pte)) {
556                         tmp_page = alloc_pgtable_page();
557
558                         if (!tmp_page) {
559                                 spin_unlock_irqrestore(&domain->mapping_lock,
560                                         flags);
561                                 return NULL;
562                         }
563                         __iommu_flush_cache(iommu, tmp_page,
564                                         PAGE_SIZE);
565                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
566                         /*
567                          * higher level tables always set r/w; the last
568                          * level page table controls read/write
569                          */
570                         dma_set_pte_readable(pte);
571                         dma_set_pte_writable(pte);
572                         __iommu_flush_cache(iommu, pte, sizeof(*pte));
573                 }
574                 parent = phys_to_virt(dma_pte_addr(pte));
575                 level--;
576         }
577
578         spin_unlock_irqrestore(&domain->mapping_lock, flags);
579         return pte;
580 }
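/*
 * For illustration: with a 4-level table (agaw 2) the walk above indexes
 * levels 4..1 with address bits 47:39, 38:30, 29:21 and 20:12 respectively
 * (see level_to_offset_bits()), allocating missing intermediate pages on
 * the way down.
 */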
581
582 /* return address's pte at specific level */
583 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
584                 int level)
585 {
586         struct dma_pte *parent, *pte = NULL;
587         int total = agaw_to_level(domain->agaw);
588         int offset;
589
590         parent = domain->pgd;
591         while (level <= total) {
592                 offset = address_level_offset(addr, total);
593                 pte = &parent[offset];
594                 if (level == total)
595                         return pte;
596
597                 if (!dma_pte_present(pte))
598                         break;
599                 parent = phys_to_virt(dma_pte_addr(pte));
600                 total--;
601         }
602         return NULL;
603 }
604
605 /* clear one page's page table */
606 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
607 {
608         struct dma_pte *pte = NULL;
609         struct intel_iommu *iommu = domain_get_iommu(domain);
610
611         /* get last level pte */
612         pte = dma_addr_level_pte(domain, addr, 1);
613
614         if (pte) {
615                 dma_clear_pte(pte);
616                 __iommu_flush_cache(iommu, pte, sizeof(*pte));
617         }
618 }
619
620 /* clear last level pte; a tlb flush should follow */
621 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
622 {
623         int addr_width = agaw_to_width(domain->agaw);
624
625         start &= (((u64)1) << addr_width) - 1;
626         end &= (((u64)1) << addr_width) - 1;
627         /* in case it's partial page */
628         start = PAGE_ALIGN(start);
629         end &= PAGE_MASK;
630
631         /* we don't need lock here, nobody else touches the iova range */
632         while (start < end) {
633                 dma_pte_clear_one(domain, start);
634                 start += VTD_PAGE_SIZE;
635         }
636 }
637
638 /* free page table pages. last level pte should already be cleared */
639 static void dma_pte_free_pagetable(struct dmar_domain *domain,
640         u64 start, u64 end)
641 {
642         int addr_width = agaw_to_width(domain->agaw);
643         struct dma_pte *pte;
644         int total = agaw_to_level(domain->agaw);
645         int level;
646         u64 tmp;
647         struct intel_iommu *iommu = domain_get_iommu(domain);
648
649         start &= (((u64)1) << addr_width) - 1;
650         end &= (((u64)1) << addr_width) - 1;
651
652         /* we don't need lock here, nobody else touches the iova range */
653         level = 2;
654         while (level <= total) {
655                 tmp = align_to_level(start, level);
656                 if (tmp >= end || (tmp + level_size(level) > end))
657                         return;
658
659                 while (tmp < end) {
660                         pte = dma_addr_level_pte(domain, tmp, level);
661                         if (pte) {
662                                 free_pgtable_page(
663                                         phys_to_virt(dma_pte_addr(pte)));
664                                 dma_clear_pte(pte);
665                                 __iommu_flush_cache(iommu,
666                                                 pte, sizeof(*pte));
667                         }
668                         tmp += level_size(level);
669                 }
670                 level++;
671         }
672         /* free pgd */
673         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
674                 free_pgtable_page(domain->pgd);
675                 domain->pgd = NULL;
676         }
677 }
678
679 /* iommu handling */
680 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
681 {
682         struct root_entry *root;
683         unsigned long flags;
684
685         root = (struct root_entry *)alloc_pgtable_page();
686         if (!root)
687                 return -ENOMEM;
688
689         __iommu_flush_cache(iommu, root, ROOT_SIZE);
690
691         spin_lock_irqsave(&iommu->lock, flags);
692         iommu->root_entry = root;
693         spin_unlock_irqrestore(&iommu->lock, flags);
694
695         return 0;
696 }
697
698 static void iommu_set_root_entry(struct intel_iommu *iommu)
699 {
700         void *addr;
701         u32 cmd, sts;
702         unsigned long flag;
703
704         addr = iommu->root_entry;
705
706         spin_lock_irqsave(&iommu->register_lock, flag);
707         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
708
709         cmd = iommu->gcmd | DMA_GCMD_SRTP;
710         writel(cmd, iommu->reg + DMAR_GCMD_REG);
711
712         /* Make sure hardware completes it */
713         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
714                 readl, (sts & DMA_GSTS_RTPS), sts);
715
716         spin_unlock_irqrestore(&iommu->register_lock, flag);
717 }
718
719 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
720 {
721         u32 val;
722         unsigned long flag;
723
724         if (!cap_rwbf(iommu->cap))
725                 return;
726         val = iommu->gcmd | DMA_GCMD_WBF;
727
728         spin_lock_irqsave(&iommu->register_lock, flag);
729         writel(val, iommu->reg + DMAR_GCMD_REG);
730
731         /* Make sure hardware completes it */
732         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
733                         readl, (!(val & DMA_GSTS_WBFS)), val);
734
735         spin_unlock_irqrestore(&iommu->register_lock, flag);
736 }
737
738 /* return value determines whether we need a write buffer flush */
739 static int __iommu_flush_context(struct intel_iommu *iommu,
740         u16 did, u16 source_id, u8 function_mask, u64 type,
741         int non_present_entry_flush)
742 {
743         u64 val = 0;
744         unsigned long flag;
745
746         /*
747          * In the non-present entry flush case: if the hardware doesn't
748          * cache non-present entries we do nothing; if it does, we flush
749          * the entries of domain 0 (the domain id used to cache any
750          * non-present entries).
751          */
752         if (non_present_entry_flush) {
753                 if (!cap_caching_mode(iommu->cap))
754                         return 1;
755                 else
756                         did = 0;
757         }
758
759         switch (type) {
760         case DMA_CCMD_GLOBAL_INVL:
761                 val = DMA_CCMD_GLOBAL_INVL;
762                 break;
763         case DMA_CCMD_DOMAIN_INVL:
764                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
765                 break;
766         case DMA_CCMD_DEVICE_INVL:
767                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
768                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
769                 break;
770         default:
771                 BUG();
772         }
773         val |= DMA_CCMD_ICC;
774
775         spin_lock_irqsave(&iommu->register_lock, flag);
776         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
777
778         /* Make sure hardware completes it */
779         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
780                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
781
782         spin_unlock_irqrestore(&iommu->register_lock, flag);
783
784         /* flush context entry will implicitly flush write buffer */
785         return 0;
786 }
787
788 /* return value determines whether we need a write buffer flush */
789 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
790         u64 addr, unsigned int size_order, u64 type,
791         int non_present_entry_flush)
792 {
793         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
794         u64 val = 0, val_iva = 0;
795         unsigned long flag;
796
797         /*
798          * In the non-present entry flush case: if the hardware doesn't
799          * cache non-present entries we do nothing; if it does, we flush
800          * the entries of domain 0 (the domain id used to cache any
801          * non-present entries).
802          */
803         if (non_present_entry_flush) {
804                 if (!cap_caching_mode(iommu->cap))
805                         return 1;
806                 else
807                         did = 0;
808         }
809
810         switch (type) {
811         case DMA_TLB_GLOBAL_FLUSH:
812                 /* global flush doesn't need set IVA_REG */
813                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
814                 break;
815         case DMA_TLB_DSI_FLUSH:
816                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
817                 break;
818         case DMA_TLB_PSI_FLUSH:
819                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
820                 /* Note: always flush non-leaf currently */
821                 val_iva = size_order | addr;
822                 break;
823         default:
824                 BUG();
825         }
826         /* Note: set drain read/write */
827 #if 0
828         /*
829          * This is probably to be super secure.. Looks like we can
830          * ignore it without any impact.
831          */
832         if (cap_read_drain(iommu->cap))
833                 val |= DMA_TLB_READ_DRAIN;
834 #endif
835         if (cap_write_drain(iommu->cap))
836                 val |= DMA_TLB_WRITE_DRAIN;
837
838         spin_lock_irqsave(&iommu->register_lock, flag);
839         /* Note: Only uses first TLB reg currently */
840         if (val_iva)
841                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
842         dmar_writeq(iommu->reg + tlb_offset + 8, val);
843
844         /* Make sure hardware completes it */
845         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
846                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
847
848         spin_unlock_irqrestore(&iommu->register_lock, flag);
849
850         /* check IOTLB invalidation granularity */
851         if (DMA_TLB_IAIG(val) == 0)
852                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
853         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
854                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
855                         (unsigned long long)DMA_TLB_IIRG(type),
856                         (unsigned long long)DMA_TLB_IAIG(val));
857         /* flush iotlb entry will implicitly flush write buffer */
858         return 0;
859 }
860
861 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
862         u64 addr, unsigned int pages, int non_present_entry_flush)
863 {
864         unsigned int mask;
865
866         BUG_ON(addr & (~VTD_PAGE_MASK));
867         BUG_ON(pages == 0);
868
869         /* Fallback to domain selective flush if no PSI support */
870         if (!cap_pgsel_inv(iommu->cap))
871                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
872                                                 DMA_TLB_DSI_FLUSH,
873                                                 non_present_entry_flush);
874
875         /*
876          * PSI requires page size to be 2 ^ x, and the base address is naturally
877          * aligned to the size
878          */
879         mask = ilog2(__roundup_pow_of_two(pages));
880         /* Fallback to domain selective flush if size is too big */
881         if (mask > cap_max_amask_val(iommu->cap))
882                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
883                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
884
885         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
886                                         DMA_TLB_PSI_FLUSH,
887                                         non_present_entry_flush);
888 }
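/*
 * For illustration: flushing 5 pages rounds up to 8, so mask = 3 and the
 * PSI covers eight naturally aligned pages; if mask would exceed
 * cap_max_amask_val() the code above degrades to a domain-selective flush.
 */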
889
890 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
891 {
892         u32 pmen;
893         unsigned long flags;
894
895         spin_lock_irqsave(&iommu->register_lock, flags);
896         pmen = readl(iommu->reg + DMAR_PMEN_REG);
897         pmen &= ~DMA_PMEN_EPM;
898         writel(pmen, iommu->reg + DMAR_PMEN_REG);
899
900         /* wait for the protected region status bit to clear */
901         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
902                 readl, !(pmen & DMA_PMEN_PRS), pmen);
903
904         spin_unlock_irqrestore(&iommu->register_lock, flags);
905 }
906
907 static int iommu_enable_translation(struct intel_iommu *iommu)
908 {
909         u32 sts;
910         unsigned long flags;
911
912         spin_lock_irqsave(&iommu->register_lock, flags);
913         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
914
915         /* Make sure hardware completes it */
916         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
917                 readl, (sts & DMA_GSTS_TES), sts);
918
919         iommu->gcmd |= DMA_GCMD_TE;
920         spin_unlock_irqrestore(&iommu->register_lock, flags);
921         return 0;
922 }
923
924 static int iommu_disable_translation(struct intel_iommu *iommu)
925 {
926         u32 sts;
927         unsigned long flag;
928
929         spin_lock_irqsave(&iommu->register_lock, flag);
930         iommu->gcmd &= ~DMA_GCMD_TE;
931         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
932
933         /* Make sure hardware completes it */
934         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
935                 readl, (!(sts & DMA_GSTS_TES)), sts);
936
937         spin_unlock_irqrestore(&iommu->register_lock, flag);
938         return 0;
939 }
940
941 /* iommu interrupt handling. Most of it is MSI-like. */
942
943 static const char *fault_reason_strings[] =
944 {
945         "Software",
946         "Present bit in root entry is clear",
947         "Present bit in context entry is clear",
948         "Invalid context entry",
949         "Access beyond MGAW",
950         "PTE Write access is not set",
951         "PTE Read access is not set",
952         "Next page table ptr is invalid",
953         "Root table address invalid",
954         "Context table ptr is invalid",
955         "non-zero reserved fields in RTP",
956         "non-zero reserved fields in CTP",
957         "non-zero reserved fields in PTE",
958 };
959 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
960
961 const char *dmar_get_fault_reason(u8 fault_reason)
962 {
963         if (fault_reason > MAX_FAULT_REASON_IDX)
964                 return "Unknown";
965         else
966                 return fault_reason_strings[fault_reason];
967 }
968
969 void dmar_msi_unmask(unsigned int irq)
970 {
971         struct intel_iommu *iommu = get_irq_data(irq);
972         unsigned long flag;
973
974         /* unmask it */
975         spin_lock_irqsave(&iommu->register_lock, flag);
976         writel(0, iommu->reg + DMAR_FECTL_REG);
977         /* Read a reg to force flush the post write */
978         readl(iommu->reg + DMAR_FECTL_REG);
979         spin_unlock_irqrestore(&iommu->register_lock, flag);
980 }
981
982 void dmar_msi_mask(unsigned int irq)
983 {
984         unsigned long flag;
985         struct intel_iommu *iommu = get_irq_data(irq);
986
987         /* mask it */
988         spin_lock_irqsave(&iommu->register_lock, flag);
989         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
990         /* Read a reg to force flush the post write */
991         readl(iommu->reg + DMAR_FECTL_REG);
992         spin_unlock_irqrestore(&iommu->register_lock, flag);
993 }
994
995 void dmar_msi_write(int irq, struct msi_msg *msg)
996 {
997         struct intel_iommu *iommu = get_irq_data(irq);
998         unsigned long flag;
999
1000         spin_lock_irqsave(&iommu->register_lock, flag);
1001         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1002         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1003         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1004         spin_unlock_irqrestore(&iommu->register_lock, flag);
1005 }
1006
1007 void dmar_msi_read(int irq, struct msi_msg *msg)
1008 {
1009         struct intel_iommu *iommu = get_irq_data(irq);
1010         unsigned long flag;
1011
1012         spin_lock_irqsave(&iommu->register_lock, flag);
1013         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1014         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1015         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1016         spin_unlock_irqrestore(&iommu->register_lock, flag);
1017 }
1018
1019 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1020                 u8 fault_reason, u16 source_id, unsigned long long addr)
1021 {
1022         const char *reason;
1023
1024         reason = dmar_get_fault_reason(fault_reason);
1025
1026         printk(KERN_ERR
1027                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1028                 "fault addr %llx \n"
1029                 "DMAR:[fault reason %02d] %s\n",
1030                 (type ? "DMA Read" : "DMA Write"),
1031                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1032                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1033         return 0;
1034 }
1035
1036 #define PRIMARY_FAULT_REG_LEN (16)
1037 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1038 {
1039         struct intel_iommu *iommu = dev_id;
1040         int reg, fault_index;
1041         u32 fault_status;
1042         unsigned long flag;
1043
1044         spin_lock_irqsave(&iommu->register_lock, flag);
1045         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1046
1047         /* TBD: ignore advanced fault log currently */
1048         if (!(fault_status & DMA_FSTS_PPF))
1049                 goto clear_overflow;
1050
1051         fault_index = dma_fsts_fault_record_index(fault_status);
1052         reg = cap_fault_reg_offset(iommu->cap);
1053         while (1) {
1054                 u8 fault_reason;
1055                 u16 source_id;
1056                 u64 guest_addr;
1057                 int type;
1058                 u32 data;
1059
1060                 /* highest 32 bits */
1061                 data = readl(iommu->reg + reg +
1062                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1063                 if (!(data & DMA_FRCD_F))
1064                         break;
1065
1066                 fault_reason = dma_frcd_fault_reason(data);
1067                 type = dma_frcd_type(data);
1068
1069                 data = readl(iommu->reg + reg +
1070                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1071                 source_id = dma_frcd_source_id(data);
1072
1073                 guest_addr = dmar_readq(iommu->reg + reg +
1074                                 fault_index * PRIMARY_FAULT_REG_LEN);
1075                 guest_addr = dma_frcd_page_addr(guest_addr);
1076                 /* clear the fault */
1077                 writel(DMA_FRCD_F, iommu->reg + reg +
1078                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1079
1080                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1081
1082                 iommu_page_fault_do_one(iommu, type, fault_reason,
1083                                 source_id, guest_addr);
1084
1085                 fault_index++;
1086                 if (fault_index > cap_num_fault_regs(iommu->cap))
1087                         fault_index = 0;
1088                 spin_lock_irqsave(&iommu->register_lock, flag);
1089         }
1090 clear_overflow:
1091         /* clear primary fault overflow */
1092         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1093         if (fault_status & DMA_FSTS_PFO)
1094                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1095
1096         spin_unlock_irqrestore(&iommu->register_lock, flag);
1097         return IRQ_HANDLED;
1098 }
1099
1100 int dmar_set_interrupt(struct intel_iommu *iommu)
1101 {
1102         int irq, ret;
1103
1104         irq = create_irq();
1105         if (!irq) {
1106                 printk(KERN_ERR "IOMMU: no free vectors\n");
1107                 return -EINVAL;
1108         }
1109
1110         set_irq_data(irq, iommu);
1111         iommu->irq = irq;
1112
1113         ret = arch_setup_dmar_msi(irq);
1114         if (ret) {
1115                 set_irq_data(irq, NULL);
1116                 iommu->irq = 0;
1117                 destroy_irq(irq);
1118                 return 0;
1119         }
1120
1121         /* Force the fault register to be cleared */
1122         iommu_page_fault(irq, iommu);
1123
1124         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1125         if (ret)
1126                 printk(KERN_ERR "IOMMU: can't request irq\n");
1127         return ret;
1128 }
1129
1130 static int iommu_init_domains(struct intel_iommu *iommu)
1131 {
1132         unsigned long ndomains;
1133         unsigned long nlongs;
1134
1135         ndomains = cap_ndoms(iommu->cap);
1136         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1137         nlongs = BITS_TO_LONGS(ndomains);
1138
1139         /* TBD: there might be 64K domains,
1140          * consider other allocation for future chip
1141          */
1142         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1143         if (!iommu->domain_ids) {
1144                 printk(KERN_ERR "Allocating domain id array failed\n");
1145                 return -ENOMEM;
1146         }
1147         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1148                         GFP_KERNEL);
1149         if (!iommu->domains) {
1150                 printk(KERN_ERR "Allocating domain array failed\n");
1151                 kfree(iommu->domain_ids);
1152                 return -ENOMEM;
1153         }
1154
1155         spin_lock_init(&iommu->lock);
1156
1157         /*
1158          * if Caching mode is set, then invalid translations are tagged
1159          * with domainid 0. Hence we need to pre-allocate it.
1160          */
1161         if (cap_caching_mode(iommu->cap))
1162                 set_bit(0, iommu->domain_ids);
1163         return 0;
1164 }
1165
1166
1167 static void domain_exit(struct dmar_domain *domain);
1168
1169 void free_dmar_iommu(struct intel_iommu *iommu)
1170 {
1171         struct dmar_domain *domain;
1172         int i;
1173
1174         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1175         for (; i < cap_ndoms(iommu->cap); ) {
1176                 domain = iommu->domains[i];
1177                 clear_bit(i, iommu->domain_ids);
1178                 domain_exit(domain);
1179                 i = find_next_bit(iommu->domain_ids,
1180                         cap_ndoms(iommu->cap), i+1);
1181         }
1182
1183         if (iommu->gcmd & DMA_GCMD_TE)
1184                 iommu_disable_translation(iommu);
1185
1186         if (iommu->irq) {
1187                 set_irq_data(iommu->irq, NULL);
1188                 /* This will mask the irq */
1189                 free_irq(iommu->irq, iommu);
1190                 destroy_irq(iommu->irq);
1191         }
1192
1193         kfree(iommu->domains);
1194         kfree(iommu->domain_ids);
1195
1196         g_iommus[iommu->seq_id] = NULL;
1197
1198         /* if all iommus are freed, free g_iommus */
1199         for (i = 0; i < g_num_of_iommus; i++) {
1200                 if (g_iommus[i])
1201                         break;
1202         }
1203
1204         if (i == g_num_of_iommus)
1205                 kfree(g_iommus);
1206
1207         /* free context mapping */
1208         free_context_table(iommu);
1209 }
1210
1211 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1212 {
1213         unsigned long num;
1214         unsigned long ndomains;
1215         struct dmar_domain *domain;
1216         unsigned long flags;
1217
1218         domain = alloc_domain_mem();
1219         if (!domain)
1220                 return NULL;
1221
1222         ndomains = cap_ndoms(iommu->cap);
1223
1224         spin_lock_irqsave(&iommu->lock, flags);
1225         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1226         if (num >= ndomains) {
1227                 spin_unlock_irqrestore(&iommu->lock, flags);
1228                 free_domain_mem(domain);
1229                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1230                 return NULL;
1231         }
1232
1233         set_bit(num, iommu->domain_ids);
1234         domain->id = num;
1235         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1236         set_bit(iommu->seq_id, &domain->iommu_bmp);
1237         domain->flags = 0;
1238         iommu->domains[num] = domain;
1239         spin_unlock_irqrestore(&iommu->lock, flags);
1240
1241         return domain;
1242 }
1243
1244 static void iommu_free_domain(struct dmar_domain *domain)
1245 {
1246         unsigned long flags;
1247         struct intel_iommu *iommu;
1248
1249         iommu = domain_get_iommu(domain);
1250
1251         spin_lock_irqsave(&iommu->lock, flags);
1252         clear_bit(domain->id, iommu->domain_ids);
1253         spin_unlock_irqrestore(&iommu->lock, flags);
1254 }
1255
1256 static struct iova_domain reserved_iova_list;
1257 static struct lock_class_key reserved_alloc_key;
1258 static struct lock_class_key reserved_rbtree_key;
1259
1260 static void dmar_init_reserved_ranges(void)
1261 {
1262         struct pci_dev *pdev = NULL;
1263         struct iova *iova;
1264         int i;
1265         u64 addr, size;
1266
1267         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1268
1269         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1270                 &reserved_alloc_key);
1271         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1272                 &reserved_rbtree_key);
1273
1274         /* IOAPIC ranges shouldn't be accessed by DMA */
1275         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1276                 IOVA_PFN(IOAPIC_RANGE_END));
1277         if (!iova)
1278                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1279
1280         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1281         for_each_pci_dev(pdev) {
1282                 struct resource *r;
1283
1284                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1285                         r = &pdev->resource[i];
1286                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1287                                 continue;
1288                         addr = r->start;
1289                         addr &= PAGE_MASK;
1290                         size = r->end - addr;
1291                         size = PAGE_ALIGN(size);
1292                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1293                                 IOVA_PFN(size + addr) - 1);
1294                         if (!iova)
1295                                 printk(KERN_ERR "Reserve iova failed\n");
1296                 }
1297         }
1298
1299 }
1300
1301 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1302 {
1303         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1304 }
1305
1306 static inline int guestwidth_to_adjustwidth(int gaw)
1307 {
1308         int agaw;
1309         int r = (gaw - 12) % 9;
1310
1311         if (r == 0)
1312                 agaw = gaw;
1313         else
1314                 agaw = gaw + 9 - r;
1315         if (agaw > 64)
1316                 agaw = 64;
1317         return agaw;
1318 }
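/*
 * For illustration: guestwidth_to_adjustwidth() rounds the guest width up
 * to the next width the 9-bit page-table levels can express, e.g. 48 stays
 * 48 while 32 becomes 32 + 9 - ((32 - 12) % 9) = 39.
 */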
1319
1320 static int domain_init(struct dmar_domain *domain, int guest_width)
1321 {
1322         struct intel_iommu *iommu;
1323         int adjust_width, agaw;
1324         unsigned long sagaw;
1325
1326         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1327         spin_lock_init(&domain->mapping_lock);
1328
1329         domain_reserve_special_ranges(domain);
1330
1331         /* calculate AGAW */
1332         iommu = domain_get_iommu(domain);
1333         if (guest_width > cap_mgaw(iommu->cap))
1334                 guest_width = cap_mgaw(iommu->cap);
1335         domain->gaw = guest_width;
1336         adjust_width = guestwidth_to_adjustwidth(guest_width);
1337         agaw = width_to_agaw(adjust_width);
1338         sagaw = cap_sagaw(iommu->cap);
1339         if (!test_bit(agaw, &sagaw)) {
1340                 /* hardware doesn't support it, choose a bigger one */
1341                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1342                 agaw = find_next_bit(&sagaw, 5, agaw);
1343                 if (agaw >= 5)
1344                         return -ENODEV;
1345         }
1346         domain->agaw = agaw;
1347         INIT_LIST_HEAD(&domain->devices);
1348
1349         /* always allocate the top pgd */
1350         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1351         if (!domain->pgd)
1352                 return -ENOMEM;
1353         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1354         return 0;
1355 }
1356
1357 static void domain_exit(struct dmar_domain *domain)
1358 {
1359         u64 end;
1360
1361         /* Domain 0 is reserved, so don't process it */
1362         if (!domain)
1363                 return;
1364
1365         domain_remove_dev_info(domain);
1366         /* destroy iovas */
1367         put_iova_domain(&domain->iovad);
1368         end = DOMAIN_MAX_ADDR(domain->gaw);
1369         end = end & (~PAGE_MASK);
1370
1371         /* clear ptes */
1372         dma_pte_clear_range(domain, 0, end);
1373
1374         /* free page tables */
1375         dma_pte_free_pagetable(domain, 0, end);
1376
1377         iommu_free_domain(domain);
1378         free_domain_mem(domain);
1379 }
1380
1381 static int domain_context_mapping_one(struct dmar_domain *domain,
1382                 u8 bus, u8 devfn)
1383 {
1384         struct context_entry *context;
1385         struct intel_iommu *iommu = domain_get_iommu(domain);
1386         unsigned long flags;
1387
1388         pr_debug("Set context mapping for %02x:%02x.%d\n",
1389                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1390         BUG_ON(!domain->pgd);
1391         context = device_to_context_entry(iommu, bus, devfn);
1392         if (!context)
1393                 return -ENOMEM;
1394         spin_lock_irqsave(&iommu->lock, flags);
1395         if (context_present(context)) {
1396                 spin_unlock_irqrestore(&iommu->lock, flags);
1397                 return 0;
1398         }
1399
1400         context_set_domain_id(context, domain->id);
1401         context_set_address_width(context, domain->agaw);
1402         context_set_address_root(context, virt_to_phys(domain->pgd));
1403         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1404         context_set_fault_enable(context);
1405         context_set_present(context);
1406         __iommu_flush_cache(iommu, context, sizeof(*context));
1407
1408         /* it's a non-present to present mapping */
1409         if (iommu->flush.flush_context(iommu, domain->id,
1410                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1411                 DMA_CCMD_DEVICE_INVL, 1))
1412                 iommu_flush_write_buffer(iommu);
1413         else
1414                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1415
1416         spin_unlock_irqrestore(&iommu->lock, flags);
1417         return 0;
1418 }
1419
1420 static int
1421 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1422 {
1423         int ret;
1424         struct pci_dev *tmp, *parent;
1425
1426         ret = domain_context_mapping_one(domain, pdev->bus->number,
1427                 pdev->devfn);
1428         if (ret)
1429                 return ret;
1430
1431         /* dependent device mapping */
1432         tmp = pci_find_upstream_pcie_bridge(pdev);
1433         if (!tmp)
1434                 return 0;
1435         /* Secondary interface's bus number and devfn 0 */
1436         parent = pdev->bus->self;
1437         while (parent != tmp) {
1438                 ret = domain_context_mapping_one(domain, parent->bus->number,
1439                         parent->devfn);
1440                 if (ret)
1441                         return ret;
1442                 parent = parent->bus->self;
1443         }
1444         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1445                 return domain_context_mapping_one(domain,
1446                         tmp->subordinate->number, 0);
1447         else /* this is a legacy PCI bridge */
1448                 return domain_context_mapping_one(domain,
1449                         tmp->bus->number, tmp->devfn);
1450 }
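/*
 * For illustration: for a device behind a PCIe-to-PCI bridge, the code
 * above also installs context entries for every bridge between the device
 * and the upstream PCIe bridge, plus one for the bridge's secondary bus at
 * devfn 0, since DMA from conventional PCI devices behind the bridge is
 * tagged with the bridge's bus number rather than their own.
 */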
1451
1452 static int domain_context_mapped(struct dmar_domain *domain,
1453         struct pci_dev *pdev)
1454 {
1455         int ret;
1456         struct pci_dev *tmp, *parent;
1457         struct intel_iommu *iommu = domain_get_iommu(domain);
1458
1459         ret = device_context_mapped(iommu,
1460                 pdev->bus->number, pdev->devfn);
1461         if (!ret)
1462                 return ret;
1463         /* dependent device mapping */
1464         tmp = pci_find_upstream_pcie_bridge(pdev);
1465         if (!tmp)
1466                 return ret;
1467         /* Secondary interface's bus number and devfn 0 */
1468         parent = pdev->bus->self;
1469         while (parent != tmp) {
1470                 ret = device_context_mapped(iommu, parent->bus->number,
1471                         parent->devfn);
1472                 if (!ret)
1473                         return ret;
1474                 parent = parent->bus->self;
1475         }
1476         if (tmp->is_pcie)
1477                 return device_context_mapped(iommu,
1478                         tmp->subordinate->number, 0);
1479         else
1480                 return device_context_mapped(iommu,
1481                         tmp->bus->number, tmp->devfn);
1482 }
1483
1484 static int
1485 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1486                         u64 hpa, size_t size, int prot)
1487 {
1488         u64 start_pfn, end_pfn;
1489         struct dma_pte *pte;
1490         int index;
1491         int addr_width = agaw_to_width(domain->agaw);
1492         struct intel_iommu *iommu = domain_get_iommu(domain);
1493
1494         hpa &= (((u64)1) << addr_width) - 1;
1495
1496         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1497                 return -EINVAL;
1498         iova &= PAGE_MASK;
1499         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1500         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1501         index = 0;
1502         while (start_pfn < end_pfn) {
1503                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1504                 if (!pte)
1505                         return -ENOMEM;
1506                 /* We don't need lock here, nobody else
1507                  * touches the iova range
1508                  */
1509                 BUG_ON(dma_pte_addr(pte));
1510                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1511                 dma_set_pte_prot(pte, prot);
1512                 __iommu_flush_cache(iommu, pte, sizeof(*pte));
1513                 start_pfn++;
1514                 index++;
1515         }
1516         return 0;
1517 }
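/*
 * For illustration: mapping 8KiB at a page-aligned hpa installs two
 * last-level PTEs, one per VTD_PAGE_SIZE step of the iova range, each with
 * the requested read/write protection bits.
 */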
1518
1519 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1520 {
1521         struct intel_iommu *iommu = domain_get_iommu(domain);
1522
1523         clear_context_table(iommu, bus, devfn);
1524         iommu->flush.flush_context(iommu, 0, 0, 0,
1525                                            DMA_CCMD_GLOBAL_INVL, 0);
1526         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1527                                          DMA_TLB_GLOBAL_FLUSH, 0);
1528 }
1529
1530 static void domain_remove_dev_info(struct dmar_domain *domain)
1531 {
1532         struct device_domain_info *info;
1533         unsigned long flags;
1534
1535         spin_lock_irqsave(&device_domain_lock, flags);
1536         while (!list_empty(&domain->devices)) {
1537                 info = list_entry(domain->devices.next,
1538                         struct device_domain_info, link);
1539                 list_del(&info->link);
1540                 list_del(&info->global);
1541                 if (info->dev)
1542                         info->dev->dev.archdata.iommu = NULL;
1543                 spin_unlock_irqrestore(&device_domain_lock, flags);
1544
1545                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1546                 free_devinfo_mem(info);
1547
1548                 spin_lock_irqsave(&device_domain_lock, flags);
1549         }
1550         spin_unlock_irqrestore(&device_domain_lock, flags);
1551 }
1552
1553 /*
1554  * find_domain
1555  * Note: struct pci_dev->dev.archdata.iommu stores the domain info
1556  */
1557 static struct dmar_domain *
1558 find_domain(struct pci_dev *pdev)
1559 {
1560         struct device_domain_info *info;
1561
1562         /* No lock here, assumes no domain exit in normal case */
1563         info = pdev->dev.archdata.iommu;
1564         if (info)
1565                 return info->domain;
1566         return NULL;
1567 }
1568
1569 /* domain is initialized */
1570 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1571 {
1572         struct dmar_domain *domain, *found = NULL;
1573         struct intel_iommu *iommu;
1574         struct dmar_drhd_unit *drhd;
1575         struct device_domain_info *info, *tmp;
1576         struct pci_dev *dev_tmp;
1577         unsigned long flags;
1578         int bus = 0, devfn = 0;
1579
1580         domain = find_domain(pdev);
1581         if (domain)
1582                 return domain;
1583
1584         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1585         if (dev_tmp) {
1586                 if (dev_tmp->is_pcie) {
1587                         bus = dev_tmp->subordinate->number;
1588                         devfn = 0;
1589                 } else {
1590                         bus = dev_tmp->bus->number;
1591                         devfn = dev_tmp->devfn;
1592                 }
1593                 spin_lock_irqsave(&device_domain_lock, flags);
1594                 list_for_each_entry(info, &device_domain_list, global) {
1595                         if (info->bus == bus && info->devfn == devfn) {
1596                                 found = info->domain;
1597                                 break;
1598                         }
1599                 }
1600                 spin_unlock_irqrestore(&device_domain_lock, flags);
1601                 /* the pcie-pci bridge already has a domain, use it */
1602                 if (found) {
1603                         domain = found;
1604                         goto found_domain;
1605                 }
1606         }
1607
1608         /* Allocate new domain for the device */
1609         drhd = dmar_find_matched_drhd_unit(pdev);
1610         if (!drhd) {
1611                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1612                         pci_name(pdev));
1613                 return NULL;
1614         }
1615         iommu = drhd->iommu;
1616
1617         domain = iommu_alloc_domain(iommu);
1618         if (!domain)
1619                 goto error;
1620
1621         if (domain_init(domain, gaw)) {
1622                 domain_exit(domain);
1623                 goto error;
1624         }
1625
1626         /* register the upstream pcie-to-pci bridge */
1627         if (dev_tmp) {
1628                 info = alloc_devinfo_mem();
1629                 if (!info) {
1630                         domain_exit(domain);
1631                         goto error;
1632                 }
1633                 info->bus = bus;
1634                 info->devfn = devfn;
1635                 info->dev = NULL;
1636                 info->domain = domain;
1637                 /* This domain is shared by devices under p2p bridge */
1638                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1639
1640                 /* the pcie-to-pci bridge may already have a domain, use it */
1641                 found = NULL;
1642                 spin_lock_irqsave(&device_domain_lock, flags);
1643                 list_for_each_entry(tmp, &device_domain_list, global) {
1644                         if (tmp->bus == bus && tmp->devfn == devfn) {
1645                                 found = tmp->domain;
1646                                 break;
1647                         }
1648                 }
1649                 if (found) {
1650                         free_devinfo_mem(info);
1651                         domain_exit(domain);
1652                         domain = found;
1653                 } else {
1654                         list_add(&info->link, &domain->devices);
1655                         list_add(&info->global, &device_domain_list);
1656                 }
1657                 spin_unlock_irqrestore(&device_domain_lock, flags);
1658         }
1659
1660 found_domain:
1661         info = alloc_devinfo_mem();
1662         if (!info)
1663                 goto error;
1664         info->bus = pdev->bus->number;
1665         info->devfn = pdev->devfn;
1666         info->dev = pdev;
1667         info->domain = domain;
1668         spin_lock_irqsave(&device_domain_lock, flags);
1669         /* somebody else may have attached this device already, recheck */
1670         found = find_domain(pdev);
1671         if (found != NULL) {
1672                 spin_unlock_irqrestore(&device_domain_lock, flags);
1673                 if (found != domain) {
1674                         domain_exit(domain);
1675                         domain = found;
1676                 }
1677                 free_devinfo_mem(info);
1678                 return domain;
1679         }
1680         list_add(&info->link, &domain->devices);
1681         list_add(&info->global, &device_domain_list);
1682         pdev->dev.archdata.iommu = info;
1683         spin_unlock_irqrestore(&device_domain_lock, flags);
1684         return domain;
1685 error:
1686         /* recheck here, another thread may have set up the domain */
1687         return find_domain(pdev);
1688 }
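
/*
 * Note on the locking in get_domain_for_dev(): the device_domain_info is
 * allocated and filled in outside of device_domain_lock, and find_domain()
 * is re-checked once the lock is held.  If another thread attached the
 * device in the meantime, the freshly allocated info is freed, the new
 * domain is torn down if it is not the winning one, and the winner's
 * domain is returned; pdev->dev.archdata.iommu is therefore only ever set
 * once.
 */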
1689
1690 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1691                                       unsigned long long start,
1692                                       unsigned long long end)
1693 {
1694         struct dmar_domain *domain;
1695         unsigned long size;
1696         unsigned long long base;
1697         int ret;
1698
1699         printk(KERN_INFO
1700                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1701                 pci_name(pdev), start, end);
1702         /* page table init */
1703         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1704         if (!domain)
1705                 return -ENOMEM;
1706
1707         /* The address might not be aligned */
1708         base = start & PAGE_MASK;
1709         size = end - base;
1710         size = PAGE_ALIGN(size);
1711         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1712                         IOVA_PFN(base + size) - 1)) {
1713                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1714                 ret = -ENOMEM;
1715                 goto error;
1716         }
1717
1718         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1719                 size, base, pci_name(pdev));
1720         /*
1721          * RMRR range might have overlap with physical memory range,
1722          * clear it first
1723          */
1724         dma_pte_clear_range(domain, base, base + size);
1725
1726         ret = domain_page_mapping(domain, base, base, size,
1727                 DMA_PTE_READ|DMA_PTE_WRITE);
1728         if (ret)
1729                 goto error;
1730
1731         /* context entry init */
1732         ret = domain_context_mapping(domain, pdev);
1733         if (!ret)
1734                 return 0;
1735 error:
1736         domain_exit(domain);
1737         return ret;
1739 }
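
/*
 * All the unity-map consumers below (RMRR regions and the graphics and
 * ISA/floppy workarounds) go through iommu_prepare_identity_map(): the
 * IOVA range is reserved so the allocator never hands it out for DMA API
 * use, stale PTEs are cleared, and the range is then mapped 1:1
 * (iova == hpa) with read and write permission.
 */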
1740
1741 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1742         struct pci_dev *pdev)
1743 {
1744         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1745                 return 0;
1746         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1747                 rmrr->end_address + 1);
1748 }
1749
1750 #ifdef CONFIG_DMAR_GFX_WA
1751 struct iommu_prepare_data {
1752         struct pci_dev *pdev;
1753         int ret;
1754 };
1755
1756 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1757                                          unsigned long end_pfn, void *datax)
1758 {
1759         struct iommu_prepare_data *data;
1760
1761         data = (struct iommu_prepare_data *)datax;
1762
1763         data->ret = iommu_prepare_identity_map(data->pdev,
1764                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1765         return data->ret;
1767 }
1768
1769 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1770 {
1771         int nid;
1772         struct iommu_prepare_data data;
1773
1774         data.pdev = pdev;
1775         data.ret = 0;
1776
1777         for_each_online_node(nid) {
1778                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1779                 if (data.ret)
1780                         return data.ret;
1781         }
1782         return data.ret;
1783 }
1784
1785 static void __init iommu_prepare_gfx_mapping(void)
1786 {
1787         struct pci_dev *pdev = NULL;
1788         int ret;
1789
1790         for_each_pci_dev(pdev) {
1791                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1792                                 !IS_GFX_DEVICE(pdev))
1793                         continue;
1794                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1795                         pci_name(pdev));
1796                 ret = iommu_prepare_with_active_regions(pdev);
1797                 if (ret)
1798                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1799         }
1800 }
1801 #else /* !CONFIG_DMAR_GFX_WA */
1802 static inline void iommu_prepare_gfx_mapping(void)
1803 {
1804         return;
1805 }
1806 #endif
1807
1808 #ifdef CONFIG_DMAR_FLOPPY_WA
1809 static inline void iommu_prepare_isa(void)
1810 {
1811         struct pci_dev *pdev;
1812         int ret;
1813
1814         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1815         if (!pdev)
1816                 return;
1817
1818         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1819         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1820
1821         if (ret)
1822                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1823                         "floppy might not work\n");
1825 }
1826 #else
1827 static inline void iommu_prepare_isa(void)
1828 {
1829         return;
1830 }
1831 #endif /* CONFIG_DMAR_FLOPPY_WA */
1832
1833 static int __init init_dmars(void)
1834 {
1835         struct dmar_drhd_unit *drhd;
1836         struct dmar_rmrr_unit *rmrr;
1837         struct pci_dev *pdev;
1838         struct intel_iommu *iommu;
1839         int i, ret, unit = 0;
1840
1841         /*
1842          * for each drhd
1843          *    allocate root
1844          *    initialize and program root entry to not present
1845          * endfor
1846          */
1847         for_each_drhd_unit(drhd) {
1848                 g_num_of_iommus++;
1849                 /*
1850                  * No lock needed: this is only incremented in the
1851                  * single-threaded kernel __init code path; all other
1852                  * accesses are read-only
1853                  */
1854         }
1855
1856         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1857                         GFP_KERNEL);
1858         if (!g_iommus) {
1859                 printk(KERN_ERR "Allocating global iommu array failed\n");
1860                 ret = -ENOMEM;
1861                 goto error;
1862         }
1863
1864         deferred_flush = kzalloc(g_num_of_iommus *
1865                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1866         if (!deferred_flush) {
1867                 kfree(g_iommus);
1868                 ret = -ENOMEM;
1869                 goto error;
1870         }
1871
1872         for_each_drhd_unit(drhd) {
1873                 if (drhd->ignored)
1874                         continue;
1875
1876                 iommu = drhd->iommu;
1877                 g_iommus[iommu->seq_id] = iommu;
1878
1879                 ret = iommu_init_domains(iommu);
1880                 if (ret)
1881                         goto error;
1882
1883                 /*
1884                  * TBD:
1885                  * we could share the same root & context tables
1886                  * among all IOMMUs; need to split it later.
1887                  */
1888                 ret = iommu_alloc_root_entry(iommu);
1889                 if (ret) {
1890                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1891                         goto error;
1892                 }
1893         }
1894
1895         for_each_drhd_unit(drhd) {
1896                 if (drhd->ignored)
1897                         continue;
1898
1899                 iommu = drhd->iommu;
1900                 if (dmar_enable_qi(iommu)) {
1901                         /*
1902                          * Queued Invalidation could not be enabled, fall
1903                          * back to Register-based Invalidation
1904                          */
1905                         iommu->flush.flush_context = __iommu_flush_context;
1906                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1907                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1908                                "invalidation\n",
1909                                (unsigned long long)drhd->reg_base_addr);
1910                 } else {
1911                         iommu->flush.flush_context = qi_flush_context;
1912                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1913                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1914                                "invalidation\n",
1915                                (unsigned long long)drhd->reg_base_addr);
1916                 }
1917         }
1918
1919         /*
1920          * For each rmrr
1921          *   for each dev attached to rmrr
1922          *   do
1923          *     locate drhd for dev, alloc domain for dev
1924          *     allocate free domain
1925          *     allocate page table entries for rmrr
1926          *     if context not allocated for bus
1927          *           allocate and init context
1928          *           set present in root table for this bus
1929          *     init context with domain, translation etc
1930          *    endfor
1931          * endfor
1932          */
1933         for_each_rmrr_units(rmrr) {
1934                 for (i = 0; i < rmrr->devices_cnt; i++) {
1935                         pdev = rmrr->devices[i];
1936                         /* some BIOSes list non-existent devices in the DMAR table */
1937                         if (!pdev)
1938                                 continue;
1939                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1940                         if (ret)
1941                                 printk(KERN_ERR
1942                                  "IOMMU: mapping reserved region failed\n");
1943                 }
1944         }
1945
1946         iommu_prepare_gfx_mapping();
1947
1948         iommu_prepare_isa();
1949
1950         /*
1951          * for each drhd
1952          *   enable fault log
1953          *   global invalidate context cache
1954          *   global invalidate iotlb
1955          *   enable translation
1956          */
1957         for_each_drhd_unit(drhd) {
1958                 if (drhd->ignored)
1959                         continue;
1960                 iommu = drhd->iommu;
1961                 sprintf(iommu->name, "dmar%d", unit++);
1962
1963                 iommu_flush_write_buffer(iommu);
1964
1965                 ret = dmar_set_interrupt(iommu);
1966                 if (ret)
1967                         goto error;
1968
1969                 iommu_set_root_entry(iommu);
1970
1971                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1972                                            0);
1973                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1974                                          0);
1975                 iommu_disable_protect_mem_regions(iommu);
1976
1977                 ret = iommu_enable_translation(iommu);
1978                 if (ret)
1979                         goto error;
1980         }
1981
1982         return 0;
1983 error:
1984         for_each_drhd_unit(drhd) {
1985                 if (drhd->ignored)
1986                         continue;
1987                 iommu = drhd->iommu;
1988                 free_iommu(iommu);
1989         }
1990         kfree(g_iommus);
1991         return ret;
1992 }
1993
1994 static inline u64 aligned_size(u64 host_addr, size_t size)
1995 {
1996         u64 addr;
1997         addr = (host_addr & (~PAGE_MASK)) + size;
1998         return PAGE_ALIGN(addr);
1999 }
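
/*
 * aligned_size() returns how many bytes of IOVA space are needed to cover
 * [host_addr, host_addr + size) at page granularity.  For example, with
 * 4KiB pages: aligned_size(0x1234, 0x10) = PAGE_ALIGN(0x234 + 0x10) =
 * 0x1000 (one page), while aligned_size(0xff0, 0x20) = PAGE_ALIGN(0x1010)
 * = 0x2000, because that buffer straddles a page boundary.
 */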
2000
2001 struct iova *
2002 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2003 {
2004         struct iova *piova;
2005
2006         /* Make sure it's in range */
2007         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2008         if (!size || (IOVA_START_ADDR + size > end))
2009                 return NULL;
2010
2011         piova = alloc_iova(&domain->iovad,
2012                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2013         return piova;
2014 }
2015
2016 static struct iova *
2017 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2018                    size_t size, u64 dma_mask)
2019 {
2020         struct pci_dev *pdev = to_pci_dev(dev);
2021         struct iova *iova = NULL;
2022
2023         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2024                 iova = iommu_alloc_iova(domain, size, dma_mask);
2025         else {
2026                 /*
2027                  * First try to allocate an I/O virtual address within
2028                  * DMA_32BIT_MASK; if that fails, fall back to allocating
2029                  * from the device's full dma_mask range
2030                  */
2031                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2032                 if (!iova)
2033                         iova = iommu_alloc_iova(domain, size, dma_mask);
2034         }
2035
2036         if (!iova) {
2037                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2038                 return NULL;
2039         }
2040
2041         return iova;
2042 }
2043
2044 static struct dmar_domain *
2045 get_valid_domain_for_dev(struct pci_dev *pdev)
2046 {
2047         struct dmar_domain *domain;
2048         int ret;
2049
2050         domain = get_domain_for_dev(pdev,
2051                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2052         if (!domain) {
2053                 printk(KERN_ERR
2054                         "Allocating domain for %s failed\n", pci_name(pdev));
2055                 return NULL;
2056         }
2057
2058         /* make sure context mapping is ok */
2059         if (unlikely(!domain_context_mapped(domain, pdev))) {
2060                 ret = domain_context_mapping(domain, pdev);
2061                 if (ret) {
2062                         printk(KERN_ERR
2063                                 "Domain context mapping for %s failed\n",
2064                                 pci_name(pdev));
2065                         return NULL;
2066                 }
2067         }
2068
2069         return domain;
2070 }
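
/*
 * get_valid_domain_for_dev() is the lazy set-up path taken on a device's
 * first DMA mapping: it finds or creates the dmar_domain for the device
 * and makes sure a context entry pointing at that domain's page table has
 * been installed, so the map paths below only have to populate PTEs.
 */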
2071
2072 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2073                                      size_t size, int dir, u64 dma_mask)
2074 {
2075         struct pci_dev *pdev = to_pci_dev(hwdev);
2076         struct dmar_domain *domain;
2077         phys_addr_t start_paddr;
2078         struct iova *iova;
2079         int prot = 0;
2080         int ret;
2081         struct intel_iommu *iommu;
2082
2083         BUG_ON(dir == DMA_NONE);
2084         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2085                 return paddr;
2086
2087         domain = get_valid_domain_for_dev(pdev);
2088         if (!domain)
2089                 return 0;
2090
2091         iommu = domain_get_iommu(domain);
2092         size = aligned_size((u64)paddr, size);
2093
2094         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2095         if (!iova)
2096                 goto error;
2097
2098         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2099
2100         /*
2101          * Check whether the DMAR hardware supports zero-length reads on
2102          * write-only mappings.
2103          */
2104         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2105                         !cap_zlr(iommu->cap))
2106                 prot |= DMA_PTE_READ;
2107         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2108                 prot |= DMA_PTE_WRITE;
2109         /*
2110          * [paddr, paddr + size) might cover only part of a page, but we
2111          * map the whole page anyway.  Note: if two parts of one page are
2112          * mapped separately, we can end up with two guest addresses for
2113          * the same host paddr, but this is not a big problem
2114          */
2115         ret = domain_page_mapping(domain, start_paddr,
2116                 ((u64)paddr) & PAGE_MASK, size, prot);
2117         if (ret)
2118                 goto error;
2119
2120         /* it's a non-present to present mapping */
2121         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2122                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2123         if (ret)
2124                 iommu_flush_write_buffer(iommu);
2125
2126         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2127
2128 error:
2129         if (iova)
2130                 __free_iova(&domain->iovad, iova);
2131         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2132                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2133         return 0;
2134 }
2135
2136 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2137                             size_t size, int dir)
2138 {
2139         return __intel_map_single(hwdev, paddr, size, dir,
2140                                   to_pci_dev(hwdev)->dma_mask);
2141 }
2142
2143 static void flush_unmaps(void)
2144 {
2145         int i, j;
2146
2147         timer_on = 0;
2148
2149         /* just flush them all */
2150         for (i = 0; i < g_num_of_iommus; i++) {
2151                 struct intel_iommu *iommu = g_iommus[i];
2152                 if (!iommu)
2153                         continue;
2154
2155                 if (deferred_flush[i].next) {
2156                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2157                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2158                         for (j = 0; j < deferred_flush[i].next; j++) {
2159                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2160                                                 deferred_flush[i].iova[j]);
2161                         }
2162                         deferred_flush[i].next = 0;
2163                 }
2164         }
2165
2166         list_size = 0;
2167 }
2168
2169 static void flush_unmaps_timeout(unsigned long data)
2170 {
2171         unsigned long flags;
2172
2173         spin_lock_irqsave(&async_umap_flush_lock, flags);
2174         flush_unmaps();
2175         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2176 }
2177
2178 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2179 {
2180         unsigned long flags;
2181         int next, iommu_id;
2182         struct intel_iommu *iommu;
2183
2184         spin_lock_irqsave(&async_umap_flush_lock, flags);
2185         if (list_size == HIGH_WATER_MARK)
2186                 flush_unmaps();
2187
2188         iommu = domain_get_iommu(dom);
2189         iommu_id = iommu->seq_id;
2190
2191         next = deferred_flush[iommu_id].next;
2192         deferred_flush[iommu_id].domain[next] = dom;
2193         deferred_flush[iommu_id].iova[next] = iova;
2194         deferred_flush[iommu_id].next++;
2195
2196         if (!timer_on) {
2197                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2198                 timer_on = 1;
2199         }
2200         list_size++;
2201         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2202 }
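
/*
 * Deferred unmap bookkeeping: rather than flushing the IOTLB for every
 * unmap, add_unmap() parks the IOVA in the per-IOMMU deferred_flush[]
 * slot (indexed by iommu->seq_id).  flush_unmaps() drains the batch once
 * HIGH_WATER_MARK entries have piled up or when the 10ms unmap_timer
 * fires, doing one global IOTLB flush per IOMMU and only then returning
 * the queued IOVAs to the allocator.
 */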
2203
2204 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2205                         int dir)
2206 {
2207         struct pci_dev *pdev = to_pci_dev(dev);
2208         struct dmar_domain *domain;
2209         unsigned long start_addr;
2210         struct iova *iova;
2211         struct intel_iommu *iommu;
2212
2213         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2214                 return;
2215         domain = find_domain(pdev);
2216         BUG_ON(!domain);
2217
2218         iommu = domain_get_iommu(domain);
2219
2220         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2221         if (!iova)
2222                 return;
2223
2224         start_addr = iova->pfn_lo << PAGE_SHIFT;
2225         size = aligned_size((u64)dev_addr, size);
2226
2227         pr_debug("Device %s unmapping: %lx@%llx\n",
2228                 pci_name(pdev), size, (unsigned long long)start_addr);
2229
2230         /*  clear the whole page */
2231         dma_pte_clear_range(domain, start_addr, start_addr + size);
2232         /* free page tables */
2233         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2234         if (intel_iommu_strict) {
2235                 if (iommu_flush_iotlb_psi(iommu,
2236                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2237                         iommu_flush_write_buffer(iommu);
2238                 /* free iova */
2239                 __free_iova(&domain->iovad, iova);
2240         } else {
2241                 add_unmap(domain, iova);
2242                 /*
2243                  * queue up the IOVA release instead of flushing now; batching
2244                  * saves the ~1/6th of CPU time the per-unmap IOTLB flush costs
2245                  */
2246         }
2247 }
2248
2249 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2250                            dma_addr_t *dma_handle, gfp_t flags)
2251 {
2252         void *vaddr;
2253         int order;
2254
2255         size = PAGE_ALIGN(size);
2256         order = get_order(size);
2257         flags &= ~(GFP_DMA | GFP_DMA32);
2258
2259         vaddr = (void *)__get_free_pages(flags, order);
2260         if (!vaddr)
2261                 return NULL;
2262         memset(vaddr, 0, size);
2263
2264         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2265                                          DMA_BIDIRECTIONAL,
2266                                          hwdev->coherent_dma_mask);
2267         if (*dma_handle)
2268                 return vaddr;
2269         free_pages((unsigned long)vaddr, order);
2270         return NULL;
2271 }
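
/*
 * Coherent allocations are ordinary page allocations pushed through the
 * same __intel_map_single() path as streaming DMA, using
 * DMA_BIDIRECTIONAL and the device's coherent_dma_mask.  GFP_DMA and
 * GFP_DMA32 are stripped because, with remapping active, it is the IOMMU
 * mapping rather than the buffer's physical placement that satisfies the
 * device's addressing limit.
 */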
2272
2273 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2274                          dma_addr_t dma_handle)
2275 {
2276         int order;
2277
2278         size = PAGE_ALIGN(size);
2279         order = get_order(size);
2280
2281         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2282         free_pages((unsigned long)vaddr, order);
2283 }
2284
2285 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2286
2287 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2288                     int nelems, int dir)
2289 {
2290         int i;
2291         struct pci_dev *pdev = to_pci_dev(hwdev);
2292         struct dmar_domain *domain;
2293         unsigned long start_addr;
2294         struct iova *iova;
2295         size_t size = 0;
2296         void *addr;
2297         struct scatterlist *sg;
2298         struct intel_iommu *iommu;
2299
2300         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2301                 return;
2302
2303         domain = find_domain(pdev);
2304         BUG_ON(!domain);
2305
2306         iommu = domain_get_iommu(domain);
2307
2308         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2309         if (!iova)
2310                 return;
2311         for_each_sg(sglist, sg, nelems, i) {
2312                 addr = SG_ENT_VIRT_ADDRESS(sg);
2313                 size += aligned_size((u64)addr, sg->length);
2314         }
2315
2316         start_addr = iova->pfn_lo << PAGE_SHIFT;
2317
2318         /*  clear the whole page */
2319         dma_pte_clear_range(domain, start_addr, start_addr + size);
2320         /* free page tables */
2321         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2322
2323         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2324                         size >> VTD_PAGE_SHIFT, 0))
2325                 iommu_flush_write_buffer(iommu);
2326
2327         /* free iova */
2328         __free_iova(&domain->iovad, iova);
2329 }
2330
2331 static int intel_nontranslate_map_sg(struct device *hddev,
2332         struct scatterlist *sglist, int nelems, int dir)
2333 {
2334         int i;
2335         struct scatterlist *sg;
2336
2337         for_each_sg(sglist, sg, nelems, i) {
2338                 BUG_ON(!sg_page(sg));
2339                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2340                 sg->dma_length = sg->length;
2341         }
2342         return nelems;
2343 }
2344
2345 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2346                  int dir)
2347 {
2348         void *addr;
2349         int i;
2350         struct pci_dev *pdev = to_pci_dev(hwdev);
2351         struct dmar_domain *domain;
2352         size_t size = 0;
2353         int prot = 0;
2354         size_t offset = 0;
2355         struct iova *iova = NULL;
2356         int ret;
2357         struct scatterlist *sg;
2358         unsigned long start_addr;
2359         struct intel_iommu *iommu;
2360
2361         BUG_ON(dir == DMA_NONE);
2362         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2363                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2364
2365         domain = get_valid_domain_for_dev(pdev);
2366         if (!domain)
2367                 return 0;
2368
2369         iommu = domain_get_iommu(domain);
2370
2371         for_each_sg(sglist, sg, nelems, i) {
2372                 addr = SG_ENT_VIRT_ADDRESS(sg);
2373                 addr = (void *)virt_to_phys(addr);
2374                 size += aligned_size((u64)addr, sg->length);
2375         }
2376
2377         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2378         if (!iova) {
2379                 sglist->dma_length = 0;
2380                 return 0;
2381         }
2382
2383         /*
2384          * Check whether the DMAR hardware supports zero-length reads on
2385          * write-only mappings.
2386          */
2387         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2388                         !cap_zlr(iommu->cap))
2389                 prot |= DMA_PTE_READ;
2390         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2391                 prot |= DMA_PTE_WRITE;
2392
2393         start_addr = iova->pfn_lo << PAGE_SHIFT;
2394         offset = 0;
2395         for_each_sg(sglist, sg, nelems, i) {
2396                 addr = SG_ENT_VIRT_ADDRESS(sg);
2397                 addr = (void *)virt_to_phys(addr);
2398                 size = aligned_size((u64)addr, sg->length);
2399                 ret = domain_page_mapping(domain, start_addr + offset,
2400                         ((u64)addr) & PAGE_MASK,
2401                         size, prot);
2402                 if (ret) {
2403                         /*  clear the page */
2404                         dma_pte_clear_range(domain, start_addr,
2405                                   start_addr + offset);
2406                         /* free page tables */
2407                         dma_pte_free_pagetable(domain, start_addr,
2408                                   start_addr + offset);
2409                         /* free iova */
2410                         __free_iova(&domain->iovad, iova);
2411                         return 0;
2412                 }
2413                 sg->dma_address = start_addr + offset +
2414                                 ((u64)addr & (~PAGE_MASK));
2415                 sg->dma_length = sg->length;
2416                 offset += size;
2417         }
2418
2419         /* it's a non-present to present mapping */
2420         if (iommu_flush_iotlb_psi(iommu, domain->id,
2421                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2422                 iommu_flush_write_buffer(iommu);
2423         return nelems;
2424 }
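
/*
 * Layout produced by intel_map_sg(), as a small worked example (4KiB
 * pages, two segments of 0x100 bytes that do not cross page boundaries):
 * the first pass charges one page per segment, so a single two-page IOVA
 * range is allocated; the first segment's dma_address becomes start_addr
 * plus its offset within its page, the second becomes start_addr + 0x1000
 * plus its own page offset, and a single IOTLB flush (or write-buffer
 * flush fallback) then publishes the whole range.
 */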
2425
2426 static struct dma_mapping_ops intel_dma_ops = {
2427         .alloc_coherent = intel_alloc_coherent,
2428         .free_coherent = intel_free_coherent,
2429         .map_single = intel_map_single,
2430         .unmap_single = intel_unmap_single,
2431         .map_sg = intel_map_sg,
2432         .unmap_sg = intel_unmap_sg,
2433 };
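
/*
 * Once intel_iommu_init() below sets dma_ops = &intel_dma_ops, ordinary
 * drivers get DMA remapping transparently through the generic DMA API.  A
 * hypothetical driver fragment (illustrative only; "pdev", "buf" and "len"
 * are placeholders, not names used in this file) would look like:
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	if (!handle)
 *		return -EIO;
 *	... program the device with 'handle' and let it DMA ...
 *	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 *
 * dma_map_single() lands in intel_map_single(), which allocates an IOVA,
 * writes the PTEs and returns the bus address (0 on failure, hence the
 * !handle check); dma_unmap_single() lands in intel_unmap_single(), which
 * tears the mapping down, possibly lazily via the deferred-flush path.
 */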
2434
2435 static inline int iommu_domain_cache_init(void)
2436 {
2437         int ret = 0;
2438
2439         iommu_domain_cache = kmem_cache_create("iommu_domain",
2440                                          sizeof(struct dmar_domain),
2441                                          0,
2442                                          SLAB_HWCACHE_ALIGN,
2443                                          NULL);
2445         if (!iommu_domain_cache) {
2446                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2447                 ret = -ENOMEM;
2448         }
2449
2450         return ret;
2451 }
2452
2453 static inline int iommu_devinfo_cache_init(void)
2454 {
2455         int ret = 0;
2456
2457         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2458                                          sizeof(struct device_domain_info),
2459                                          0,
2460                                          SLAB_HWCACHE_ALIGN,
2461                                          NULL);
2462         if (!iommu_devinfo_cache) {
2463                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2464                 ret = -ENOMEM;
2465         }
2466
2467         return ret;
2468 }
2469
2470 static inline int iommu_iova_cache_init(void)
2471 {
2472         int ret = 0;
2473
2474         iommu_iova_cache = kmem_cache_create("iommu_iova",
2475                                          sizeof(struct iova),
2476                                          0,
2477                                          SLAB_HWCACHE_ALIGN,
2478                                          NULL);
2479         if (!iommu_iova_cache) {
2480                 printk(KERN_ERR "Couldn't create iova cache\n");
2481                 ret = -ENOMEM;
2482         }
2483
2484         return ret;
2485 }
2486
2487 static int __init iommu_init_mempool(void)
2488 {
2489         int ret;
2490         ret = iommu_iova_cache_init();
2491         if (ret)
2492                 return ret;
2493
2494         ret = iommu_domain_cache_init();
2495         if (ret)
2496                 goto domain_error;
2497
2498         ret = iommu_devinfo_cache_init();
2499         if (!ret)
2500                 return ret;
2501
2502         kmem_cache_destroy(iommu_domain_cache);
2503 domain_error:
2504         kmem_cache_destroy(iommu_iova_cache);
2505
2506         return -ENOMEM;
2507 }
2508
2509 static void __init iommu_exit_mempool(void)
2510 {
2511         kmem_cache_destroy(iommu_devinfo_cache);
2512         kmem_cache_destroy(iommu_domain_cache);
2513         kmem_cache_destroy(iommu_iova_cache);
2515 }
2516
2517 static void __init init_no_remapping_devices(void)
2518 {
2519         struct dmar_drhd_unit *drhd;
2520
2521         for_each_drhd_unit(drhd) {
2522                 if (!drhd->include_all) {
2523                         int i;
2524                         for (i = 0; i < drhd->devices_cnt; i++)
2525                                 if (drhd->devices[i] != NULL)
2526                                         break;
2527                         /* ignore DMAR unit if no pci devices exist */
2528                         if (i == drhd->devices_cnt)
2529                                 drhd->ignored = 1;
2530                 }
2531         }
2532
2533         if (dmar_map_gfx)
2534                 return;
2535
2536         for_each_drhd_unit(drhd) {
2537                 int i;
2538                 if (drhd->ignored || drhd->include_all)
2539                         continue;
2540
2541                 for (i = 0; i < drhd->devices_cnt; i++)
2542                         if (drhd->devices[i] &&
2543                                 !IS_GFX_DEVICE(drhd->devices[i]))
2544                                 break;
2545
2546                 if (i < drhd->devices_cnt)
2547                         continue;
2548
2549                 /* bypass IOMMU if it is just for gfx devices */
2550                 drhd->ignored = 1;
2551                 for (i = 0; i < drhd->devices_cnt; i++) {
2552                         if (!drhd->devices[i])
2553                                 continue;
2554                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2555                 }
2556         }
2557 }
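
/*
 * DRHD units whose device scope contains only graphics devices are marked
 * ignored when dmar_map_gfx is clear; their devices get
 * DUMMY_DEVICE_DOMAIN_INFO in dev.archdata.iommu, which the map/unmap
 * paths above treat as "pass through untranslated" (see the early returns
 * in __intel_map_single(), intel_unmap_single() and intel_map_sg()).
 */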
2558
2559 int __init intel_iommu_init(void)
2560 {
2561         int ret = 0;
2562
2563         if (dmar_table_init())
2564                 return  -ENODEV;
2565
2566         if (dmar_dev_scope_init())
2567                 return  -ENODEV;
2568
2569         /*
2570          * Check whether DMA-remapping needs to be initialized now.  The
2571          * table/scope initialization above is also used by interrupt remapping.
2572          */
2573         if (no_iommu || swiotlb || dmar_disabled)
2574                 return -ENODEV;
2575
2576         iommu_init_mempool();
2577         dmar_init_reserved_ranges();
2578
2579         init_no_remapping_devices();
2580
2581         ret = init_dmars();
2582         if (ret) {
2583                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2584                 put_iova_domain(&reserved_iova_list);
2585                 iommu_exit_mempool();
2586                 return ret;
2587         }
2588         printk(KERN_INFO
2589         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2590
2591         init_timer(&unmap_timer);
2592         force_iommu = 1;
2593         dma_ops = &intel_dma_ops;
2594         return 0;
2595 }
2596
2597 void intel_iommu_domain_exit(struct dmar_domain *domain)
2598 {
2599         u64 end;
2600
2601         /* Domain 0 is reserved, so don't process it */
2602         if (!domain)
2603                 return;
2604
2605         end = DOMAIN_MAX_ADDR(domain->gaw);
2606         end = end & (~VTD_PAGE_MASK);
2607
2608         /* clear ptes */
2609         dma_pte_clear_range(domain, 0, end);
2610
2611         /* free page tables */
2612         dma_pte_free_pagetable(domain, 0, end);
2613
2614         iommu_free_domain(domain);
2615         free_domain_mem(domain);
2616 }
2617 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2618
2619 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2620 {
2621         struct dmar_drhd_unit *drhd;
2622         struct dmar_domain *domain;
2623         struct intel_iommu *iommu;
2624
2625         drhd = dmar_find_matched_drhd_unit(pdev);
2626         if (!drhd) {
2627                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2628                 return NULL;
2629         }
2630
2631         iommu = drhd->iommu;
2632         if (!iommu) {
2633                 printk(KERN_ERR
2634                         "intel_iommu_domain_alloc: iommu == NULL\n");
2635                 return NULL;
2636         }
2637         domain = iommu_alloc_domain(iommu);
2638         if (!domain) {
2639                 printk(KERN_ERR
2640                         "intel_iommu_domain_alloc: domain == NULL\n");
2641                 return NULL;
2642         }
2643         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2644                 printk(KERN_ERR
2645                         "intel_iommu_domain_alloc: domain_init() failed\n");
2646                 intel_iommu_domain_exit(domain);
2647                 return NULL;
2648         }
2649         return domain;
2650 }
2651 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2652
2653 int intel_iommu_context_mapping(
2654         struct dmar_domain *domain, struct pci_dev *pdev)
2655 {
2656         int rc;
2657         rc = domain_context_mapping(domain, pdev);
2658         return rc;
2659 }
2660 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2661
2662 int intel_iommu_page_mapping(
2663         struct dmar_domain *domain, dma_addr_t iova,
2664         u64 hpa, size_t size, int prot)
2665 {
2666         int rc;
2667         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2668         return rc;
2669 }
2670 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2671
2672 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2673 {
2674         detach_domain_for_dev(domain, bus, devfn);
2675 }
2676 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2677
2678 struct dmar_domain *
2679 intel_iommu_find_domain(struct pci_dev *pdev)
2680 {
2681         return find_domain(pdev);
2682 }
2683 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2684
2685 int intel_iommu_found(void)
2686 {
2687         return g_num_of_iommus;
2688 }
2689 EXPORT_SYMBOL_GPL(intel_iommu_found);
2690
2691 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2692 {
2693         struct dma_pte *pte;
2694         u64 pfn;
2695
2696         pfn = 0;
2697         pte = addr_to_dma_pte(domain, iova);
2698
2699         if (pte)
2700                 pfn = dma_pte_addr(pte);
2701
2702         return pfn >> VTD_PAGE_SHIFT;
2703 }
2704 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
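
/*
 * The intel_iommu_* exports above form a small interface for code that
 * wants to manage a remapping domain directly (device assignment being the
 * obvious user).  A hypothetical caller might combine them as sketched
 * below; "assigned_pdev", "guest_iova" and "host_addr" are placeholder
 * names, not part of this file:
 *
 *	struct dmar_domain *dom;
 *	u64 pfn;
 *
 *	dom = intel_iommu_domain_alloc(assigned_pdev);
 *	if (!dom)
 *		return -ENOMEM;
 *	if (intel_iommu_context_mapping(dom, assigned_pdev))
 *		goto fail;
 *	if (intel_iommu_page_mapping(dom, guest_iova, host_addr,
 *				     VTD_PAGE_SIZE,
 *				     DMA_PTE_READ | DMA_PTE_WRITE))
 *		goto fail;
 *	pfn = intel_iommu_iova_to_pfn(dom, guest_iova);
 *	...
 * fail:
 *	intel_iommu_domain_exit(dom);
 *
 * intel_iommu_detach_dev() undoes the context mapping for a single
 * bus/devfn, and intel_iommu_found() reports whether any DMAR unit was
 * set up at all.
 */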