[karo-tx-linux.git] drivers/iommu/intel-iommu.c
intel-iommu: Free old page tables before creating superpage
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #define ROOT_SIZE               VTD_PAGE_SIZE
48 #define CONTEXT_SIZE            VTD_PAGE_SIZE
49
50 #define IS_BRIDGE_HOST_DEVICE(pdev) \
51                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 static inline int agaw_to_level(int agaw)
82 {
83         return agaw + 2;
84 }
85
86 static inline int agaw_to_width(int agaw)
87 {
88         return 30 + agaw * LEVEL_STRIDE;
89 }
90
91 static inline int width_to_agaw(int width)
92 {
93         return (width - 30) / LEVEL_STRIDE;
94 }
95
96 static inline unsigned int level_to_offset_bits(int level)
97 {
98         return (level - 1) * LEVEL_STRIDE;
99 }
100
101 static inline int pfn_level_offset(unsigned long pfn, int level)
102 {
103         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
104 }
105
106 static inline unsigned long level_mask(int level)
107 {
108         return -1UL << level_to_offset_bits(level);
109 }
110
111 static inline unsigned long level_size(int level)
112 {
113         return 1UL << level_to_offset_bits(level);
114 }
115
116 static inline unsigned long align_to_level(unsigned long pfn, int level)
117 {
118         return (pfn + level_size(level) - 1) & level_mask(level);
119 }
120
121 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
122 {
123         return  1 << ((lvl - 1) * LEVEL_STRIDE);
124 }
125
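/*
 * Worked example of the helpers above: agaw 1 corresponds to a 39-bit
 * address width and a 3-level page table, agaw 2 to 48 bits and 4 levels.
 * At level N a single PTE covers 2^((N-1)*9) 4KiB pages, so a level-2 PTE
 * spans 2MiB and a level-3 PTE spans 1GiB -- the superpage sizes this
 * driver can use when the hardware advertises them.
 */
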
126 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
127    are never going to work. */
128 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
129 {
130         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
131 }
132
133 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
134 {
135         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
136 }
137 static inline unsigned long page_to_dma_pfn(struct page *pg)
138 {
139         return mm_to_dma_pfn(page_to_pfn(pg));
140 }
141 static inline unsigned long virt_to_dma_pfn(void *p)
142 {
143         return page_to_dma_pfn(virt_to_page(p));
144 }
145
146 /* global iommu list, set NULL for ignored DMAR units */
147 static struct intel_iommu **g_iommus;
148
149 static void __init check_tylersburg_isoch(void);
150 static int rwbf_quirk;
151
152 /*
153  * set to 1 to panic the kernel if VT-d can't be enabled successfully
154  * (used when kernel is launched w/ TXT)
155  */
156 static int force_on = 0;
157
158 /*
159  * 0: Present
160  * 1-11: Reserved
161  * 12-63: Context Ptr (12 - (haw-1))
162  * 64-127: Reserved
163  */
164 struct root_entry {
165         u64     val;
166         u64     rsvd1;
167 };
168 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
169 static inline bool root_present(struct root_entry *root)
170 {
171         return (root->val & 1);
172 }
173 static inline void set_root_present(struct root_entry *root)
174 {
175         root->val |= 1;
176 }
177 static inline void set_root_value(struct root_entry *root, unsigned long value)
178 {
179         root->val |= value & VTD_PAGE_MASK;
180 }
181
182 static inline struct context_entry *
183 get_context_addr_from_root(struct root_entry *root)
184 {
185         return (struct context_entry *)
186                 (root_present(root)?phys_to_virt(
187                 root->val & VTD_PAGE_MASK) :
188                 NULL);
189 }
190
191 /*
192  * low 64 bits:
193  * 0: present
194  * 1: fault processing disable
195  * 2-3: translation type
196  * 12-63: address space root
197  * high 64 bits:
198  * 0-2: address width
199  * 3-6: avail
200  * 8-23: domain id
201  */
202 struct context_entry {
203         u64 lo;
204         u64 hi;
205 };
206
207 static inline bool context_present(struct context_entry *context)
208 {
209         return (context->lo & 1);
210 }
211 static inline void context_set_present(struct context_entry *context)
212 {
213         context->lo |= 1;
214 }
215
216 static inline void context_set_fault_enable(struct context_entry *context)
217 {
218         context->lo &= (((u64)-1) << 2) | 1;
219 }
220
221 static inline void context_set_translation_type(struct context_entry *context,
222                                                 unsigned long value)
223 {
224         context->lo &= (((u64)-1) << 4) | 3;
225         context->lo |= (value & 3) << 2;
226 }
227
228 static inline void context_set_address_root(struct context_entry *context,
229                                             unsigned long value)
230 {
231         context->lo |= value & VTD_PAGE_MASK;
232 }
233
234 static inline void context_set_address_width(struct context_entry *context,
235                                              unsigned long value)
236 {
237         context->hi |= value & 7;
238 }
239
240 static inline void context_set_domain_id(struct context_entry *context,
241                                          unsigned long value)
242 {
243         context->hi |= (value & ((1 << 16) - 1)) << 8;
244 }
245
246 static inline void context_clear_entry(struct context_entry *context)
247 {
248         context->lo = 0;
249         context->hi = 0;
250 }
251
252 /*
253  * 0: readable
254  * 1: writable
255  * 2-6: reserved
256  * 7: super page
257  * 8-10: available
258  * 11: snoop behavior
259  * 12-63: Host physical address
260  */
261 struct dma_pte {
262         u64 val;
263 };
264
265 static inline void dma_clear_pte(struct dma_pte *pte)
266 {
267         pte->val = 0;
268 }
269
270 static inline void dma_set_pte_readable(struct dma_pte *pte)
271 {
272         pte->val |= DMA_PTE_READ;
273 }
274
275 static inline void dma_set_pte_writable(struct dma_pte *pte)
276 {
277         pte->val |= DMA_PTE_WRITE;
278 }
279
280 static inline void dma_set_pte_snp(struct dma_pte *pte)
281 {
282         pte->val |= DMA_PTE_SNP;
283 }
284
285 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
286 {
287         pte->val = (pte->val & ~3) | (prot & 3);
288 }
289
290 static inline u64 dma_pte_addr(struct dma_pte *pte)
291 {
292 #ifdef CONFIG_64BIT
293         return pte->val & VTD_PAGE_MASK;
294 #else
295         /* Must have a full atomic 64-bit read */
296         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
297 #endif
298 }
299
300 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
301 {
302         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
303 }
304
305 static inline bool dma_pte_present(struct dma_pte *pte)
306 {
307         return (pte->val & 3) != 0;
308 }
309
310 static inline bool dma_pte_superpage(struct dma_pte *pte)
311 {
312         return (pte->val & (1 << 7));
313 }
314
315 static inline int first_pte_in_page(struct dma_pte *pte)
316 {
317         return !((unsigned long)pte & ~VTD_PAGE_MASK);
318 }
319
320 /*
321  * This domain is a static identity mapping domain.
322  *      1. This domain creates a static 1:1 mapping to all usable memory.
323  *      2. It maps to each iommu if successful.
324  *      3. Each iommu maps to this domain if successful.
325  */
326 static struct dmar_domain *si_domain;
327 static int hw_pass_through = 1;
328
329 /* devices under the same p2p bridge are owned in one domain */
330 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
331
332 /* domain represents a virtual machine; more than one device
333  * across iommus may be owned in one domain, e.g. kvm guest.
334  */
335 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
336
337 /* si_domain contains multiple devices */
338 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
339
340 struct dmar_domain {
341         int     id;                     /* domain id */
342         int     nid;                    /* node id */
343         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
344
345         struct list_head devices;       /* all devices' list */
346         struct iova_domain iovad;       /* iova's that belong to this domain */
347
348         struct dma_pte  *pgd;           /* virtual address */
349         int             gaw;            /* max guest address width */
350
351         /* adjusted guest address width, 0 is level 2 30-bit */
352         int             agaw;
353
354         int             flags;          /* flags to find out type of domain */
355
356         int             iommu_coherency;/* indicate coherency of iommu access */
357         int             iommu_snooping; /* indicate snooping control feature*/
358         int             iommu_count;    /* reference count of iommu */
359         int             iommu_superpage;/* Level of superpages supported:
360                                            0 == 4KiB (no superpages), 1 == 2MiB,
361                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
362         spinlock_t      iommu_lock;     /* protect iommu set in domain */
363         u64             max_addr;       /* maximum mapped address */
364 };
365
366 /* PCI domain-device relationship */
367 struct device_domain_info {
368         struct list_head link;  /* link to domain siblings */
369         struct list_head global; /* link to global list */
370         int segment;            /* PCI domain */
371         u8 bus;                 /* PCI bus number */
372         u8 devfn;               /* PCI devfn number */
373         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
374         struct intel_iommu *iommu; /* IOMMU used by this device */
375         struct dmar_domain *domain; /* pointer to domain */
376 };
377
378 static void flush_unmaps_timeout(unsigned long data);
379
380 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
381
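/*
 * Support for batched ("lazy") IOTLB flushing on unmap: instead of
 * flushing for every unmap, IOVAs are queued in these per-IOMMU tables
 * and released in batches, either from the unmap_timer above or once a
 * table reaches HIGH_WATER_MARK entries.  Booting with intel_iommu=strict
 * disables the batching and flushes synchronously.
 */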
382 #define HIGH_WATER_MARK 250
383 struct deferred_flush_tables {
384         int next;
385         struct iova *iova[HIGH_WATER_MARK];
386         struct dmar_domain *domain[HIGH_WATER_MARK];
387 };
388
389 static struct deferred_flush_tables *deferred_flush;
390
391 /* number of IOMMUs; used to size g_iommus and index the iommu bitmaps */
392 static int g_num_of_iommus;
393
394 static DEFINE_SPINLOCK(async_umap_flush_lock);
395 static LIST_HEAD(unmaps_to_do);
396
397 static int timer_on;
398 static long list_size;
399
400 static void domain_remove_dev_info(struct dmar_domain *domain);
401
402 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
403 int dmar_disabled = 0;
404 #else
405 int dmar_disabled = 1;
406 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
407
408 int intel_iommu_enabled = 0;
409 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
410
411 static int dmar_map_gfx = 1;
412 static int dmar_forcedac;
413 static int intel_iommu_strict;
414 static int intel_iommu_superpage = 1;
415
416 int intel_iommu_gfx_mapped;
417 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
418
419 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
420 static DEFINE_SPINLOCK(device_domain_lock);
421 static LIST_HEAD(device_domain_list);
422
423 static struct iommu_ops intel_iommu_ops;
424
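/*
 * Parser for the "intel_iommu=" boot parameter.  Options are
 * comma-separated and may be combined, e.g. intel_iommu=on,strict
 * or intel_iommu=igfx_off,sp_off.
 */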
425 static int __init intel_iommu_setup(char *str)
426 {
427         if (!str)
428                 return -EINVAL;
429         while (*str) {
430                 if (!strncmp(str, "on", 2)) {
431                         dmar_disabled = 0;
432                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
433                 } else if (!strncmp(str, "off", 3)) {
434                         dmar_disabled = 1;
435                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
436                 } else if (!strncmp(str, "igfx_off", 8)) {
437                         dmar_map_gfx = 0;
438                         printk(KERN_INFO
439                                 "Intel-IOMMU: disable GFX device mapping\n");
440                 } else if (!strncmp(str, "forcedac", 8)) {
441                         printk(KERN_INFO
442                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
443                         dmar_forcedac = 1;
444                 } else if (!strncmp(str, "strict", 6)) {
445                         printk(KERN_INFO
446                                 "Intel-IOMMU: disable batched IOTLB flush\n");
447                         intel_iommu_strict = 1;
448                 } else if (!strncmp(str, "sp_off", 6)) {
449                         printk(KERN_INFO
450                                 "Intel-IOMMU: disable supported super page\n");
451                         intel_iommu_superpage = 0;
452                 }
453
454                 str += strcspn(str, ",");
455                 while (*str == ',')
456                         str++;
457         }
458         return 0;
459 }
460 __setup("intel_iommu=", intel_iommu_setup);
461
462 static struct kmem_cache *iommu_domain_cache;
463 static struct kmem_cache *iommu_devinfo_cache;
464 static struct kmem_cache *iommu_iova_cache;
465
466 static inline void *alloc_pgtable_page(int node)
467 {
468         struct page *page;
469         void *vaddr = NULL;
470
471         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
472         if (page)
473                 vaddr = page_address(page);
474         return vaddr;
475 }
476
477 static inline void free_pgtable_page(void *vaddr)
478 {
479         free_page((unsigned long)vaddr);
480 }
481
482 static inline void *alloc_domain_mem(void)
483 {
484         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
485 }
486
487 static void free_domain_mem(void *vaddr)
488 {
489         kmem_cache_free(iommu_domain_cache, vaddr);
490 }
491
492 static inline void * alloc_devinfo_mem(void)
493 {
494         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
495 }
496
497 static inline void free_devinfo_mem(void *vaddr)
498 {
499         kmem_cache_free(iommu_devinfo_cache, vaddr);
500 }
501
502 struct iova *alloc_iova_mem(void)
503 {
504         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
505 }
506
507 void free_iova_mem(struct iova *iova)
508 {
509         kmem_cache_free(iommu_iova_cache, iova);
510 }
511
512
513 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
514 {
515         unsigned long sagaw;
516         int agaw = -1;
517
518         sagaw = cap_sagaw(iommu->cap);
519         for (agaw = width_to_agaw(max_gaw);
520              agaw >= 0; agaw--) {
521                 if (test_bit(agaw, &sagaw))
522                         break;
523         }
524
525         return agaw;
526 }
527
528 /*
529  * Calculate max SAGAW for each iommu.
530  */
531 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
532 {
533         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
534 }
535
536 /*
537  * Calculate agaw for each iommu.
538  * "SAGAW" may be different across iommus; use a default agaw, and
539  * fall back to a smaller supported agaw for iommus that don't support the default.
540  */
541 int iommu_calculate_agaw(struct intel_iommu *iommu)
542 {
543         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
544 }
545
546 /* This function only returns a single iommu in a domain */
547 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
548 {
549         int iommu_id;
550
551         /* si_domain and vm domain should not get here. */
552         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
553         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
554
555         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
556         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
557                 return NULL;
558
559         return g_iommus[iommu_id];
560 }
561
562 static void domain_update_iommu_coherency(struct dmar_domain *domain)
563 {
564         int i;
565
566         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
567
568         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
569
570         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
571                 if (!ecap_coherent(g_iommus[i]->ecap)) {
572                         domain->iommu_coherency = 0;
573                         break;
574                 }
575         }
576 }
577
578 static void domain_update_iommu_snooping(struct dmar_domain *domain)
579 {
580         int i;
581
582         domain->iommu_snooping = 1;
583
584         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
585                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
586                         domain->iommu_snooping = 0;
587                         break;
588                 }
589         }
590 }
591
592 static void domain_update_iommu_superpage(struct dmar_domain *domain)
593 {
594         struct dmar_drhd_unit *drhd;
595         struct intel_iommu *iommu = NULL;
596         int mask = 0xf;
597
598         if (!intel_iommu_superpage) {
599                 domain->iommu_superpage = 0;
600                 return;
601         }
602
603         /* set iommu_superpage to the largest level supported by all iommus */
604         for_each_active_iommu(iommu, drhd) {
605                 mask &= cap_super_page_val(iommu->cap);
606                 if (!mask) {
607                         break;
608                 }
609         }
610         domain->iommu_superpage = fls(mask);
611 }
612
613 /* Some capabilities may be different across iommus */
614 static void domain_update_iommu_cap(struct dmar_domain *domain)
615 {
616         domain_update_iommu_coherency(domain);
617         domain_update_iommu_snooping(domain);
618         domain_update_iommu_superpage(domain);
619 }
620
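/*
 * Find the IOMMU (DRHD unit) responsible for a device, identified by
 * PCI segment/bus/devfn.  A device matches if it is listed explicitly
 * under a DRHD unit, if it sits on a bus range behind a listed bridge,
 * or if the unit is an include-all unit for that segment.
 */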
621 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
622 {
623         struct dmar_drhd_unit *drhd = NULL;
624         int i;
625
626         for_each_drhd_unit(drhd) {
627                 if (drhd->ignored)
628                         continue;
629                 if (segment != drhd->segment)
630                         continue;
631
632                 for (i = 0; i < drhd->devices_cnt; i++) {
633                         if (drhd->devices[i] &&
634                             drhd->devices[i]->bus->number == bus &&
635                             drhd->devices[i]->devfn == devfn)
636                                 return drhd->iommu;
637                         if (drhd->devices[i] &&
638                             drhd->devices[i]->subordinate &&
639                             drhd->devices[i]->subordinate->number <= bus &&
640                             drhd->devices[i]->subordinate->subordinate >= bus)
641                                 return drhd->iommu;
642                 }
643
644                 if (drhd->include_all)
645                         return drhd->iommu;
646         }
647
648         return NULL;
649 }
650
651 static void domain_flush_cache(struct dmar_domain *domain,
652                                void *addr, int size)
653 {
654         if (!domain->iommu_coherency)
655                 clflush_cache_range(addr, size);
656 }
657
658 /* Gets context entry for a given bus and devfn */
659 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
660                 u8 bus, u8 devfn)
661 {
662         struct root_entry *root;
663         struct context_entry *context;
664         unsigned long phy_addr;
665         unsigned long flags;
666
667         spin_lock_irqsave(&iommu->lock, flags);
668         root = &iommu->root_entry[bus];
669         context = get_context_addr_from_root(root);
670         if (!context) {
671                 context = (struct context_entry *)
672                                 alloc_pgtable_page(iommu->node);
673                 if (!context) {
674                         spin_unlock_irqrestore(&iommu->lock, flags);
675                         return NULL;
676                 }
677                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
678                 phy_addr = virt_to_phys((void *)context);
679                 set_root_value(root, phy_addr);
680                 set_root_present(root);
681                 __iommu_flush_cache(iommu, root, sizeof(*root));
682         }
683         spin_unlock_irqrestore(&iommu->lock, flags);
684         return &context[devfn];
685 }
686
687 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
688 {
689         struct root_entry *root;
690         struct context_entry *context;
691         int ret;
692         unsigned long flags;
693
694         spin_lock_irqsave(&iommu->lock, flags);
695         root = &iommu->root_entry[bus];
696         context = get_context_addr_from_root(root);
697         if (!context) {
698                 ret = 0;
699                 goto out;
700         }
701         ret = context_present(&context[devfn]);
702 out:
703         spin_unlock_irqrestore(&iommu->lock, flags);
704         return ret;
705 }
706
707 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
708 {
709         struct root_entry *root;
710         struct context_entry *context;
711         unsigned long flags;
712
713         spin_lock_irqsave(&iommu->lock, flags);
714         root = &iommu->root_entry[bus];
715         context = get_context_addr_from_root(root);
716         if (context) {
717                 context_clear_entry(&context[devfn]);
718                 __iommu_flush_cache(iommu, &context[devfn], \
719                         sizeof(*context));
720         }
721         spin_unlock_irqrestore(&iommu->lock, flags);
722 }
723
724 static void free_context_table(struct intel_iommu *iommu)
725 {
726         struct root_entry *root;
727         int i;
728         unsigned long flags;
729         struct context_entry *context;
730
731         spin_lock_irqsave(&iommu->lock, flags);
732         if (!iommu->root_entry) {
733                 goto out;
734         }
735         for (i = 0; i < ROOT_ENTRY_NR; i++) {
736                 root = &iommu->root_entry[i];
737                 context = get_context_addr_from_root(root);
738                 if (context)
739                         free_pgtable_page(context);
740         }
741         free_pgtable_page(iommu->root_entry);
742         iommu->root_entry = NULL;
743 out:
744         spin_unlock_irqrestore(&iommu->lock, flags);
745 }
746
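/*
 * Walk the domain's page table and return the PTE for @pfn at
 * @target_level, allocating missing intermediate table pages on the way
 * down.  target_level == 0 means "stop wherever the walk ends": at the
 * first superpage or non-present entry.  The cmpxchg64() below lets
 * concurrent callers race to install the same intermediate table; the
 * loser simply frees its freshly allocated page.
 */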
747 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
748                                       unsigned long pfn, int target_level)
749 {
750         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
751         struct dma_pte *parent, *pte = NULL;
752         int level = agaw_to_level(domain->agaw);
753         int offset;
754
755         BUG_ON(!domain->pgd);
756         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
757         parent = domain->pgd;
758
759         while (level > 0) {
760                 void *tmp_page;
761
762                 offset = pfn_level_offset(pfn, level);
763                 pte = &parent[offset];
764                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
765                         break;
766                 if (level == target_level)
767                         break;
768
769                 if (!dma_pte_present(pte)) {
770                         uint64_t pteval;
771
772                         tmp_page = alloc_pgtable_page(domain->nid);
773
774                         if (!tmp_page)
775                                 return NULL;
776
777                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
778                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
779                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
780                                 /* Someone else set it while we were thinking; use theirs. */
781                                 free_pgtable_page(tmp_page);
782                         } else {
783                                 dma_pte_addr(pte);
784                                 domain_flush_cache(domain, pte, sizeof(*pte));
785                         }
786                 }
787                 parent = phys_to_virt(dma_pte_addr(pte));
788                 level--;
789         }
790
791         return pte;
792 }
793
794
795 /* return address's pte at specific level */
796 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
797                                          unsigned long pfn,
798                                          int level, int *large_page)
799 {
800         struct dma_pte *parent, *pte = NULL;
801         int total = agaw_to_level(domain->agaw);
802         int offset;
803
804         parent = domain->pgd;
805         while (level <= total) {
806                 offset = pfn_level_offset(pfn, total);
807                 pte = &parent[offset];
808                 if (level == total)
809                         return pte;
810
811                 if (!dma_pte_present(pte)) {
812                         *large_page = total;
813                         break;
814                 }
815
816                 if (pte->val & DMA_PTE_LARGE_PAGE) {
817                         *large_page = total;
818                         return pte;
819                 }
820
821                 parent = phys_to_virt(dma_pte_addr(pte));
822                 total--;
823         }
824         return NULL;
825 }
826
827 /* clear last level pte, a tlb flush should follow */
828 static int dma_pte_clear_range(struct dmar_domain *domain,
829                                 unsigned long start_pfn,
830                                 unsigned long last_pfn)
831 {
832         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
833         unsigned int large_page = 1;
834         struct dma_pte *first_pte, *pte;
835         int order;
836
837         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
838         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
839         BUG_ON(start_pfn > last_pfn);
840
841         /* we don't need lock here; nobody else touches the iova range */
842         do {
843                 large_page = 1;
844                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
845                 if (!pte) {
846                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
847                         continue;
848                 }
849                 do {
850                         dma_clear_pte(pte);
851                         start_pfn += lvl_to_nr_pages(large_page);
852                         pte++;
853                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
854
855                 domain_flush_cache(domain, first_pte,
856                                    (void *)pte - (void *)first_pte);
857
858         } while (start_pfn && start_pfn <= last_pfn);
859
860         order = (large_page - 1) * 9;
861         return order;
862 }
863
864 /* free page table pages. last level pte should already be cleared */
865 static void dma_pte_free_pagetable(struct dmar_domain *domain,
866                                    unsigned long start_pfn,
867                                    unsigned long last_pfn)
868 {
869         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
870         struct dma_pte *first_pte, *pte;
871         int total = agaw_to_level(domain->agaw);
872         int level;
873         unsigned long tmp;
874         int large_page = 2;
875
876         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
877         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
878         BUG_ON(start_pfn > last_pfn);
879
880         /* We don't need lock here; nobody else touches the iova range */
881         level = 2;
882         while (level <= total) {
883                 tmp = align_to_level(start_pfn, level);
884
885                 /* If we can't even clear one PTE at this level, we're done */
886                 if (tmp + level_size(level) - 1 > last_pfn)
887                         return;
888
889                 do {
890                         large_page = level;
891                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
892                         if (large_page > level)
893                                 level = large_page + 1;
894                         if (!pte) {
895                                 tmp = align_to_level(tmp + 1, level + 1);
896                                 continue;
897                         }
898                         do {
899                                 if (dma_pte_present(pte)) {
900                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
901                                         dma_clear_pte(pte);
902                                 }
903                                 pte++;
904                                 tmp += level_size(level);
905                         } while (!first_pte_in_page(pte) &&
906                                  tmp + level_size(level) - 1 <= last_pfn);
907
908                         domain_flush_cache(domain, first_pte,
909                                            (void *)pte - (void *)first_pte);
910
911                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
912                 level++;
913         }
914         /* free pgd */
915         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
916                 free_pgtable_page(domain->pgd);
917                 domain->pgd = NULL;
918         }
919 }
920
921 /* iommu handling */
922 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
923 {
924         struct root_entry *root;
925         unsigned long flags;
926
927         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
928         if (!root)
929                 return -ENOMEM;
930
931         __iommu_flush_cache(iommu, root, ROOT_SIZE);
932
933         spin_lock_irqsave(&iommu->lock, flags);
934         iommu->root_entry = root;
935         spin_unlock_irqrestore(&iommu->lock, flags);
936
937         return 0;
938 }
939
940 static void iommu_set_root_entry(struct intel_iommu *iommu)
941 {
942         void *addr;
943         u32 sts;
944         unsigned long flag;
945
946         addr = iommu->root_entry;
947
948         raw_spin_lock_irqsave(&iommu->register_lock, flag);
949         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
950
951         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
952
953         /* Make sure hardware completes it */
954         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
955                       readl, (sts & DMA_GSTS_RTPS), sts);
956
957         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
958 }
959
960 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
961 {
962         u32 val;
963         unsigned long flag;
964
965         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
966                 return;
967
968         raw_spin_lock_irqsave(&iommu->register_lock, flag);
969         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
970
971         /* Make sure hardware completes it */
972         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
973                       readl, (!(val & DMA_GSTS_WBFS)), val);
974
975         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
976 }
977
978 /* return value determines whether we need a write buffer flush */
979 static void __iommu_flush_context(struct intel_iommu *iommu,
980                                   u16 did, u16 source_id, u8 function_mask,
981                                   u64 type)
982 {
983         u64 val = 0;
984         unsigned long flag;
985
986         switch (type) {
987         case DMA_CCMD_GLOBAL_INVL:
988                 val = DMA_CCMD_GLOBAL_INVL;
989                 break;
990         case DMA_CCMD_DOMAIN_INVL:
991                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
992                 break;
993         case DMA_CCMD_DEVICE_INVL:
994                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
995                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
996                 break;
997         default:
998                 BUG();
999         }
1000         val |= DMA_CCMD_ICC;
1001
1002         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1003         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1004
1005         /* Make sure hardware completes it */
1006         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1007                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1008
1009         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1010 }
1011
1012 /* return value determines whether we need a write buffer flush */
1013 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1014                                 u64 addr, unsigned int size_order, u64 type)
1015 {
1016         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1017         u64 val = 0, val_iva = 0;
1018         unsigned long flag;
1019
1020         switch (type) {
1021         case DMA_TLB_GLOBAL_FLUSH:
1022                 /* global flush doesn't need to set IVA_REG */
1023                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1024                 break;
1025         case DMA_TLB_DSI_FLUSH:
1026                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1027                 break;
1028         case DMA_TLB_PSI_FLUSH:
1029                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1030                 /* Note: always flush non-leaf currently */
1031                 val_iva = size_order | addr;
1032                 break;
1033         default:
1034                 BUG();
1035         }
1036         /* Note: set drain read/write */
1037 #if 0
1038         /*
1039          * This is probably just to be extra safe.  Looks like we can
1040          * ignore it without any impact.
1041          */
1042         if (cap_read_drain(iommu->cap))
1043                 val |= DMA_TLB_READ_DRAIN;
1044 #endif
1045         if (cap_write_drain(iommu->cap))
1046                 val |= DMA_TLB_WRITE_DRAIN;
1047
1048         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1049         /* Note: Only uses first TLB reg currently */
1050         if (val_iva)
1051                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1052         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1053
1054         /* Make sure hardware completes it */
1055         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1056                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1057
1058         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1059
1060         /* check IOTLB invalidation granularity */
1061         if (DMA_TLB_IAIG(val) == 0)
1062                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1063         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1064                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1065                         (unsigned long long)DMA_TLB_IIRG(type),
1066                         (unsigned long long)DMA_TLB_IAIG(val));
1067 }
1068
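/*
 * Check whether the device at segment/bus/devfn can use an ATS
 * device-IOTLB: the IOMMU must advertise device-IOTLB support and have
 * queued invalidation set up, the device must be attached to @domain,
 * expose the PCIe ATS capability, and be covered by an ATSR unit.
 * Returns the device_domain_info on success, NULL otherwise.
 */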
1069 static struct device_domain_info *iommu_support_dev_iotlb(
1070         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1071 {
1072         int found = 0;
1073         unsigned long flags;
1074         struct device_domain_info *info;
1075         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1076
1077         if (!ecap_dev_iotlb_support(iommu->ecap))
1078                 return NULL;
1079
1080         if (!iommu->qi)
1081                 return NULL;
1082
1083         spin_lock_irqsave(&device_domain_lock, flags);
1084         list_for_each_entry(info, &domain->devices, link)
1085                 if (info->bus == bus && info->devfn == devfn) {
1086                         found = 1;
1087                         break;
1088                 }
1089         spin_unlock_irqrestore(&device_domain_lock, flags);
1090
1091         if (!found || !info->dev)
1092                 return NULL;
1093
1094         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1095                 return NULL;
1096
1097         if (!dmar_find_matched_atsr_unit(info->dev))
1098                 return NULL;
1099
1100         info->iommu = iommu;
1101
1102         return info;
1103 }
1104
1105 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1106 {
1107         if (!info)
1108                 return;
1109
1110         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1111 }
1112
1113 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1114 {
1115         if (!info->dev || !pci_ats_enabled(info->dev))
1116                 return;
1117
1118         pci_disable_ats(info->dev);
1119 }
1120
1121 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1122                                   u64 addr, unsigned mask)
1123 {
1124         u16 sid, qdep;
1125         unsigned long flags;
1126         struct device_domain_info *info;
1127
1128         spin_lock_irqsave(&device_domain_lock, flags);
1129         list_for_each_entry(info, &domain->devices, link) {
1130                 if (!info->dev || !pci_ats_enabled(info->dev))
1131                         continue;
1132
1133                 sid = info->bus << 8 | info->devfn;
1134                 qdep = pci_ats_queue_depth(info->dev);
1135                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1136         }
1137         spin_unlock_irqrestore(&device_domain_lock, flags);
1138 }
1139
1140 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1141                                   unsigned long pfn, unsigned int pages, int map)
1142 {
1143         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1144         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1145
1146         BUG_ON(pages == 0);
1147
1148         /*
1149          * Fall back to domain-selective flush if there is no PSI support or the
1150          * size is too big.
1151          * PSI requires the page size to be 2^x, and the base address to be
1152          * naturally aligned to that size.
1153          */
1154         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1155                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1156                                                 DMA_TLB_DSI_FLUSH);
1157         else
1158                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1159                                                 DMA_TLB_PSI_FLUSH);
1160
1161         /*
1162          * In caching mode, changes of pages from non-present to present require
1163          * flush. However, device IOTLB doesn't need to be flushed in this case.
1164          */
1165         if (!cap_caching_mode(iommu->cap) || !map)
1166                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1167 }
1168
1169 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1170 {
1171         u32 pmen;
1172         unsigned long flags;
1173
1174         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1175         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1176         pmen &= ~DMA_PMEN_EPM;
1177         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1178
1179         /* wait for the protected region status bit to clear */
1180         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1181                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1182
1183         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1184 }
1185
1186 static int iommu_enable_translation(struct intel_iommu *iommu)
1187 {
1188         u32 sts;
1189         unsigned long flags;
1190
1191         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1192         iommu->gcmd |= DMA_GCMD_TE;
1193         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1194
1195         /* Make sure hardware completes it */
1196         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1197                       readl, (sts & DMA_GSTS_TES), sts);
1198
1199         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1200         return 0;
1201 }
1202
1203 static int iommu_disable_translation(struct intel_iommu *iommu)
1204 {
1205         u32 sts;
1206         unsigned long flag;
1207
1208         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1209         iommu->gcmd &= ~DMA_GCMD_TE;
1210         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1211
1212         /* Make sure hardware completes it */
1213         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1214                       readl, (!(sts & DMA_GSTS_TES)), sts);
1215
1216         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1217         return 0;
1218 }
1219
1220
1221 static int iommu_init_domains(struct intel_iommu *iommu)
1222 {
1223         unsigned long ndomains;
1224         unsigned long nlongs;
1225
1226         ndomains = cap_ndoms(iommu->cap);
1227         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1228                         ndomains);
1229         nlongs = BITS_TO_LONGS(ndomains);
1230
1231         spin_lock_init(&iommu->lock);
1232
1233         /* TBD: there might be 64K domains,
1234          * consider other allocation for future chip
1235          */
1236         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1237         if (!iommu->domain_ids) {
1238                 printk(KERN_ERR "Allocating domain id array failed\n");
1239                 return -ENOMEM;
1240         }
1241         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1242                         GFP_KERNEL);
1243         if (!iommu->domains) {
1244                 printk(KERN_ERR "Allocating domain array failed\n");
1245                 return -ENOMEM;
1246         }
1247
1248         /*
1249          * if Caching mode is set, then invalid translations are tagged
1250          * with domainid 0. Hence we need to pre-allocate it.
1251          */
1252         if (cap_caching_mode(iommu->cap))
1253                 set_bit(0, iommu->domain_ids);
1254         return 0;
1255 }
1256
1257
1258 static void domain_exit(struct dmar_domain *domain);
1259 static void vm_domain_exit(struct dmar_domain *domain);
1260
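/*
 * Tear down an IOMMU's software state: release or exit every domain
 * still referencing it, disable translation, free its interrupt, the
 * domain bookkeeping arrays and the context tables, and drop it from
 * g_iommus (freeing g_iommus itself once the last IOMMU is gone).
 */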
1261 void free_dmar_iommu(struct intel_iommu *iommu)
1262 {
1263         struct dmar_domain *domain;
1264         int i;
1265         unsigned long flags;
1266
1267         if ((iommu->domains) && (iommu->domain_ids)) {
1268                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1269                         domain = iommu->domains[i];
1270                         clear_bit(i, iommu->domain_ids);
1271
1272                         spin_lock_irqsave(&domain->iommu_lock, flags);
1273                         if (--domain->iommu_count == 0) {
1274                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1275                                         vm_domain_exit(domain);
1276                                 else
1277                                         domain_exit(domain);
1278                         }
1279                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1280                 }
1281         }
1282
1283         if (iommu->gcmd & DMA_GCMD_TE)
1284                 iommu_disable_translation(iommu);
1285
1286         if (iommu->irq) {
1287                 irq_set_handler_data(iommu->irq, NULL);
1288                 /* This will mask the irq */
1289                 free_irq(iommu->irq, iommu);
1290                 destroy_irq(iommu->irq);
1291         }
1292
1293         kfree(iommu->domains);
1294         kfree(iommu->domain_ids);
1295
1296         g_iommus[iommu->seq_id] = NULL;
1297
1298         /* if all iommus are freed, free g_iommus */
1299         for (i = 0; i < g_num_of_iommus; i++) {
1300                 if (g_iommus[i])
1301                         break;
1302         }
1303
1304         if (i == g_num_of_iommus)
1305                 kfree(g_iommus);
1306
1307         /* free context mapping */
1308         free_context_table(iommu);
1309 }
1310
1311 static struct dmar_domain *alloc_domain(void)
1312 {
1313         struct dmar_domain *domain;
1314
1315         domain = alloc_domain_mem();
1316         if (!domain)
1317                 return NULL;
1318
1319         domain->nid = -1;
1320         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1321         domain->flags = 0;
1322
1323         return domain;
1324 }
1325
1326 static int iommu_attach_domain(struct dmar_domain *domain,
1327                                struct intel_iommu *iommu)
1328 {
1329         int num;
1330         unsigned long ndomains;
1331         unsigned long flags;
1332
1333         ndomains = cap_ndoms(iommu->cap);
1334
1335         spin_lock_irqsave(&iommu->lock, flags);
1336
1337         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1338         if (num >= ndomains) {
1339                 spin_unlock_irqrestore(&iommu->lock, flags);
1340                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1341                 return -ENOMEM;
1342         }
1343
1344         domain->id = num;
1345         set_bit(num, iommu->domain_ids);
1346         set_bit(iommu->seq_id, &domain->iommu_bmp);
1347         iommu->domains[num] = domain;
1348         spin_unlock_irqrestore(&iommu->lock, flags);
1349
1350         return 0;
1351 }
1352
1353 static void iommu_detach_domain(struct dmar_domain *domain,
1354                                 struct intel_iommu *iommu)
1355 {
1356         unsigned long flags;
1357         int num, ndomains;
1358         int found = 0;
1359
1360         spin_lock_irqsave(&iommu->lock, flags);
1361         ndomains = cap_ndoms(iommu->cap);
1362         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1363                 if (iommu->domains[num] == domain) {
1364                         found = 1;
1365                         break;
1366                 }
1367         }
1368
1369         if (found) {
1370                 clear_bit(num, iommu->domain_ids);
1371                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1372                 iommu->domains[num] = NULL;
1373         }
1374         spin_unlock_irqrestore(&iommu->lock, flags);
1375 }
1376
1377 static struct iova_domain reserved_iova_list;
1378 static struct lock_class_key reserved_rbtree_key;
1379
1380 static int dmar_init_reserved_ranges(void)
1381 {
1382         struct pci_dev *pdev = NULL;
1383         struct iova *iova;
1384         int i;
1385
1386         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1387
1388         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1389                 &reserved_rbtree_key);
1390
1391         /* IOAPIC ranges shouldn't be accessed by DMA */
1392         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1393                 IOVA_PFN(IOAPIC_RANGE_END));
1394         if (!iova) {
1395                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1396                 return -ENODEV;
1397         }
1398
1399         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1400         for_each_pci_dev(pdev) {
1401                 struct resource *r;
1402
1403                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1404                         r = &pdev->resource[i];
1405                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1406                                 continue;
1407                         iova = reserve_iova(&reserved_iova_list,
1408                                             IOVA_PFN(r->start),
1409                                             IOVA_PFN(r->end));
1410                         if (!iova) {
1411                                 printk(KERN_ERR "Reserve iova failed\n");
1412                                 return -ENODEV;
1413                         }
1414                 }
1415         }
1416         return 0;
1417 }
1418
1419 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1420 {
1421         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1422 }
1423
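/*
 * Round a guest address width up to a whole number of page-table
 * levels, i.e. so that (agaw - 12) is a multiple of 9, capped at 64.
 * For example 39 stays 39 (three levels) while 40 becomes 48.
 */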
1424 static inline int guestwidth_to_adjustwidth(int gaw)
1425 {
1426         int agaw;
1427         int r = (gaw - 12) % 9;
1428
1429         if (r == 0)
1430                 agaw = gaw;
1431         else
1432                 agaw = gaw + 9 - r;
1433         if (agaw > 64)
1434                 agaw = 64;
1435         return agaw;
1436 }
1437
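/*
 * Initialize a newly attached hardware domain: set up its iova
 * allocator (with the globally reserved ranges copied in), clamp
 * guest_width to the IOMMU's MGAW, pick an AGAW that the hardware's
 * SAGAW field supports, record the coherency/snooping/superpage
 * capabilities and allocate the top-level page directory.
 */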
1438 static int domain_init(struct dmar_domain *domain, int guest_width)
1439 {
1440         struct intel_iommu *iommu;
1441         int adjust_width, agaw;
1442         unsigned long sagaw;
1443
1444         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1445         spin_lock_init(&domain->iommu_lock);
1446
1447         domain_reserve_special_ranges(domain);
1448
1449         /* calculate AGAW */
1450         iommu = domain_get_iommu(domain);
1451         if (guest_width > cap_mgaw(iommu->cap))
1452                 guest_width = cap_mgaw(iommu->cap);
1453         domain->gaw = guest_width;
1454         adjust_width = guestwidth_to_adjustwidth(guest_width);
1455         agaw = width_to_agaw(adjust_width);
1456         sagaw = cap_sagaw(iommu->cap);
1457         if (!test_bit(agaw, &sagaw)) {
1458                 /* hardware doesn't support it, choose a bigger one */
1459                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1460                 agaw = find_next_bit(&sagaw, 5, agaw);
1461                 if (agaw >= 5)
1462                         return -ENODEV;
1463         }
1464         domain->agaw = agaw;
1465         INIT_LIST_HEAD(&domain->devices);
1466
1467         if (ecap_coherent(iommu->ecap))
1468                 domain->iommu_coherency = 1;
1469         else
1470                 domain->iommu_coherency = 0;
1471
1472         if (ecap_sc_support(iommu->ecap))
1473                 domain->iommu_snooping = 1;
1474         else
1475                 domain->iommu_snooping = 0;
1476
1477         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1478         domain->iommu_count = 1;
1479         domain->nid = iommu->node;
1480
1481         /* always allocate the top pgd */
1482         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1483         if (!domain->pgd)
1484                 return -ENOMEM;
1485         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1486         return 0;
1487 }
1488
1489 static void domain_exit(struct dmar_domain *domain)
1490 {
1491         struct dmar_drhd_unit *drhd;
1492         struct intel_iommu *iommu;
1493
1494         /* Domain 0 is reserved, so don't process it */
1495         if (!domain)
1496                 return;
1497
1498         /* Flush any lazy unmaps that may reference this domain */
1499         if (!intel_iommu_strict)
1500                 flush_unmaps_timeout(0);
1501
1502         domain_remove_dev_info(domain);
1503         /* destroy iovas */
1504         put_iova_domain(&domain->iovad);
1505
1506         /* clear ptes */
1507         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1508
1509         /* free page tables */
1510         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1511
1512         for_each_active_iommu(iommu, drhd)
1513                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1514                         iommu_detach_domain(domain, iommu);
1515
1516         free_domain_mem(domain);
1517 }
1518
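/*
 * Program the context entry for (segment, bus, devfn) so the device
 * uses @domain's page tables (or pass-through).  For virtual-machine
 * and static-identity domains a per-IOMMU domain id is found or
 * allocated, and top page-table levels are skipped when this IOMMU's
 * agaw is smaller than the domain's.  Afterwards the context cache and
 * IOTLB are flushed (or just the write buffer, without caching mode)
 * and the domain's iommu bitmap, refcount and capabilities are updated.
 */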
1519 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1520                                  u8 bus, u8 devfn, int translation)
1521 {
1522         struct context_entry *context;
1523         unsigned long flags;
1524         struct intel_iommu *iommu;
1525         struct dma_pte *pgd;
1526         unsigned long num;
1527         unsigned long ndomains;
1528         int id;
1529         int agaw;
1530         struct device_domain_info *info = NULL;
1531
1532         pr_debug("Set context mapping for %02x:%02x.%d\n",
1533                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1534
1535         BUG_ON(!domain->pgd);
1536         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1537                translation != CONTEXT_TT_MULTI_LEVEL);
1538
1539         iommu = device_to_iommu(segment, bus, devfn);
1540         if (!iommu)
1541                 return -ENODEV;
1542
1543         context = device_to_context_entry(iommu, bus, devfn);
1544         if (!context)
1545                 return -ENOMEM;
1546         spin_lock_irqsave(&iommu->lock, flags);
1547         if (context_present(context)) {
1548                 spin_unlock_irqrestore(&iommu->lock, flags);
1549                 return 0;
1550         }
1551
1552         id = domain->id;
1553         pgd = domain->pgd;
1554
1555         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1556             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1557                 int found = 0;
1558
1559                 /* find an available domain id for this device in iommu */
1560                 ndomains = cap_ndoms(iommu->cap);
1561                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1562                         if (iommu->domains[num] == domain) {
1563                                 id = num;
1564                                 found = 1;
1565                                 break;
1566                         }
1567                 }
1568
1569                 if (found == 0) {
1570                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1571                         if (num >= ndomains) {
1572                                 spin_unlock_irqrestore(&iommu->lock, flags);
1573                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1574                                 return -EFAULT;
1575                         }
1576
1577                         set_bit(num, iommu->domain_ids);
1578                         iommu->domains[num] = domain;
1579                         id = num;
1580                 }
1581
1582                 /* Skip the top levels of the page tables for an
1583                  * iommu which has a smaller agaw than the domain's
1584                  * default. Unnecessary for PT mode.
1585                  */
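                /* Worked example (editor's note, assuming a 9-bit stride per
                 * level): a domain built for 48-bit addressing uses a 4-level
                 * table, while an iommu limited to 39 bits walks only 3
                 * levels, so we descend one level here and hand the iommu
                 * the lower-level table as its root.
                 */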
1586                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1587                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1588                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1589                                 if (!dma_pte_present(pgd)) {
1590                                         spin_unlock_irqrestore(&iommu->lock, flags);
1591                                         return -ENOMEM;
1592                                 }
1593                         }
1594                 }
1595         }
1596
1597         context_set_domain_id(context, id);
1598
1599         if (translation != CONTEXT_TT_PASS_THROUGH) {
1600                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1601                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1602                                      CONTEXT_TT_MULTI_LEVEL;
1603         }
1604         /*
1605          * In pass through mode, AW must be programmed to indicate the largest
1606          * AGAW value supported by hardware. And ASR is ignored by hardware.
1607          */
1608         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1609                 context_set_address_width(context, iommu->msagaw);
1610         else {
1611                 context_set_address_root(context, virt_to_phys(pgd));
1612                 context_set_address_width(context, iommu->agaw);
1613         }
1614
1615         context_set_translation_type(context, translation);
1616         context_set_fault_enable(context);
1617         context_set_present(context);
1618         domain_flush_cache(domain, context, sizeof(*context));
1619
1620         /*
1621          * It's a non-present to present mapping. If hardware doesn't cache
1622          * non-present entries we only need to flush the write-buffer. If it
1623          * _does_ cache non-present entries, then it does so in the special
1624          * domain #0, which we have to flush:
1625          */
1626         if (cap_caching_mode(iommu->cap)) {
1627                 iommu->flush.flush_context(iommu, 0,
1628                                            (((u16)bus) << 8) | devfn,
1629                                            DMA_CCMD_MASK_NOBIT,
1630                                            DMA_CCMD_DEVICE_INVL);
1631                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1632         } else {
1633                 iommu_flush_write_buffer(iommu);
1634         }
1635         iommu_enable_dev_iotlb(info);
1636         spin_unlock_irqrestore(&iommu->lock, flags);
1637
1638         spin_lock_irqsave(&domain->iommu_lock, flags);
1639         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1640                 domain->iommu_count++;
1641                 if (domain->iommu_count == 1)
1642                         domain->nid = iommu->node;
1643                 domain_update_iommu_cap(domain);
1644         }
1645         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1646         return 0;
1647 }
1648
1649 static int
1650 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1651                         int translation)
1652 {
1653         int ret;
1654         struct pci_dev *tmp, *parent;
1655
1656         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1657                                          pdev->bus->number, pdev->devfn,
1658                                          translation);
1659         if (ret)
1660                 return ret;
1661
1662         /* dependent device mapping */
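        /* Editor's note: requests from a device behind a PCIe-to-PCI bridge
         * may arrive tagged with the source-id of any bridge on the path, so
         * the walk below also maps each parent bridge, and finally the
         * upstream bridge itself, into this domain.
         */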
1663         tmp = pci_find_upstream_pcie_bridge(pdev);
1664         if (!tmp)
1665                 return 0;
1666         /* Secondary interface's bus number and devfn 0 */
1667         parent = pdev->bus->self;
1668         while (parent != tmp) {
1669                 ret = domain_context_mapping_one(domain,
1670                                                  pci_domain_nr(parent->bus),
1671                                                  parent->bus->number,
1672                                                  parent->devfn, translation);
1673                 if (ret)
1674                         return ret;
1675                 parent = parent->bus->self;
1676         }
1677         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1678                 return domain_context_mapping_one(domain,
1679                                         pci_domain_nr(tmp->subordinate),
1680                                         tmp->subordinate->number, 0,
1681                                         translation);
1682         else /* this is a legacy PCI bridge */
1683                 return domain_context_mapping_one(domain,
1684                                                   pci_domain_nr(tmp->bus),
1685                                                   tmp->bus->number,
1686                                                   tmp->devfn,
1687                                                   translation);
1688 }
1689
1690 static int domain_context_mapped(struct pci_dev *pdev)
1691 {
1692         int ret;
1693         struct pci_dev *tmp, *parent;
1694         struct intel_iommu *iommu;
1695
1696         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1697                                 pdev->devfn);
1698         if (!iommu)
1699                 return -ENODEV;
1700
1701         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1702         if (!ret)
1703                 return ret;
1704         /* dependent device mapping */
1705         tmp = pci_find_upstream_pcie_bridge(pdev);
1706         if (!tmp)
1707                 return ret;
1708         /* Secondary interface's bus number and devfn 0 */
1709         parent = pdev->bus->self;
1710         while (parent != tmp) {
1711                 ret = device_context_mapped(iommu, parent->bus->number,
1712                                             parent->devfn);
1713                 if (!ret)
1714                         return ret;
1715                 parent = parent->bus->self;
1716         }
1717         if (pci_is_pcie(tmp))
1718                 return device_context_mapped(iommu, tmp->subordinate->number,
1719                                              0);
1720         else
1721                 return device_context_mapped(iommu, tmp->bus->number,
1722                                              tmp->devfn);
1723 }
1724
1725 /* Returns a number of VTD pages, but aligned to MM page size */
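/* Worked example (editor's note, assuming 4KiB MM and VT-d pages): an offset
 * of 0x100 into a page with size 0x2000 rounds up to PAGE_ALIGN(0x2100) =
 * 0x3000, i.e. 3 VT-d pages.
 */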
1726 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1727                                             size_t size)
1728 {
1729         host_addr &= ~PAGE_MASK;
1730         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1731 }
1732
1733 /* Return largest possible superpage level for a given mapping */
1734 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1735                                           unsigned long iov_pfn,
1736                                           unsigned long phy_pfn,
1737                                           unsigned long pages)
1738 {
1739         int support, level = 1;
1740         unsigned long pfnmerge;
1741
1742         support = domain->iommu_superpage;
1743
1744         /* To use a large page, the virtual *and* physical addresses
1745            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1746            of them will mean we have to use smaller pages. So just
1747            merge them and check both at once. */
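        /* Worked example (editor's note, assuming 4KiB pages and a 9-bit
         * stride): to use a 2MiB superpage (level 2), both pfns must have
         * their low 9 bits clear and at least 512 pages must remain in this
         * run; otherwise we stay at level 1.
         */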
1748         pfnmerge = iov_pfn | phy_pfn;
1749
1750         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1751                 pages >>= VTD_STRIDE_SHIFT;
1752                 if (!pages)
1753                         break;
1754                 pfnmerge >>= VTD_STRIDE_SHIFT;
1755                 level++;
1756                 support--;
1757         }
1758         return level;
1759 }
1760
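/* Editor's summary (descriptive only): __domain_mapping() populates the
 * domain's page tables for nr_pages starting at iov_pfn, taking the physical
 * pages either from a scatterlist (sg != NULL) or from a contiguous range
 * starting at phys_pfn. Superpage PTEs are used whenever
 * hardware_largepage_caps() says the alignment and remaining length allow it.
 */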
1761 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1762                             struct scatterlist *sg, unsigned long phys_pfn,
1763                             unsigned long nr_pages, int prot)
1764 {
1765         struct dma_pte *first_pte = NULL, *pte = NULL;
1766         phys_addr_t uninitialized_var(pteval);
1767         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1768         unsigned long sg_res;
1769         unsigned int largepage_lvl = 0;
1770         unsigned long lvl_pages = 0;
1771
1772         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1773
1774         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1775                 return -EINVAL;
1776
1777         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1778
1779         if (sg)
1780                 sg_res = 0;
1781         else {
1782                 sg_res = nr_pages + 1;
1783                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1784         }
1785
1786         while (nr_pages > 0) {
1787                 uint64_t tmp;
1788
1789                 if (!sg_res) {
1790                         sg_res = aligned_nrpages(sg->offset, sg->length);
1791                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1792                         sg->dma_length = sg->length;
1793                         pteval = page_to_phys(sg_page(sg)) | prot;
1794                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1795                 }
1796
1797                 if (!pte) {
1798                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1799
1800                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1801                         if (!pte)
1802                                 return -ENOMEM;
1803                         /* It is a large page */
1804                         if (largepage_lvl > 1) {
1805                                 pteval |= DMA_PTE_LARGE_PAGE;
1806                                 /* Ensure that any old small-page tables are removed
1807                                    to make room for the superpage, if they exist. */
1808                                 dma_pte_clear_range(domain, iov_pfn,
1809                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1810                                 dma_pte_free_pagetable(domain, iov_pfn,
1811                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1812                         } else {
1813                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1814                         }
1815
1816                 }
1817                 /* We don't need a lock here; nobody else
1818                  * touches this iova range.
1819                  */
1820                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1821                 if (tmp) {
1822                         static int dumps = 5;
1823                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1824                                iov_pfn, tmp, (unsigned long long)pteval);
1825                         if (dumps) {
1826                                 dumps--;
1827                                 debug_dma_dump_mappings(NULL);
1828                         }
1829                         WARN_ON(1);
1830                 }
1831
1832                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1833
1834                 BUG_ON(nr_pages < lvl_pages);
1835                 BUG_ON(sg_res < lvl_pages);
1836
1837                 nr_pages -= lvl_pages;
1838                 iov_pfn += lvl_pages;
1839                 phys_pfn += lvl_pages;
1840                 pteval += lvl_pages * VTD_PAGE_SIZE;
1841                 sg_res -= lvl_pages;
1842
1843                 /* If the next PTE would be the first in a new page, then we
1844                    need to flush the cache on the entries we've just written.
1845                    And then we'll need to recalculate 'pte', so clear it and
1846                    let it get set again in the if (!pte) block above.
1847
1848                    If we're done (!nr_pages) we need to flush the cache too.
1849
1850                    Also if we've been setting superpages, we may need to
1851                    recalculate 'pte' and switch back to smaller pages for the
1852                    end of the mapping, if the trailing size is not enough to
1853                    use another superpage (i.e. sg_res < lvl_pages). */
1854                 pte++;
1855                 if (!nr_pages || first_pte_in_page(pte) ||
1856                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1857                         domain_flush_cache(domain, first_pte,
1858                                            (void *)pte - (void *)first_pte);
1859                         pte = NULL;
1860                 }
1861
1862                 if (!sg_res && nr_pages)
1863                         sg = sg_next(sg);
1864         }
1865         return 0;
1866 }
1867
1868 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1869                                     struct scatterlist *sg, unsigned long nr_pages,
1870                                     int prot)
1871 {
1872         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1873 }
1874
1875 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1876                                      unsigned long phys_pfn, unsigned long nr_pages,
1877                                      int prot)
1878 {
1879         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1880 }
1881
1882 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1883 {
1884         if (!iommu)
1885                 return;
1886
1887         clear_context_table(iommu, bus, devfn);
1888         iommu->flush.flush_context(iommu, 0, 0, 0,
1889                                            DMA_CCMD_GLOBAL_INVL);
1890         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1891 }
1892
1893 static void domain_remove_dev_info(struct dmar_domain *domain)
1894 {
1895         struct device_domain_info *info;
1896         unsigned long flags;
1897         struct intel_iommu *iommu;
1898
1899         spin_lock_irqsave(&device_domain_lock, flags);
1900         while (!list_empty(&domain->devices)) {
1901                 info = list_entry(domain->devices.next,
1902                         struct device_domain_info, link);
1903                 list_del(&info->link);
1904                 list_del(&info->global);
1905                 if (info->dev)
1906                         info->dev->dev.archdata.iommu = NULL;
1907                 spin_unlock_irqrestore(&device_domain_lock, flags);
1908
1909                 iommu_disable_dev_iotlb(info);
1910                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1911                 iommu_detach_dev(iommu, info->bus, info->devfn);
1912                 free_devinfo_mem(info);
1913
1914                 spin_lock_irqsave(&device_domain_lock, flags);
1915         }
1916         spin_unlock_irqrestore(&device_domain_lock, flags);
1917 }
1918
1919 /*
1920  * find_domain
1921  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1922  */
1923 static struct dmar_domain *
1924 find_domain(struct pci_dev *pdev)
1925 {
1926         struct device_domain_info *info;
1927
1928         /* No lock here, assumes no domain exit in normal case */
1929         info = pdev->dev.archdata.iommu;
1930         if (info)
1931                 return info->domain;
1932         return NULL;
1933 }
1934
1935 /* Find or allocate the domain for a device; the returned domain is initialized */
1936 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1937 {
1938         struct dmar_domain *domain, *found = NULL;
1939         struct intel_iommu *iommu;
1940         struct dmar_drhd_unit *drhd;
1941         struct device_domain_info *info, *tmp;
1942         struct pci_dev *dev_tmp;
1943         unsigned long flags;
1944         int bus = 0, devfn = 0;
1945         int segment;
1946         int ret;
1947
1948         domain = find_domain(pdev);
1949         if (domain)
1950                 return domain;
1951
1952         segment = pci_domain_nr(pdev->bus);
1953
1954         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1955         if (dev_tmp) {
1956                 if (pci_is_pcie(dev_tmp)) {
1957                         bus = dev_tmp->subordinate->number;
1958                         devfn = 0;
1959                 } else {
1960                         bus = dev_tmp->bus->number;
1961                         devfn = dev_tmp->devfn;
1962                 }
1963                 spin_lock_irqsave(&device_domain_lock, flags);
1964                 list_for_each_entry(info, &device_domain_list, global) {
1965                         if (info->segment == segment &&
1966                             info->bus == bus && info->devfn == devfn) {
1967                                 found = info->domain;
1968                                 break;
1969                         }
1970                 }
1971                 spin_unlock_irqrestore(&device_domain_lock, flags);
1972                 /* pcie-to-pci bridge already has a domain, use it */
1973                 if (found) {
1974                         domain = found;
1975                         goto found_domain;
1976                 }
1977         }
1978
1979         domain = alloc_domain();
1980         if (!domain)
1981                 goto error;
1982
1983         /* Allocate new domain for the device */
1984         drhd = dmar_find_matched_drhd_unit(pdev);
1985         if (!drhd) {
1986                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1987                         pci_name(pdev));
1988                 return NULL;
1989         }
1990         iommu = drhd->iommu;
1991
1992         ret = iommu_attach_domain(domain, iommu);
1993         if (ret) {
1994                 free_domain_mem(domain);
1995                 goto error;
1996         }
1997
1998         if (domain_init(domain, gaw)) {
1999                 domain_exit(domain);
2000                 goto error;
2001         }
2002
2003         /* register pcie-to-pci device */
2004         if (dev_tmp) {
2005                 info = alloc_devinfo_mem();
2006                 if (!info) {
2007                         domain_exit(domain);
2008                         goto error;
2009                 }
2010                 info->segment = segment;
2011                 info->bus = bus;
2012                 info->devfn = devfn;
2013                 info->dev = NULL;
2014                 info->domain = domain;
2015                 /* This domain is shared by devices under p2p bridge */
2016                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2017
2018                 /* pcie-to-pci bridge already has a domain, use it */
2019                 found = NULL;
2020                 spin_lock_irqsave(&device_domain_lock, flags);
2021                 list_for_each_entry(tmp, &device_domain_list, global) {
2022                         if (tmp->segment == segment &&
2023                             tmp->bus == bus && tmp->devfn == devfn) {
2024                                 found = tmp->domain;
2025                                 break;
2026                         }
2027                 }
2028                 if (found) {
2029                         spin_unlock_irqrestore(&device_domain_lock, flags);
2030                         free_devinfo_mem(info);
2031                         domain_exit(domain);
2032                         domain = found;
2033                 } else {
2034                         list_add(&info->link, &domain->devices);
2035                         list_add(&info->global, &device_domain_list);
2036                         spin_unlock_irqrestore(&device_domain_lock, flags);
2037                 }
2038         }
2039
2040 found_domain:
2041         info = alloc_devinfo_mem();
2042         if (!info)
2043                 goto error;
2044         info->segment = segment;
2045         info->bus = pdev->bus->number;
2046         info->devfn = pdev->devfn;
2047         info->dev = pdev;
2048         info->domain = domain;
2049         spin_lock_irqsave(&device_domain_lock, flags);
2050         /* somebody else was faster and already set the domain */
2051         found = find_domain(pdev);
2052         if (found != NULL) {
2053                 spin_unlock_irqrestore(&device_domain_lock, flags);
2054                 if (found != domain) {
2055                         domain_exit(domain);
2056                         domain = found;
2057                 }
2058                 free_devinfo_mem(info);
2059                 return domain;
2060         }
2061         list_add(&info->link, &domain->devices);
2062         list_add(&info->global, &device_domain_list);
2063         pdev->dev.archdata.iommu = info;
2064         spin_unlock_irqrestore(&device_domain_lock, flags);
2065         return domain;
2066 error:
2067         /* recheck here; somebody else may have set it in the meantime */
2068         return find_domain(pdev);
2069 }
2070
2071 static int iommu_identity_mapping;
2072 #define IDENTMAP_ALL            1
2073 #define IDENTMAP_GFX            2
2074 #define IDENTMAP_AZALIA         4
2075
2076 static int iommu_domain_identity_map(struct dmar_domain *domain,
2077                                      unsigned long long start,
2078                                      unsigned long long end)
2079 {
2080         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2081         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2082
2083         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2084                           dma_to_mm_pfn(last_vpfn))) {
2085                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2086                 return -ENOMEM;
2087         }
2088
2089         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2090                  start, end, domain->id);
2091         /*
2092          * RMRR range might have overlap with physical memory range,
2093          * clear it first
2094          */
2095         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2096
2097         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2098                                   last_vpfn - first_vpfn + 1,
2099                                   DMA_PTE_READ|DMA_PTE_WRITE);
2100 }
2101
2102 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2103                                       unsigned long long start,
2104                                       unsigned long long end)
2105 {
2106         struct dmar_domain *domain;
2107         int ret;
2108
2109         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2110         if (!domain)
2111                 return -ENOMEM;
2112
2113         /* For _hardware_ passthrough, don't bother. But for software
2114            passthrough, we do it anyway -- it may indicate a memory
2115            range which is reserved in E820, and so didn't get set
2116            up to start with in si_domain. */
2117         if (domain == si_domain && hw_pass_through) {
2118                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2119                        pci_name(pdev), start, end);
2120                 return 0;
2121         }
2122
2123         printk(KERN_INFO
2124                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2125                pci_name(pdev), start, end);
2126         
2127         if (end < start) {
2128                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2129                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2130                         dmi_get_system_info(DMI_BIOS_VENDOR),
2131                         dmi_get_system_info(DMI_BIOS_VERSION),
2132                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2133                 ret = -EIO;
2134                 goto error;
2135         }
2136
2137         if (end >> agaw_to_width(domain->agaw)) {
2138                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2139                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2140                      agaw_to_width(domain->agaw),
2141                      dmi_get_system_info(DMI_BIOS_VENDOR),
2142                      dmi_get_system_info(DMI_BIOS_VERSION),
2143                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2144                 ret = -EIO;
2145                 goto error;
2146         }
2147
2148         ret = iommu_domain_identity_map(domain, start, end);
2149         if (ret)
2150                 goto error;
2151
2152         /* context entry init */
2153         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2154         if (ret)
2155                 goto error;
2156
2157         return 0;
2158
2159  error:
2160         domain_exit(domain);
2161         return ret;
2162 }
2163
2164 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2165         struct pci_dev *pdev)
2166 {
2167         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2168                 return 0;
2169         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2170                 rmrr->end_address);
2171 }
2172
2173 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2174 static inline void iommu_prepare_isa(void)
2175 {
2176         struct pci_dev *pdev;
2177         int ret;
2178
2179         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2180         if (!pdev)
2181                 return;
2182
2183         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2184         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2185
2186         if (ret)
2187                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2188                        "floppy might not work\n");
2189
2190 }
2191 #else
2192 static inline void iommu_prepare_isa(void)
2193 {
2194         return;
2195 }
2196 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2197
2198 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2199
2200 static int __init si_domain_work_fn(unsigned long start_pfn,
2201                                     unsigned long end_pfn, void *datax)
2202 {
2203         int *ret = datax;
2204
2205         *ret = iommu_domain_identity_map(si_domain,
2206                                          (uint64_t)start_pfn << PAGE_SHIFT,
2207                                          (uint64_t)end_pfn << PAGE_SHIFT);
2208         return *ret;
2209
2210 }
2211
2212 static int __init si_domain_init(int hw)
2213 {
2214         struct dmar_drhd_unit *drhd;
2215         struct intel_iommu *iommu;
2216         int nid, ret = 0;
2217
2218         si_domain = alloc_domain();
2219         if (!si_domain)
2220                 return -EFAULT;
2221
2222         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2223
2224         for_each_active_iommu(iommu, drhd) {
2225                 ret = iommu_attach_domain(si_domain, iommu);
2226                 if (ret) {
2227                         domain_exit(si_domain);
2228                         return -EFAULT;
2229                 }
2230         }
2231
2232         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2233                 domain_exit(si_domain);
2234                 return -EFAULT;
2235         }
2236
2237         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2238
2239         if (hw)
2240                 return 0;
2241
2242         for_each_online_node(nid) {
2243                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2244                 if (ret)
2245                         return ret;
2246         }
2247
2248         return 0;
2249 }
2250
2251 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2252                                           struct pci_dev *pdev);
2253 static int identity_mapping(struct pci_dev *pdev)
2254 {
2255         struct device_domain_info *info;
2256
2257         if (likely(!iommu_identity_mapping))
2258                 return 0;
2259
2260         info = pdev->dev.archdata.iommu;
2261         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2262                 return (info->domain == si_domain);
2263
2264         return 0;
2265 }
2266
2267 static int domain_add_dev_info(struct dmar_domain *domain,
2268                                struct pci_dev *pdev,
2269                                int translation)
2270 {
2271         struct device_domain_info *info;
2272         unsigned long flags;
2273         int ret;
2274
2275         info = alloc_devinfo_mem();
2276         if (!info)
2277                 return -ENOMEM;
2278
2279         info->segment = pci_domain_nr(pdev->bus);
2280         info->bus = pdev->bus->number;
2281         info->devfn = pdev->devfn;
2282         info->dev = pdev;
2283         info->domain = domain;
2284
2285         spin_lock_irqsave(&device_domain_lock, flags);
2286         list_add(&info->link, &domain->devices);
2287         list_add(&info->global, &device_domain_list);
2288         pdev->dev.archdata.iommu = info;
2289         spin_unlock_irqrestore(&device_domain_lock, flags);
2290
2291         ret = domain_context_mapping(domain, pdev, translation);
2292         if (ret) {
2293                 spin_lock_irqsave(&device_domain_lock, flags);
2294                 list_del(&info->link);
2295                 list_del(&info->global);
2296                 pdev->dev.archdata.iommu = NULL;
2297                 spin_unlock_irqrestore(&device_domain_lock, flags);
2298                 free_devinfo_mem(info);
2299                 return ret;
2300         }
2301
2302         return 0;
2303 }
2304
2305 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2306 {
2307         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2308                 return 1;
2309
2310         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2311                 return 1;
2312
2313         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2314                 return 0;
2315
2316         /*
2317          * We want to start off with all devices in the 1:1 domain, and
2318          * take them out later if we find they can't access all of memory.
2319          *
2320          * However, we can't do this for PCI devices behind bridges,
2321          * because all PCI devices behind the same bridge will end up
2322          * with the same source-id on their transactions.
2323          *
2324          * Practically speaking, we can't change things around for these
2325          * devices at run-time, because we can't be sure there'll be no
2326          * DMA transactions in flight for any of their siblings.
2327          * 
2328          * So PCI devices (unless they're on the root bus) as well as
2329          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2330          * the 1:1 domain, just in _case_ one of their siblings turns out
2331          * not to be able to map all of memory.
2332          */
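        /* Editor's note: e.g. a conventional PCI device behind a PCIe-to-PCI
         * bridge shares a source-id with every sibling on that bus, so it is
         * kept out of the 1:1 domain even if it is itself 64-bit capable.
         */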
2333         if (!pci_is_pcie(pdev)) {
2334                 if (!pci_is_root_bus(pdev->bus))
2335                         return 0;
2336                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2337                         return 0;
2338         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2339                 return 0;
2340
2341         /* 
2342          * At boot time, we don't yet know if devices will be 64-bit capable.
2343          * Assume that they will -- if they turn out not to be, then we can 
2344          * take them out of the 1:1 domain later.
2345          */
2346         if (!startup) {
2347                 /*
2348                  * If the device's dma_mask is less than the system's memory
2349                  * size then this is not a candidate for identity mapping.
2350                  */
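                /* Editor's note: e.g. a device stuck with a 32-bit dma_mask
                 * on a machine with RAM above 4GiB fails this check and is
                 * moved out of the 1:1 domain on first use.
                 */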
2351                 u64 dma_mask = pdev->dma_mask;
2352
2353                 if (pdev->dev.coherent_dma_mask &&
2354                     pdev->dev.coherent_dma_mask < dma_mask)
2355                         dma_mask = pdev->dev.coherent_dma_mask;
2356
2357                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2358         }
2359
2360         return 1;
2361 }
2362
2363 static int __init iommu_prepare_static_identity_mapping(int hw)
2364 {
2365         struct pci_dev *pdev = NULL;
2366         int ret;
2367
2368         ret = si_domain_init(hw);
2369         if (ret)
2370                 return -EFAULT;
2371
2372         for_each_pci_dev(pdev) {
2373                 /* Skip Host/PCI Bridge devices */
2374                 if (IS_BRIDGE_HOST_DEVICE(pdev))
2375                         continue;
2376                 if (iommu_should_identity_map(pdev, 1)) {
2377                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2378                                hw ? "hardware" : "software", pci_name(pdev));
2379
2380                         ret = domain_add_dev_info(si_domain, pdev,
2381                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2382                                                      CONTEXT_TT_MULTI_LEVEL);
2383                         if (ret)
2384                                 return ret;
2385                 }
2386         }
2387
2388         return 0;
2389 }
2390
2391 static int __init init_dmars(void)
2392 {
2393         struct dmar_drhd_unit *drhd;
2394         struct dmar_rmrr_unit *rmrr;
2395         struct pci_dev *pdev;
2396         struct intel_iommu *iommu;
2397         int i, ret;
2398
2399         /*
2400          * for each drhd
2401          *    allocate root
2402          *    initialize and program root entry to not present
2403          * endfor
2404          */
2405         for_each_drhd_unit(drhd) {
2406                 g_num_of_iommus++;
2407                 /*
2408                  * lock not needed as this is only incremented in the single-
2409                  * threaded kernel __init code path; all other accesses are
2410                  * read only
2411                  */
2412         }
2413
2414         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2415                         GFP_KERNEL);
2416         if (!g_iommus) {
2417                 printk(KERN_ERR "Allocating global iommu array failed\n");
2418                 ret = -ENOMEM;
2419                 goto error;
2420         }
2421
2422         deferred_flush = kzalloc(g_num_of_iommus *
2423                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2424         if (!deferred_flush) {
2425                 ret = -ENOMEM;
2426                 goto error;
2427         }
2428
2429         for_each_drhd_unit(drhd) {
2430                 if (drhd->ignored)
2431                         continue;
2432
2433                 iommu = drhd->iommu;
2434                 g_iommus[iommu->seq_id] = iommu;
2435
2436                 ret = iommu_init_domains(iommu);
2437                 if (ret)
2438                         goto error;
2439
2440                 /*
2441                  * TBD:
2442                  * we could share the same root & context tables
2443                  * among all IOMMUs. Need to split it later.
2444                  */
2445                 ret = iommu_alloc_root_entry(iommu);
2446                 if (ret) {
2447                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2448                         goto error;
2449                 }
2450                 if (!ecap_pass_through(iommu->ecap))
2451                         hw_pass_through = 0;
2452         }
2453
2454         /*
2455          * Start from a sane iommu hardware state.
2456          */
2457         for_each_drhd_unit(drhd) {
2458                 if (drhd->ignored)
2459                         continue;
2460
2461                 iommu = drhd->iommu;
2462
2463                 /*
2464                  * If the queued invalidation is already initialized by us
2465                  * (for example, while enabling interrupt-remapping) then
2466                  * things are already rolling from a sane state.
2467                  */
2468                 if (iommu->qi)
2469                         continue;
2470
2471                 /*
2472                  * Clear any previous faults.
2473                  */
2474                 dmar_fault(-1, iommu);
2475                 /*
2476                  * Disable queued invalidation if supported and already enabled
2477                  * before OS handover.
2478                  */
2479                 dmar_disable_qi(iommu);
2480         }
2481
2482         for_each_drhd_unit(drhd) {
2483                 if (drhd->ignored)
2484                         continue;
2485
2486                 iommu = drhd->iommu;
2487
2488                 if (dmar_enable_qi(iommu)) {
2489                         /*
2490                          * Queued Invalidate not enabled, use Register Based
2491                          * Invalidate
2492                          */
2493                         iommu->flush.flush_context = __iommu_flush_context;
2494                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2495                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2496                                "invalidation\n",
2497                                 iommu->seq_id,
2498                                (unsigned long long)drhd->reg_base_addr);
2499                 } else {
2500                         iommu->flush.flush_context = qi_flush_context;
2501                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2502                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2503                                "invalidation\n",
2504                                 iommu->seq_id,
2505                                (unsigned long long)drhd->reg_base_addr);
2506                 }
2507         }
2508
2509         if (iommu_pass_through)
2510                 iommu_identity_mapping |= IDENTMAP_ALL;
2511
2512 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2513         iommu_identity_mapping |= IDENTMAP_GFX;
2514 #endif
2515
2516         check_tylersburg_isoch();
2517
2518         /*
2519          * If identity mapping is enabled (via pass-through or the quirks
2520          * above), set up the static identity domain and context entries
2521          * for the devices that should be identity mapped.
2522          */
2523         if (iommu_identity_mapping) {
2524                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2525                 if (ret) {
2526                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2527                         goto error;
2528                 }
2529         }
2530         /*
2531          * For each rmrr
2532          *   for each dev attached to rmrr
2533          *   do
2534          *     locate drhd for dev, alloc domain for dev
2535          *     allocate free domain
2536          *     allocate page table entries for rmrr
2537          *     if context not allocated for bus
2538          *           allocate and init context
2539          *           set present in root table for this bus
2540          *     init context with domain, translation etc
2541          *    endfor
2542          * endfor
2543          */
2544         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2545         for_each_rmrr_units(rmrr) {
2546                 for (i = 0; i < rmrr->devices_cnt; i++) {
2547                         pdev = rmrr->devices[i];
2548                         /*
2549                          * some BIOSes list non-existent devices in the
2550                          * DMAR table.
2551                          */
2552                         if (!pdev)
2553                                 continue;
2554                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2555                         if (ret)
2556                                 printk(KERN_ERR
2557                                        "IOMMU: mapping reserved region failed\n");
2558                 }
2559         }
2560
2561         iommu_prepare_isa();
2562
2563         /*
2564          * for each drhd
2565          *   enable fault log
2566          *   global invalidate context cache
2567          *   global invalidate iotlb
2568          *   enable translation
2569          */
2570         for_each_drhd_unit(drhd) {
2571                 if (drhd->ignored) {
2572                         /*
2573                          * we always have to disable PMRs or DMA may fail on
2574                          * this device
2575                          */
2576                         if (force_on)
2577                                 iommu_disable_protect_mem_regions(drhd->iommu);
2578                         continue;
2579                 }
2580                 iommu = drhd->iommu;
2581
2582                 iommu_flush_write_buffer(iommu);
2583
2584                 ret = dmar_set_interrupt(iommu);
2585                 if (ret)
2586                         goto error;
2587
2588                 iommu_set_root_entry(iommu);
2589
2590                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2591                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2592
2593                 ret = iommu_enable_translation(iommu);
2594                 if (ret)
2595                         goto error;
2596
2597                 iommu_disable_protect_mem_regions(iommu);
2598         }
2599
2600         return 0;
2601 error:
2602         for_each_drhd_unit(drhd) {
2603                 if (drhd->ignored)
2604                         continue;
2605                 iommu = drhd->iommu;
2606                 free_iommu(iommu);
2607         }
2608         kfree(g_iommus);
2609         return ret;
2610 }
2611
2612 /* This takes a number of _MM_ pages, not VTD pages */
2613 static struct iova *intel_alloc_iova(struct device *dev,
2614                                      struct dmar_domain *domain,
2615                                      unsigned long nrpages, uint64_t dma_mask)
2616 {
2617         struct pci_dev *pdev = to_pci_dev(dev);
2618         struct iova *iova = NULL;
2619
2620         /* Restrict dma_mask to the width that the iommu can handle */
2621         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2622
2623         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2624                 /*
2625                  * First try to allocate an io virtual address in
2626                  * DMA_BIT_MASK(32) and if that fails then try allocating
2627                  * from higher range
2628                  */
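                /* Editor's note: this keeps even 64-bit capable devices below
                 * 4GiB when possible; dmar_forcedac (the intel_iommu=forcedac
                 * boot option) skips this first attempt.
                 */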
2629                 iova = alloc_iova(&domain->iovad, nrpages,
2630                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2631                 if (iova)
2632                         return iova;
2633         }
2634         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2635         if (unlikely(!iova)) {
2636                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2637                        nrpages, pci_name(pdev));
2638                 return NULL;
2639         }
2640
2641         return iova;
2642 }
2643
2644 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2645 {
2646         struct dmar_domain *domain;
2647         int ret;
2648
2649         domain = get_domain_for_dev(pdev,
2650                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2651         if (!domain) {
2652                 printk(KERN_ERR
2653                         "Allocating domain for %s failed\n", pci_name(pdev));
2654                 return NULL;
2655         }
2656
2657         /* make sure context mapping is ok */
2658         if (unlikely(!domain_context_mapped(pdev))) {
2659                 ret = domain_context_mapping(domain, pdev,
2660                                              CONTEXT_TT_MULTI_LEVEL);
2661                 if (ret) {
2662                         printk(KERN_ERR
2663                                 "Domain context map for %s failed\n",
2664                                 pci_name(pdev));
2665                         return NULL;
2666                 }
2667         }
2668
2669         return domain;
2670 }
2671
2672 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2673 {
2674         struct device_domain_info *info;
2675
2676         /* No lock here, assumes no domain exit in normal case */
2677         info = dev->dev.archdata.iommu;
2678         if (likely(info))
2679                 return info->domain;
2680
2681         return __get_valid_domain_for_dev(dev);
2682 }
2683
2684 static int iommu_dummy(struct pci_dev *pdev)
2685 {
2686         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2687 }
2688
2689 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2690 static int iommu_no_mapping(struct device *dev)
2691 {
2692         struct pci_dev *pdev;
2693         int found;
2694
2695         if (unlikely(dev->bus != &pci_bus_type))
2696                 return 1;
2697
2698         pdev = to_pci_dev(dev);
2699         if (iommu_dummy(pdev))
2700                 return 1;
2701
2702         if (!iommu_identity_mapping)
2703                 return 0;
2704
2705         found = identity_mapping(pdev);
2706         if (found) {
2707                 if (iommu_should_identity_map(pdev, 0))
2708                         return 1;
2709                 else {
2710                         /*
2711                          * A 32 bit DMA device is removed from si_domain and
2712                          * falls back to non-identity mapping.
2713                          */
2714                         domain_remove_one_dev_info(si_domain, pdev);
2715                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2716                                pci_name(pdev));
2717                         return 0;
2718                 }
2719         } else {
2720                 /*
2721                  * In case a 64 bit DMA device is detached from a VM, the device
2722                  * is put into si_domain for identity mapping.
2723                  */
2724                 if (iommu_should_identity_map(pdev, 0)) {
2725                         int ret;
2726                         ret = domain_add_dev_info(si_domain, pdev,
2727                                                   hw_pass_through ?
2728                                                   CONTEXT_TT_PASS_THROUGH :
2729                                                   CONTEXT_TT_MULTI_LEVEL);
2730                         if (!ret) {
2731                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2732                                        pci_name(pdev));
2733                                 return 1;
2734                         }
2735                 }
2736         }
2737
2738         return 0;
2739 }
2740
2741 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2742                                      size_t size, int dir, u64 dma_mask)
2743 {
2744         struct pci_dev *pdev = to_pci_dev(hwdev);
2745         struct dmar_domain *domain;
2746         phys_addr_t start_paddr;
2747         struct iova *iova;
2748         int prot = 0;
2749         int ret;
2750         struct intel_iommu *iommu;
2751         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2752
2753         BUG_ON(dir == DMA_NONE);
2754
2755         if (iommu_no_mapping(hwdev))
2756                 return paddr;
2757
2758         domain = get_valid_domain_for_dev(pdev);
2759         if (!domain)
2760                 return 0;
2761
2762         iommu = domain_get_iommu(domain);
2763         size = aligned_nrpages(paddr, size);
2764
2765         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2766         if (!iova)
2767                 goto error;
2768
2769         /*
2770          * Check if DMAR supports zero-length reads on write only
2771          * mappings.
2772          */
2773         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2774                         !cap_zlr(iommu->cap))
2775                 prot |= DMA_PTE_READ;
2776         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2777                 prot |= DMA_PTE_WRITE;
2778         /*
2779          * paddr..(paddr + size) might cover only part of a page; we should
2780          * map the whole page.  Note: if two parts of one page are mapped
2781          * separately, we might have two guest addresses mapping to the same
2782          * host paddr, but this is not a big problem.
2783          */
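        /* Worked example (editor's note, assuming 4KiB pages): mapping paddr
         * 0x12340100 with size 0x80 still maps the full page at pfn 0x12340,
         * and the returned dma address keeps the 0x100 offset within it.
         */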
2784         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2785                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2786         if (ret)
2787                 goto error;
2788
2789         /* it's a non-present to present mapping. Only flush if caching mode */
2790         if (cap_caching_mode(iommu->cap))
2791                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2792         else
2793                 iommu_flush_write_buffer(iommu);
2794
2795         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2796         start_paddr += paddr & ~PAGE_MASK;
2797         return start_paddr;
2798
2799 error:
2800         if (iova)
2801                 __free_iova(&domain->iovad, iova);
2802         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2803                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2804         return 0;
2805 }
2806
2807 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2808                                  unsigned long offset, size_t size,
2809                                  enum dma_data_direction dir,
2810                                  struct dma_attrs *attrs)
2811 {
2812         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2813                                   dir, to_pci_dev(dev)->dma_mask);
2814 }
2815
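/* Editor's summary (descriptive only): the deferred ("lazy") unmap path.
 * add_unmap() queues freed IOVAs on per-iommu lists instead of flushing the
 * IOTLB right away; flush_unmaps() later drains the lists and batches the
 * flushes, triggered either by the 10ms unmap_timer or once HIGH_WATER_MARK
 * entries have accumulated. intel_iommu=strict bypasses this batching.
 */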
2816 static void flush_unmaps(void)
2817 {
2818         int i, j;
2819
2820         timer_on = 0;
2821
2822         /* just flush them all */
2823         for (i = 0; i < g_num_of_iommus; i++) {
2824                 struct intel_iommu *iommu = g_iommus[i];
2825                 if (!iommu)
2826                         continue;
2827
2828                 if (!deferred_flush[i].next)
2829                         continue;
2830
2831                 /* In caching mode, global flushes make emulation expensive */
2832                 if (!cap_caching_mode(iommu->cap))
2833                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2834                                          DMA_TLB_GLOBAL_FLUSH);
2835                 for (j = 0; j < deferred_flush[i].next; j++) {
2836                         unsigned long mask;
2837                         struct iova *iova = deferred_flush[i].iova[j];
2838                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2839
2840                         /* On real hardware multiple invalidations are expensive */
2841                         if (cap_caching_mode(iommu->cap))
2842                                 iommu_flush_iotlb_psi(iommu, domain->id,
2843                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2844                         else {
2845                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2846                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2847                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2848                         }
2849                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2850                 }
2851                 deferred_flush[i].next = 0;
2852         }
2853
2854         list_size = 0;
2855 }
2856
2857 static void flush_unmaps_timeout(unsigned long data)
2858 {
2859         unsigned long flags;
2860
2861         spin_lock_irqsave(&async_umap_flush_lock, flags);
2862         flush_unmaps();
2863         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2864 }
2865
2866 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2867 {
2868         unsigned long flags;
2869         int next, iommu_id;
2870         struct intel_iommu *iommu;
2871
2872         spin_lock_irqsave(&async_umap_flush_lock, flags);
2873         if (list_size == HIGH_WATER_MARK)
2874                 flush_unmaps();
2875
2876         iommu = domain_get_iommu(dom);
2877         iommu_id = iommu->seq_id;
2878
2879         next = deferred_flush[iommu_id].next;
2880         deferred_flush[iommu_id].domain[next] = dom;
2881         deferred_flush[iommu_id].iova[next] = iova;
2882         deferred_flush[iommu_id].next++;
2883
2884         if (!timer_on) {
2885                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2886                 timer_on = 1;
2887         }
2888         list_size++;
2889         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2890 }
2891
2892 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2893                              size_t size, enum dma_data_direction dir,
2894                              struct dma_attrs *attrs)
2895 {
2896         struct pci_dev *pdev = to_pci_dev(dev);
2897         struct dmar_domain *domain;
2898         unsigned long start_pfn, last_pfn;
2899         struct iova *iova;
2900         struct intel_iommu *iommu;
2901
2902         if (iommu_no_mapping(dev))
2903                 return;
2904
2905         domain = find_domain(pdev);
2906         BUG_ON(!domain);
2907
2908         iommu = domain_get_iommu(domain);
2909
2910         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2911         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2912                       (unsigned long long)dev_addr))
2913                 return;
2914
2915         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2916         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2917
2918         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2919                  pci_name(pdev), start_pfn, last_pfn);
2920
2921         /*  clear the whole page */
2922         dma_pte_clear_range(domain, start_pfn, last_pfn);
2923
2924         /* free page tables */
2925         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2926
2927         if (intel_iommu_strict) {
2928                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2929                                       last_pfn - start_pfn + 1, 0);
2930                 /* free iova */
2931                 __free_iova(&domain->iovad, iova);
2932         } else {
2933                 add_unmap(domain, iova);
2934                 /*
2935                  * queue up the release of the unmap to save the roughly 1/6th
2936                  * of the cpu time used up by the iotlb flush operation...
2937                  */
2938         }
2939 }
2940
2941 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2942                                   dma_addr_t *dma_handle, gfp_t flags)
2943 {
2944         void *vaddr;
2945         int order;
2946
2947         size = PAGE_ALIGN(size);
2948         order = get_order(size);
2949
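        /* Editor's note: when the buffer will be remapped by the IOMMU, any
         * page is reachable, so the GFP_DMA/GFP_DMA32 zone restrictions are
         * dropped; identity-mapped devices with a narrow coherent mask still
         * need a suitably low allocation.
         */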
2950         if (!iommu_no_mapping(hwdev))
2951                 flags &= ~(GFP_DMA | GFP_DMA32);
2952         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2953                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2954                         flags |= GFP_DMA;
2955                 else
2956                         flags |= GFP_DMA32;
2957         }
2958
2959         vaddr = (void *)__get_free_pages(flags, order);
2960         if (!vaddr)
2961                 return NULL;
2962         memset(vaddr, 0, size);
2963
2964         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2965                                          DMA_BIDIRECTIONAL,
2966                                          hwdev->coherent_dma_mask);
2967         if (*dma_handle)
2968                 return vaddr;
2969         free_pages((unsigned long)vaddr, order);
2970         return NULL;
2971 }
2972
2973 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2974                                 dma_addr_t dma_handle)
2975 {
2976         int order;
2977
2978         size = PAGE_ALIGN(size);
2979         order = get_order(size);
2980
2981         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2982         free_pages((unsigned long)vaddr, order);
2983 }
2984
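/*
 * Scatterlist counterpart of intel_unmap_page(): the whole list was
 * mapped into one contiguous IOVA range, so clear and free that range,
 * then flush at once (strict mode) or defer the flush via add_unmap().
 */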
2985 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2986                            int nelems, enum dma_data_direction dir,
2987                            struct dma_attrs *attrs)
2988 {
2989         struct pci_dev *pdev = to_pci_dev(hwdev);
2990         struct dmar_domain *domain;
2991         unsigned long start_pfn, last_pfn;
2992         struct iova *iova;
2993         struct intel_iommu *iommu;
2994
2995         if (iommu_no_mapping(hwdev))
2996                 return;
2997
2998         domain = find_domain(pdev);
2999         BUG_ON(!domain);
3000
3001         iommu = domain_get_iommu(domain);
3002
3003         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3004         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3005                       (unsigned long long)sglist[0].dma_address))
3006                 return;
3007
3008         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3009         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3010
3011         /* clear the PTEs for the whole range */
3012         dma_pte_clear_range(domain, start_pfn, last_pfn);
3013
3014         /* free page tables */
3015         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3016
3017         if (intel_iommu_strict) {
3018                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3019                                       last_pfn - start_pfn + 1, 0);
3020                 /* free iova */
3021                 __free_iova(&domain->iovad, iova);
3022         } else {
3023                 add_unmap(domain, iova);
3024                 /*
3025                  * Queue up the release of the unmap to save the ~1/6th of the
3026                  * CPU time otherwise spent on the IOTLB flush operation.
3027                  */
3028         }
3029 }
3030
3031 static int intel_nontranslate_map_sg(struct device *hwdev,
3032         struct scatterlist *sglist, int nelems, int dir)
3033 {
3034         int i;
3035         struct scatterlist *sg;
3036
3037         for_each_sg(sglist, sg, nelems, i) {
3038                 BUG_ON(!sg_page(sg));
3039                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3040                 sg->dma_length = sg->length;
3041         }
3042         return nelems;
3043 }
3044
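/*
 * Map a scatterlist: allocate a single IOVA range covering all entries,
 * install the PTEs with domain_sg_mapping() and flush the IOTLB (caching
 * mode) or the write buffer.  Devices that bypass the IOMMU simply get
 * their physical addresses via intel_nontranslate_map_sg().
 */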
3045 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3046                         enum dma_data_direction dir, struct dma_attrs *attrs)
3047 {
3048         int i;
3049         struct pci_dev *pdev = to_pci_dev(hwdev);
3050         struct dmar_domain *domain;
3051         size_t size = 0;
3052         int prot = 0;
3053         struct iova *iova = NULL;
3054         int ret;
3055         struct scatterlist *sg;
3056         unsigned long start_vpfn;
3057         struct intel_iommu *iommu;
3058
3059         BUG_ON(dir == DMA_NONE);
3060         if (iommu_no_mapping(hwdev))
3061                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3062
3063         domain = get_valid_domain_for_dev(pdev);
3064         if (!domain)
3065                 return 0;
3066
3067         iommu = domain_get_iommu(domain);
3068
3069         for_each_sg(sglist, sg, nelems, i)
3070                 size += aligned_nrpages(sg->offset, sg->length);
3071
3072         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3073                                 pdev->dma_mask);
3074         if (!iova) {
3075                 sglist->dma_length = 0;
3076                 return 0;
3077         }
3078
3079         /*
3080          * Check if DMAR supports zero-length reads on write-only
3081          * mappings.
3082          */
3083         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3084                         !cap_zlr(iommu->cap))
3085                 prot |= DMA_PTE_READ;
3086         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3087                 prot |= DMA_PTE_WRITE;
3088
3089         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3090
3091         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3092         if (unlikely(ret)) {
3093                 /* clear the PTEs for the range */
3094                 dma_pte_clear_range(domain, start_vpfn,
3095                                     start_vpfn + size - 1);
3096                 /* free page tables */
3097                 dma_pte_free_pagetable(domain, start_vpfn,
3098                                        start_vpfn + size - 1);
3099                 /* free iova */
3100                 __free_iova(&domain->iovad, iova);
3101                 return 0;
3102         }
3103
3104         /* It's a non-present to present mapping; only flush if caching mode is set */
3105         if (cap_caching_mode(iommu->cap))
3106                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3107         else
3108                 iommu_flush_write_buffer(iommu);
3109
3110         return nelems;
3111 }
3112
3113 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3114 {
3115         return !dma_addr;
3116 }
3117
3118 struct dma_map_ops intel_dma_ops = {
3119         .alloc_coherent = intel_alloc_coherent,
3120         .free_coherent = intel_free_coherent,
3121         .map_sg = intel_map_sg,
3122         .unmap_sg = intel_unmap_sg,
3123         .map_page = intel_map_page,
3124         .unmap_page = intel_unmap_page,
3125         .mapping_error = intel_mapping_error,
3126 };
3127
3128 static inline int iommu_domain_cache_init(void)
3129 {
3130         int ret = 0;
3131
3132         iommu_domain_cache = kmem_cache_create("iommu_domain",
3133                                          sizeof(struct dmar_domain),
3134                                          0,
3135                                          SLAB_HWCACHE_ALIGN,
3136                                          NULL);
3138         if (!iommu_domain_cache) {
3139                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3140                 ret = -ENOMEM;
3141         }
3142
3143         return ret;
3144 }
3145
3146 static inline int iommu_devinfo_cache_init(void)
3147 {
3148         int ret = 0;
3149
3150         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3151                                          sizeof(struct device_domain_info),
3152                                          0,
3153                                          SLAB_HWCACHE_ALIGN,
3154                                          NULL);
3155         if (!iommu_devinfo_cache) {
3156                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3157                 ret = -ENOMEM;
3158         }
3159
3160         return ret;
3161 }
3162
3163 static inline int iommu_iova_cache_init(void)
3164 {
3165         int ret = 0;
3166
3167         iommu_iova_cache = kmem_cache_create("iommu_iova",
3168                                          sizeof(struct iova),
3169                                          0,
3170                                          SLAB_HWCACHE_ALIGN,
3171                                          NULL);
3172         if (!iommu_iova_cache) {
3173                 printk(KERN_ERR "Couldn't create iova cache\n");
3174                 ret = -ENOMEM;
3175         }
3176
3177         return ret;
3178 }
3179
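/*
 * Create the kmem caches for iova, dmar_domain and device_domain_info
 * structures; undo any partial setup on failure.
 */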
3180 static int __init iommu_init_mempool(void)
3181 {
3182         int ret;
3183         ret = iommu_iova_cache_init();
3184         if (ret)
3185                 return ret;
3186
3187         ret = iommu_domain_cache_init();
3188         if (ret)
3189                 goto domain_error;
3190
3191         ret = iommu_devinfo_cache_init();
3192         if (!ret)
3193                 return ret;
3194
3195         kmem_cache_destroy(iommu_domain_cache);
3196 domain_error:
3197         kmem_cache_destroy(iommu_iova_cache);
3198
3199         return -ENOMEM;
3200 }
3201
3202 static void __init iommu_exit_mempool(void)
3203 {
3204         kmem_cache_destroy(iommu_devinfo_cache);
3205         kmem_cache_destroy(iommu_domain_cache);
3206         kmem_cache_destroy(iommu_iova_cache);
3207
3208 }
3209
3210 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3211 {
3212         struct dmar_drhd_unit *drhd;
3213         u32 vtbar;
3214         int rc;
3215
3216         /* We know that this device on this chipset has its own IOMMU.
3217          * If we find it under a different IOMMU, then the BIOS is lying
3218          * to us. Hope that the IOMMU for this device is actually
3219          * disabled, and it needs no translation...
3220          */
3221         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3222         if (rc) {
3223                 /* "can't" happen */
3224                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3225                 return;
3226         }
3227         vtbar &= 0xffff0000;
3228
3229         /* we know that this iommu should be at offset 0xa000 from vtbar */
3230         drhd = dmar_find_matched_drhd_unit(pdev);
3231         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3232                             TAINT_FIRMWARE_WORKAROUND,
3233                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3234                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3235 }
3236 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3237
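/*
 * Ignore DMAR units that cover no PCI devices at all.  Units covering
 * only graphics devices are either flagged (intel_iommu_gfx_mapped) or,
 * when dmar_map_gfx is disabled, bypassed completely.
 */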
3238 static void __init init_no_remapping_devices(void)
3239 {
3240         struct dmar_drhd_unit *drhd;
3241
3242         for_each_drhd_unit(drhd) {
3243                 if (!drhd->include_all) {
3244                         int i;
3245                         for (i = 0; i < drhd->devices_cnt; i++)
3246                                 if (drhd->devices[i] != NULL)
3247                                         break;
3248                         /* ignore DMAR unit if no pci devices exist */
3249                         if (i == drhd->devices_cnt)
3250                                 drhd->ignored = 1;
3251                 }
3252         }
3253
3254         for_each_drhd_unit(drhd) {
3255                 int i;
3256                 if (drhd->ignored || drhd->include_all)
3257                         continue;
3258
3259                 for (i = 0; i < drhd->devices_cnt; i++)
3260                         if (drhd->devices[i] &&
3261                             !IS_GFX_DEVICE(drhd->devices[i]))
3262                                 break;
3263
3264                 if (i < drhd->devices_cnt)
3265                         continue;
3266
3267                 /* This IOMMU has *only* gfx devices. Either bypass it or
3268                    set the gfx_mapped flag, as appropriate */
3269                 if (dmar_map_gfx) {
3270                         intel_iommu_gfx_mapped = 1;
3271                 } else {
3272                         drhd->ignored = 1;
3273                         for (i = 0; i < drhd->devices_cnt; i++) {
3274                                 if (!drhd->devices[i])
3275                                         continue;
3276                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3277                         }
3278                 }
3279         }
3280 }
3281
3282 #ifdef CONFIG_SUSPEND
3283 static int init_iommu_hw(void)
3284 {
3285         struct dmar_drhd_unit *drhd;
3286         struct intel_iommu *iommu = NULL;
3287
3288         for_each_active_iommu(iommu, drhd)
3289                 if (iommu->qi)
3290                         dmar_reenable_qi(iommu);
3291
3292         for_each_iommu(iommu, drhd) {
3293                 if (drhd->ignored) {
3294                         /*
3295                          * we always have to disable PMRs or DMA may fail on
3296                          * this device
3297                          */
3298                         if (force_on)
3299                                 iommu_disable_protect_mem_regions(iommu);
3300                         continue;
3301                 }
3302
3303                 iommu_flush_write_buffer(iommu);
3304
3305                 iommu_set_root_entry(iommu);
3306
3307                 iommu->flush.flush_context(iommu, 0, 0, 0,
3308                                            DMA_CCMD_GLOBAL_INVL);
3309                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3310                                          DMA_TLB_GLOBAL_FLUSH);
3311                 if (iommu_enable_translation(iommu))
3312                         return 1;
3313                 iommu_disable_protect_mem_regions(iommu);
3314         }
3315
3316         return 0;
3317 }
3318
3319 static void iommu_flush_all(void)
3320 {
3321         struct dmar_drhd_unit *drhd;
3322         struct intel_iommu *iommu;
3323
3324         for_each_active_iommu(iommu, drhd) {
3325                 iommu->flush.flush_context(iommu, 0, 0, 0,
3326                                            DMA_CCMD_GLOBAL_INVL);
3327                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3328                                          DMA_TLB_GLOBAL_FLUSH);
3329         }
3330 }
3331
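/*
 * Flush all IOMMUs, disable translation and save the fault-event
 * registers of each active IOMMU so they can be restored on resume.
 */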
3332 static int iommu_suspend(void)
3333 {
3334         struct dmar_drhd_unit *drhd;
3335         struct intel_iommu *iommu = NULL;
3336         unsigned long flag;
3337
3338         for_each_active_iommu(iommu, drhd) {
3339                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3340                                                  GFP_ATOMIC);
3341                 if (!iommu->iommu_state)
3342                         goto nomem;
3343         }
3344
3345         iommu_flush_all();
3346
3347         for_each_active_iommu(iommu, drhd) {
3348                 iommu_disable_translation(iommu);
3349
3350                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3351
3352                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3353                         readl(iommu->reg + DMAR_FECTL_REG);
3354                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3355                         readl(iommu->reg + DMAR_FEDATA_REG);
3356                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3357                         readl(iommu->reg + DMAR_FEADDR_REG);
3358                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3359                         readl(iommu->reg + DMAR_FEUADDR_REG);
3360
3361                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3362         }
3363         return 0;
3364
3365 nomem:
3366         for_each_active_iommu(iommu, drhd)
3367                 kfree(iommu->iommu_state);
3368
3369         return -ENOMEM;
3370 }
3371
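/*
 * Re-initialize the IOMMU hardware, restore the saved fault-event
 * registers and free the state allocated at suspend time.
 */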
3372 static void iommu_resume(void)
3373 {
3374         struct dmar_drhd_unit *drhd;
3375         struct intel_iommu *iommu = NULL;
3376         unsigned long flag;
3377
3378         if (init_iommu_hw()) {
3379                 if (force_on)
3380                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3381                 else
3382                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3383                 return;
3384         }
3385
3386         for_each_active_iommu(iommu, drhd) {
3387
3388                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3389
3390                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3391                         iommu->reg + DMAR_FECTL_REG);
3392                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3393                         iommu->reg + DMAR_FEDATA_REG);
3394                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3395                         iommu->reg + DMAR_FEADDR_REG);
3396                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3397                         iommu->reg + DMAR_FEUADDR_REG);
3398
3399                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3400         }
3401
3402         for_each_active_iommu(iommu, drhd)
3403                 kfree(iommu->iommu_state);
3404 }
3405
3406 static struct syscore_ops iommu_syscore_ops = {
3407         .resume         = iommu_resume,
3408         .suspend        = iommu_suspend,
3409 };
3410
3411 static void __init init_iommu_pm_ops(void)
3412 {
3413         register_syscore_ops(&iommu_syscore_ops);
3414 }
3415
3416 #else
3417 static inline void init_iommu_pm_ops(void) {}
3418 #endif  /* CONFIG_SUSPEND */
3419
3420 LIST_HEAD(dmar_rmrr_units);
3421
3422 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3423 {
3424         list_add(&rmrr->list, &dmar_rmrr_units);
3425 }
3426
3427
3428 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3429 {
3430         struct acpi_dmar_reserved_memory *rmrr;
3431         struct dmar_rmrr_unit *rmrru;
3432
3433         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3434         if (!rmrru)
3435                 return -ENOMEM;
3436
3437         rmrru->hdr = header;
3438         rmrr = (struct acpi_dmar_reserved_memory *)header;
3439         rmrru->base_address = rmrr->base_address;
3440         rmrru->end_address = rmrr->end_address;
3441
3442         dmar_register_rmrr_unit(rmrru);
3443         return 0;
3444 }
3445
3446 static int __init
3447 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3448 {
3449         struct acpi_dmar_reserved_memory *rmrr;
3450         int ret;
3451
3452         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3453         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3454                 ((void *)rmrr) + rmrr->header.length,
3455                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3456
3457         if (ret || (rmrru->devices_cnt == 0)) {
3458                 list_del(&rmrru->list);
3459                 kfree(rmrru);
3460         }
3461         return ret;
3462 }
3463
3464 static LIST_HEAD(dmar_atsr_units);
3465
3466 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3467 {
3468         struct acpi_dmar_atsr *atsr;
3469         struct dmar_atsr_unit *atsru;
3470
3471         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3472         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3473         if (!atsru)
3474                 return -ENOMEM;
3475
3476         atsru->hdr = hdr;
3477         atsru->include_all = atsr->flags & 0x1;
3478
3479         list_add(&atsru->list, &dmar_atsr_units);
3480
3481         return 0;
3482 }
3483
3484 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3485 {
3486         int rc;
3487         struct acpi_dmar_atsr *atsr;
3488
3489         if (atsru->include_all)
3490                 return 0;
3491
3492         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3493         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3494                                 (void *)atsr + atsr->header.length,
3495                                 &atsru->devices_cnt, &atsru->devices,
3496                                 atsr->segment);
3497         if (rc || !atsru->devices_cnt) {
3498                 list_del(&atsru->list);
3499                 kfree(atsru);
3500         }
3501
3502         return rc;
3503 }
3504
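/*
 * Return 1 if the device (or its physical function) sits below a root
 * port listed in an ATSR for its PCI segment, or if that ATSR has the
 * include_all flag set; 0 otherwise.
 */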
3505 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3506 {
3507         int i;
3508         struct pci_bus *bus;
3509         struct acpi_dmar_atsr *atsr;
3510         struct dmar_atsr_unit *atsru;
3511
3512         dev = pci_physfn(dev);
3513
3514         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3515                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3516                 if (atsr->segment == pci_domain_nr(dev->bus))
3517                         goto found;
3518         }
3519
3520         return 0;
3521
3522 found:
3523         for (bus = dev->bus; bus; bus = bus->parent) {
3524                 struct pci_dev *bridge = bus->self;
3525
3526                 if (!bridge || !pci_is_pcie(bridge) ||
3527                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3528                         return 0;
3529
3530                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3531                         for (i = 0; i < atsru->devices_cnt; i++)
3532                                 if (atsru->devices[i] == bridge)
3533                                         return 1;
3534                         break;
3535                 }
3536         }
3537
3538         if (atsru->include_all)
3539                 return 1;
3540
3541         return 0;
3542 }
3543
3544 int __init dmar_parse_rmrr_atsr_dev(void)
3545 {
3546         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3547         struct dmar_atsr_unit *atsr, *atsr_n;
3548         int ret = 0;
3549
3550         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3551                 ret = rmrr_parse_dev(rmrr);
3552                 if (ret)
3553                         return ret;
3554         }
3555
3556         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3557                 ret = atsr_parse_dev(atsr);
3558                 if (ret)
3559                         return ret;
3560         }
3561
3562         return ret;
3563 }
3564
3565 /*
3566  * Here we only respond to the driver-unbind action for a device.
3567  *
3568  * A newly added device is not attached to its DMAR domain here yet; that
3569  * happens when the device is first mapped to an iova.
3570  */
3571 static int device_notifier(struct notifier_block *nb,
3572                                   unsigned long action, void *data)
3573 {
3574         struct device *dev = data;
3575         struct pci_dev *pdev = to_pci_dev(dev);
3576         struct dmar_domain *domain;
3577
3578         if (iommu_no_mapping(dev))
3579                 return 0;
3580
3581         domain = find_domain(pdev);
3582         if (!domain)
3583                 return 0;
3584
3585         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3586                 domain_remove_one_dev_info(domain, pdev);
3587
3588                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3589                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3590                     list_empty(&domain->devices))
3591                         domain_exit(domain);
3592         }
3593
3594         return 0;
3595 }
3596
3597 static struct notifier_block device_nb = {
3598         .notifier_call = device_notifier,
3599 };
3600
3601 int __init intel_iommu_init(void)
3602 {
3603         int ret = 0;
3604
3605         /* VT-d is required for a TXT/tboot launch, so enforce that */
3606         force_on = tboot_force_iommu();
3607
3608         if (dmar_table_init()) {
3609                 if (force_on)
3610                         panic("tboot: Failed to initialize DMAR table\n");
3611                 return  -ENODEV;
3612         }
3613
3614         if (dmar_dev_scope_init() < 0) {
3615                 if (force_on)
3616                         panic("tboot: Failed to initialize DMAR device scope\n");
3617                 return  -ENODEV;
3618         }
3619
3620         if (no_iommu || dmar_disabled)
3621                 return -ENODEV;
3622
3623         if (iommu_init_mempool()) {
3624                 if (force_on)
3625                         panic("tboot: Failed to initialize iommu memory\n");
3626                 return  -ENODEV;
3627         }
3628
3629         if (list_empty(&dmar_rmrr_units))
3630                 printk(KERN_INFO "DMAR: No RMRR found\n");
3631
3632         if (list_empty(&dmar_atsr_units))
3633                 printk(KERN_INFO "DMAR: No ATSR found\n");
3634
3635         if (dmar_init_reserved_ranges()) {
3636                 if (force_on)
3637                         panic("tboot: Failed to reserve iommu ranges\n");
3638                 return  -ENODEV;
3639         }
3640
3641         init_no_remapping_devices();
3642
3643         ret = init_dmars();
3644         if (ret) {
3645                 if (force_on)
3646                         panic("tboot: Failed to initialize DMARs\n");
3647                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3648                 put_iova_domain(&reserved_iova_list);
3649                 iommu_exit_mempool();
3650                 return ret;
3651         }
3652         printk(KERN_INFO
3653         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3654
3655         init_timer(&unmap_timer);
3656 #ifdef CONFIG_SWIOTLB
3657         swiotlb = 0;
3658 #endif
3659         dma_ops = &intel_dma_ops;
3660
3661         init_iommu_pm_ops();
3662
3663         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3664
3665         bus_register_notifier(&pci_bus_type, &device_nb);
3666
3667         intel_iommu_enabled = 1;
3668
3669         return 0;
3670 }
3671
3672 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3673                                            struct pci_dev *pdev)
3674 {
3675         struct pci_dev *tmp, *parent;
3676
3677         if (!iommu || !pdev)
3678                 return;
3679
3680         /* dependent device detach */
3681         tmp = pci_find_upstream_pcie_bridge(pdev);
3682         /* Secondary interface's bus number and devfn 0 */
3683         if (tmp) {
3684                 parent = pdev->bus->self;
3685                 while (parent != tmp) {
3686                         iommu_detach_dev(iommu, parent->bus->number,
3687                                          parent->devfn);
3688                         parent = parent->bus->self;
3689                 }
3690                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3691                         iommu_detach_dev(iommu,
3692                                 tmp->subordinate->number, 0);
3693                 else /* this is a legacy PCI bridge */
3694                         iommu_detach_dev(iommu, tmp->bus->number,
3695                                          tmp->devfn);
3696         }
3697 }
3698
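/*
 * Detach one device (and the bridges it sits behind) from a domain and
 * free its device_domain_info.  If no other device on the same IOMMU is
 * left in the domain, drop that IOMMU from the domain's bitmap and, for
 * non-VM/non-SI domains, release the domain id on that IOMMU.
 */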
3699 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3700                                           struct pci_dev *pdev)
3701 {
3702         struct device_domain_info *info;
3703         struct intel_iommu *iommu;
3704         unsigned long flags;
3705         int found = 0;
3706         struct list_head *entry, *tmp;
3707
3708         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3709                                 pdev->devfn);
3710         if (!iommu)
3711                 return;
3712
3713         spin_lock_irqsave(&device_domain_lock, flags);
3714         list_for_each_safe(entry, tmp, &domain->devices) {
3715                 info = list_entry(entry, struct device_domain_info, link);
3716                 if (info->segment == pci_domain_nr(pdev->bus) &&
3717                     info->bus == pdev->bus->number &&
3718                     info->devfn == pdev->devfn) {
3719                         list_del(&info->link);
3720                         list_del(&info->global);
3721                         if (info->dev)
3722                                 info->dev->dev.archdata.iommu = NULL;
3723                         spin_unlock_irqrestore(&device_domain_lock, flags);
3724
3725                         iommu_disable_dev_iotlb(info);
3726                         iommu_detach_dev(iommu, info->bus, info->devfn);
3727                         iommu_detach_dependent_devices(iommu, pdev);
3728                         free_devinfo_mem(info);
3729
3730                         spin_lock_irqsave(&device_domain_lock, flags);
3731
3732                         if (found)
3733                                 break;
3734                         else
3735                                 continue;
3736                 }
3737
3738                 /* If no other device under the same iommu is owned by
3739                  * this domain, clear this iommu from iommu_bmp and
3740                  * update the iommu count and coherency.
3741                  */
3742                 if (iommu == device_to_iommu(info->segment, info->bus,
3743                                             info->devfn))
3744                         found = 1;
3745         }
3746
3747         spin_unlock_irqrestore(&device_domain_lock, flags);
3748
3749         if (found == 0) {
3750                 unsigned long tmp_flags;
3751                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3752                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3753                 domain->iommu_count--;
3754                 domain_update_iommu_cap(domain);
3755                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3756
3757                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3758                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3759                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3760                         clear_bit(domain->id, iommu->domain_ids);
3761                         iommu->domains[domain->id] = NULL;
3762                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3763                 }
3764         }
3765 }
3766
3767 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3768 {
3769         struct device_domain_info *info;
3770         struct intel_iommu *iommu;
3771         unsigned long flags1, flags2;
3772
3773         spin_lock_irqsave(&device_domain_lock, flags1);
3774         while (!list_empty(&domain->devices)) {
3775                 info = list_entry(domain->devices.next,
3776                         struct device_domain_info, link);
3777                 list_del(&info->link);
3778                 list_del(&info->global);
3779                 if (info->dev)
3780                         info->dev->dev.archdata.iommu = NULL;
3781
3782                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3783
3784                 iommu_disable_dev_iotlb(info);
3785                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3786                 iommu_detach_dev(iommu, info->bus, info->devfn);
3787                 iommu_detach_dependent_devices(iommu, info->dev);
3788
3789                 /* clear this iommu in iommu_bmp, update iommu count
3790                  * and capabilities
3791                  */
3792                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3793                 if (test_and_clear_bit(iommu->seq_id,
3794                                        &domain->iommu_bmp)) {
3795                         domain->iommu_count--;
3796                         domain_update_iommu_cap(domain);
3797                 }
3798                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3799
3800                 free_devinfo_mem(info);
3801                 spin_lock_irqsave(&device_domain_lock, flags1);
3802         }
3803         spin_unlock_irqrestore(&device_domain_lock, flags1);
3804 }
3805
3806 /* Domain id for virtual machine domains; it is never programmed into a context entry */
3807 static unsigned long vm_domid;
3808
3809 static struct dmar_domain *iommu_alloc_vm_domain(void)
3810 {
3811         struct dmar_domain *domain;
3812
3813         domain = alloc_domain_mem();
3814         if (!domain)
3815                 return NULL;
3816
3817         domain->id = vm_domid++;
3818         domain->nid = -1;
3819         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3820         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3821
3822         return domain;
3823 }
3824
3825 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3826 {
3827         int adjust_width;
3828
3829         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3830         spin_lock_init(&domain->iommu_lock);
3831
3832         domain_reserve_special_ranges(domain);
3833
3834         /* calculate AGAW */
3835         domain->gaw = guest_width;
3836         adjust_width = guestwidth_to_adjustwidth(guest_width);
3837         domain->agaw = width_to_agaw(adjust_width);
3838
3839         INIT_LIST_HEAD(&domain->devices);
3840
3841         domain->iommu_count = 0;
3842         domain->iommu_coherency = 0;
3843         domain->iommu_snooping = 0;
3844         domain->iommu_superpage = 0;
3845         domain->max_addr = 0;
3846         domain->nid = -1;
3847
3848         /* always allocate the top pgd */
3849         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3850         if (!domain->pgd)
3851                 return -ENOMEM;
3852         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3853         return 0;
3854 }
3855
3856 static void iommu_free_vm_domain(struct dmar_domain *domain)
3857 {
3858         unsigned long flags;
3859         struct dmar_drhd_unit *drhd;
3860         struct intel_iommu *iommu;
3861         unsigned long i;
3862         unsigned long ndomains;
3863
3864         for_each_drhd_unit(drhd) {
3865                 if (drhd->ignored)
3866                         continue;
3867                 iommu = drhd->iommu;
3868
3869                 ndomains = cap_ndoms(iommu->cap);
3870                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3871                         if (iommu->domains[i] == domain) {
3872                                 spin_lock_irqsave(&iommu->lock, flags);
3873                                 clear_bit(i, iommu->domain_ids);
3874                                 iommu->domains[i] = NULL;
3875                                 spin_unlock_irqrestore(&iommu->lock, flags);
3876                                 break;
3877                         }
3878                 }
3879         }
3880 }
3881
3882 static void vm_domain_exit(struct dmar_domain *domain)
3883 {
3884         /* Domain 0 is reserved, so don't process it */
3885         if (!domain)
3886                 return;
3887
3888         vm_domain_remove_all_dev_info(domain);
3889         /* destroy iovas */
3890         put_iova_domain(&domain->iovad);
3891
3892         /* clear ptes */
3893         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3894
3895         /* free page tables */
3896         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3897
3898         iommu_free_vm_domain(domain);
3899         free_domain_mem(domain);
3900 }
3901
3902 static int intel_iommu_domain_init(struct iommu_domain *domain)
3903 {
3904         struct dmar_domain *dmar_domain;
3905
3906         dmar_domain = iommu_alloc_vm_domain();
3907         if (!dmar_domain) {
3908                 printk(KERN_ERR
3909                         "intel_iommu_domain_init: failed to allocate dmar_domain\n");
3910                 return -ENOMEM;
3911         }
3912         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3913                 printk(KERN_ERR
3914                         "intel_iommu_domain_init: md_domain_init() failed\n");
3915                 vm_domain_exit(dmar_domain);
3916                 return -ENOMEM;
3917         }
3918         domain_update_iommu_cap(dmar_domain);
3919         domain->priv = dmar_domain;
3920
3921         return 0;
3922 }
3923
3924 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3925 {
3926         struct dmar_domain *dmar_domain = domain->priv;
3927
3928         domain->priv = NULL;
3929         vm_domain_exit(dmar_domain);
3930 }
3931
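/*
 * IOMMU-API attach: detach the device from any previous domain, check
 * that the IOMMU address width covers the domain's max_addr, strip
 * surplus page-table levels if the domain was built with a wider agaw,
 * then add the device with multi-level context translation.
 */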
3932 static int intel_iommu_attach_device(struct iommu_domain *domain,
3933                                      struct device *dev)
3934 {
3935         struct dmar_domain *dmar_domain = domain->priv;
3936         struct pci_dev *pdev = to_pci_dev(dev);
3937         struct intel_iommu *iommu;
3938         int addr_width;
3939
3940         /* normally pdev is not mapped */
3941         if (unlikely(domain_context_mapped(pdev))) {
3942                 struct dmar_domain *old_domain;
3943
3944                 old_domain = find_domain(pdev);
3945                 if (old_domain) {
3946                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3947                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3948                                 domain_remove_one_dev_info(old_domain, pdev);
3949                         else
3950                                 domain_remove_dev_info(old_domain);
3951                 }
3952         }
3953
3954         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3955                                 pdev->devfn);
3956         if (!iommu)
3957                 return -ENODEV;
3958
3959         /* check if this iommu agaw is sufficient for max mapped address */
3960         addr_width = agaw_to_width(iommu->agaw);
3961         if (addr_width > cap_mgaw(iommu->cap))
3962                 addr_width = cap_mgaw(iommu->cap);
3963
3964         if (dmar_domain->max_addr > (1LL << addr_width)) {
3965                 printk(KERN_ERR "%s: iommu width (%d) is not "
3966                        "sufficient for the mapped address (%llx)\n",
3967                        __func__, addr_width, dmar_domain->max_addr);
3968                 return -EFAULT;
3969         }
3970         dmar_domain->gaw = addr_width;
3971
3972         /*
3973          * Knock out extra levels of page tables if necessary
3974          */
3975         while (iommu->agaw < dmar_domain->agaw) {
3976                 struct dma_pte *pte;
3977
3978                 pte = dmar_domain->pgd;
3979                 if (dma_pte_present(pte)) {
3980                         dmar_domain->pgd = (struct dma_pte *)
3981                                 phys_to_virt(dma_pte_addr(pte));
3982                         free_pgtable_page(pte);
3983                 }
3984                 dmar_domain->agaw--;
3985         }
3986
3987         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3988 }
3989
3990 static void intel_iommu_detach_device(struct iommu_domain *domain,
3991                                       struct device *dev)
3992 {
3993         struct dmar_domain *dmar_domain = domain->priv;
3994         struct pci_dev *pdev = to_pci_dev(dev);
3995
3996         domain_remove_one_dev_info(dmar_domain, pdev);
3997 }
3998
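/*
 * IOMMU-API map: translate IOMMU_READ/WRITE/CACHE into DMA PTE bits,
 * check the target address against the domain's gaw, round the size up
 * to whole VT-d pages and install the mapping with domain_pfn_mapping().
 */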
3999 static int intel_iommu_map(struct iommu_domain *domain,
4000                            unsigned long iova, phys_addr_t hpa,
4001                            int gfp_order, int iommu_prot)
4002 {
4003         struct dmar_domain *dmar_domain = domain->priv;
4004         u64 max_addr;
4005         int prot = 0;
4006         size_t size;
4007         int ret;
4008
4009         if (iommu_prot & IOMMU_READ)
4010                 prot |= DMA_PTE_READ;
4011         if (iommu_prot & IOMMU_WRITE)
4012                 prot |= DMA_PTE_WRITE;
4013         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4014                 prot |= DMA_PTE_SNP;
4015
4016         size     = PAGE_SIZE << gfp_order;
4017         max_addr = iova + size;
4018         if (dmar_domain->max_addr < max_addr) {
4019                 u64 end;
4020
4021                 /* check if minimum agaw is sufficient for mapped address */
4022                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4023                 if (end < max_addr) {
4024                         printk(KERN_ERR "%s: iommu width (%d) is not "
4025                                "sufficient for the mapped address (%llx)\n",
4026                                __func__, dmar_domain->gaw, max_addr);
4027                         return -EFAULT;
4028                 }
4029                 dmar_domain->max_addr = max_addr;
4030         }
4031         /* Round up size to next multiple of PAGE_SIZE, if it and
4032            the low bits of hpa would take us onto the next page */
4033         size = aligned_nrpages(hpa, size);
4034         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4035                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4036         return ret;
4037 }
4038
4039 static int intel_iommu_unmap(struct iommu_domain *domain,
4040                              unsigned long iova, int gfp_order)
4041 {
4042         struct dmar_domain *dmar_domain = domain->priv;
4043         size_t size = PAGE_SIZE << gfp_order;
4044         int order;
4045
4046         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4047                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4048
4049         if (dmar_domain->max_addr == iova + size)
4050                 dmar_domain->max_addr = iova;
4051
4052         return order;
4053 }
4054
4055 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4056                                             unsigned long iova)
4057 {
4058         struct dmar_domain *dmar_domain = domain->priv;
4059         struct dma_pte *pte;
4060         u64 phys = 0;
4061
4062         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4063         if (pte)
4064                 phys = dma_pte_addr(pte);
4065
4066         return phys;
4067 }
4068
4069 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4070                                       unsigned long cap)
4071 {
4072         struct dmar_domain *dmar_domain = domain->priv;
4073
4074         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4075                 return dmar_domain->iommu_snooping;
4076         if (cap == IOMMU_CAP_INTR_REMAP)
4077                 return intr_remapping_enabled;
4078
4079         return 0;
4080 }
4081
4082 static struct iommu_ops intel_iommu_ops = {
4083         .domain_init    = intel_iommu_domain_init,
4084         .domain_destroy = intel_iommu_domain_destroy,
4085         .attach_dev     = intel_iommu_attach_device,
4086         .detach_dev     = intel_iommu_detach_device,
4087         .map            = intel_iommu_map,
4088         .unmap          = intel_iommu_unmap,
4089         .iova_to_phys   = intel_iommu_iova_to_phys,
4090         .domain_has_cap = intel_iommu_domain_has_cap,
4091 };
4092
4093 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4094 {
4095         /*
4096          * Mobile 4 Series Chipset neglects to set RWBF capability,
4097          * but needs it:
4098          */
4099         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4100         rwbf_quirk = 1;
4101
4102         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4103         if (dev->revision == 0x07) {
4104                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4105                 dmar_map_gfx = 0;
4106         }
4107 }
4108
4109 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4110
4111 #define GGC 0x52
4112 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4113 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4114 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4115 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4116 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4117 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4118 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4119 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4120
4121 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4122 {
4123         unsigned short ggc;
4124
4125         if (pci_read_config_word(dev, GGC, &ggc))
4126                 return;
4127
4128         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4129                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4130                 dmar_map_gfx = 0;
4131         } else if (dmar_map_gfx) {
4132                 /* we have to ensure the gfx device is idle before we flush */
4133                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4134                 intel_iommu_strict = 1;
4135         }
4136 }
4137 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4138 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4139 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4141
4142 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4143    ISOCH DMAR unit for the Azalia sound device, but not give it any
4144    TLB entries, which causes it to deadlock. Check for that.  We do
4145    this in a function called from init_dmars(), instead of in a PCI
4146    quirk, because we don't want to print the obnoxious "BIOS broken"
4147    message if VT-d is actually disabled.
4148 */
4149 static void __init check_tylersburg_isoch(void)
4150 {
4151         struct pci_dev *pdev;
4152         uint32_t vtisochctrl;
4153
4154         /* If there's no Azalia in the system anyway, forget it. */
4155         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4156         if (!pdev)
4157                 return;
4158         pci_dev_put(pdev);
4159
4160         /* System Management Registers. Might be hidden, in which case
4161            we can't do the sanity check. But that's OK, because the
4162            known-broken BIOSes _don't_ actually hide it, so far. */
4163         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4164         if (!pdev)
4165                 return;
4166
4167         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4168                 pci_dev_put(pdev);
4169                 return;
4170         }
4171
4172         pci_dev_put(pdev);
4173
4174         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4175         if (vtisochctrl & 1)
4176                 return;
4177
4178         /* Drop all bits other than the number of TLB entries */
4179         vtisochctrl &= 0x1c;
4180
4181         /* If we have the recommended number of TLB entries (16), fine. */
4182         if (vtisochctrl == 0x10)
4183                 return;
4184
4185         /* Zero TLB entries? You get to ride the short bus to school. */
4186         if (!vtisochctrl) {
4187                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4188                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4189                      dmi_get_system_info(DMI_BIOS_VENDOR),
4190                      dmi_get_system_info(DMI_BIOS_VERSION),
4191                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4192                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4193                 return;
4194         }
4195
4196         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4197                vtisochctrl);
4198 }