[karo-tx-linux.git] / drivers / iommu / intel-iommu.c
(blob at commit "iommu/vt-d: keep shared resources when failed to initialize iommu devices")
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #include "irq_remapping.h"
50 #include "pci.h"
51
52 #define ROOT_SIZE               VTD_PAGE_SIZE
53 #define CONTEXT_SIZE            VTD_PAGE_SIZE
54
55 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58
59 #define IOAPIC_RANGE_START      (0xfee00000)
60 #define IOAPIC_RANGE_END        (0xfeefffff)
61 #define IOVA_START_ADDR         (0x1000)
62
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
64
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
67
68 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
70
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
74                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
76
77 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
78 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
79 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
80
81 /* page table handling */
82 #define LEVEL_STRIDE            (9)
83 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
84
85 /*
86  * This bitmap is used to advertise the page sizes our hardware supports
87  * to the IOMMU core, which will then use this information to split
88  * physically contiguous memory regions it is mapping into page sizes
89  * that we support.
90  *
91  * Traditionally the IOMMU core just handed us the mappings directly,
92  * after making sure the size is an order of a 4KiB page and that the
93  * mapping has natural alignment.
94  *
95  * To retain this behavior, we currently advertise that we support
96  * all page sizes that are an order of 4KiB.
97  *
98  * If at some point we'd like to utilize the IOMMU core's new behavior,
99  * we could change this to advertise the real page sizes we support.
100  */
101 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
102
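/*
 * AGAW arithmetic, for reference: a 2-level table covers 30 bits of
 * address and each additional level adds LEVEL_STRIDE (9) bits, so
 * agaw 1 means 3 levels / 39-bit width and agaw 2 means 4 levels /
 * 48-bit width (DEFAULT_DOMAIN_ADDRESS_WIDTH), capped at MAX_AGAW_WIDTH.
 */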
103 static inline int agaw_to_level(int agaw)
104 {
105         return agaw + 2;
106 }
107
108 static inline int agaw_to_width(int agaw)
109 {
110         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
111 }
112
113 static inline int width_to_agaw(int width)
114 {
115         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
116 }
117
118 static inline unsigned int level_to_offset_bits(int level)
119 {
120         return (level - 1) * LEVEL_STRIDE;
121 }
122
123 static inline int pfn_level_offset(unsigned long pfn, int level)
124 {
125         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
126 }
127
128 static inline unsigned long level_mask(int level)
129 {
130         return -1UL << level_to_offset_bits(level);
131 }
132
133 static inline unsigned long level_size(int level)
134 {
135         return 1UL << level_to_offset_bits(level);
136 }
137
138 static inline unsigned long align_to_level(unsigned long pfn, int level)
139 {
140         return (pfn + level_size(level) - 1) & level_mask(level);
141 }
142
143 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
144 {
145         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
146 }
147
148 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
149    are never going to work. */
150 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
151 {
152         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
153 }
154
155 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
156 {
157         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 }
159 static inline unsigned long page_to_dma_pfn(struct page *pg)
160 {
161         return mm_to_dma_pfn(page_to_pfn(pg));
162 }
163 static inline unsigned long virt_to_dma_pfn(void *p)
164 {
165         return page_to_dma_pfn(virt_to_page(p));
166 }
167
168 /* global iommu list, set NULL for ignored DMAR units */
169 static struct intel_iommu **g_iommus;
170
171 static void __init check_tylersburg_isoch(void);
172 static int rwbf_quirk;
173
174 /*
175  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
176  * (used when the kernel is launched with TXT).
177  */
178 static int force_on = 0;
179
180 /*
181  * 0: Present
182  * 1-11: Reserved
183  * 12-63: Context Ptr (12 - (haw-1))
184  * 64-127: Reserved
185  */
186 struct root_entry {
187         u64     val;
188         u64     rsvd1;
189 };
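/*
 * A 4 KiB root table of 16-byte entries gives 256 root entries, one per
 * PCI bus number (the root table is indexed by bus in
 * device_to_context_entry() below).
 */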
190 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
191 static inline bool root_present(struct root_entry *root)
192 {
193         return (root->val & 1);
194 }
195 static inline void set_root_present(struct root_entry *root)
196 {
197         root->val |= 1;
198 }
199 static inline void set_root_value(struct root_entry *root, unsigned long value)
200 {
201         root->val |= value & VTD_PAGE_MASK;
202 }
203
204 static inline struct context_entry *
205 get_context_addr_from_root(struct root_entry *root)
206 {
207         return (struct context_entry *)
208                 (root_present(root)?phys_to_virt(
209                 root->val & VTD_PAGE_MASK) :
210                 NULL);
211 }
212
213 /*
214  * low 64 bits:
215  * 0: present
216  * 1: fault processing disable
217  * 2-3: translation type
218  * 12-63: address space root
219  * high 64 bits:
220  * 0-2: address width
221  * 3-6: aval
222  * 8-23: domain id
223  */
224 struct context_entry {
225         u64 lo;
226         u64 hi;
227 };
228
229 static inline bool context_present(struct context_entry *context)
230 {
231         return (context->lo & 1);
232 }
233 static inline void context_set_present(struct context_entry *context)
234 {
235         context->lo |= 1;
236 }
237
238 static inline void context_set_fault_enable(struct context_entry *context)
239 {
240         context->lo &= (((u64)-1) << 2) | 1;
241 }
242
243 static inline void context_set_translation_type(struct context_entry *context,
244                                                 unsigned long value)
245 {
246         context->lo &= (((u64)-1) << 4) | 3;
247         context->lo |= (value & 3) << 2;
248 }
249
250 static inline void context_set_address_root(struct context_entry *context,
251                                             unsigned long value)
252 {
253         context->lo |= value & VTD_PAGE_MASK;
254 }
255
256 static inline void context_set_address_width(struct context_entry *context,
257                                              unsigned long value)
258 {
259         context->hi |= value & 7;
260 }
261
262 static inline void context_set_domain_id(struct context_entry *context,
263                                          unsigned long value)
264 {
265         context->hi |= (value & ((1 << 16) - 1)) << 8;
266 }
267
268 static inline void context_clear_entry(struct context_entry *context)
269 {
270         context->lo = 0;
271         context->hi = 0;
272 }
273
274 /*
275  * 0: readable
276  * 1: writable
277  * 2-6: reserved
278  * 7: super page
279  * 8-10: available
280  * 11: snoop behavior
281  * 12-63: Host physical address
282  */
283 struct dma_pte {
284         u64 val;
285 };
286
287 static inline void dma_clear_pte(struct dma_pte *pte)
288 {
289         pte->val = 0;
290 }
291
292 static inline u64 dma_pte_addr(struct dma_pte *pte)
293 {
294 #ifdef CONFIG_64BIT
295         return pte->val & VTD_PAGE_MASK;
296 #else
297         /* Must have a full atomic 64-bit read */
298         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
299 #endif
300 }
301
302 static inline bool dma_pte_present(struct dma_pte *pte)
303 {
304         return (pte->val & 3) != 0;
305 }
306
307 static inline bool dma_pte_superpage(struct dma_pte *pte)
308 {
309         return (pte->val & (1 << 7));
310 }
311
312 static inline int first_pte_in_page(struct dma_pte *pte)
313 {
314         return !((unsigned long)pte & ~VTD_PAGE_MASK);
315 }
316
317 /*
318  * This domain is a statically identity mapping domain.
319  *      1. This domain creates a static 1:1 mapping of all usable memory.
320  *      2. It maps to each iommu if successful.
321  *      3. Each iommu maps to this domain if successful.
322  */
323 static struct dmar_domain *si_domain;
324 static int hw_pass_through = 1;
325
326 /* devices under the same p2p bridge are owned in one domain */
327 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
328
329 /* domain represents a virtual machine; more than one device
330  * across iommus may be owned by one domain, e.g. a kvm guest.
331  */
332 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
333
334 /* si_domain contains multiple devices */
335 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
336
337 /* define the limit of IOMMUs supported in each domain */
338 #ifdef  CONFIG_X86
339 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
340 #else
341 # define        IOMMU_UNITS_SUPPORTED   64
342 #endif
343
344 struct dmar_domain {
345         int     id;                     /* domain id */
346         int     nid;                    /* node id */
347         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
348                                         /* bitmap of iommus this domain uses*/
349
350         struct list_head devices;       /* all devices' list */
351         struct iova_domain iovad;       /* iova's that belong to this domain */
352
353         struct dma_pte  *pgd;           /* virtual address */
354         int             gaw;            /* max guest address width */
355
356         /* adjusted guest address width, 0 is level 2 30-bit */
357         int             agaw;
358
359         int             flags;          /* flags to find out type of domain */
360
361         int             iommu_coherency;/* indicate coherency of iommu access */
362         int             iommu_snooping; /* indicate snooping control feature*/
363         int             iommu_count;    /* reference count of iommu */
364         int             iommu_superpage;/* Level of superpages supported:
365                                            0 == 4KiB (no superpages), 1 == 2MiB,
366                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
367         spinlock_t      iommu_lock;     /* protect iommu set in domain */
368         u64             max_addr;       /* maximum mapped address */
369 };
370
371 /* PCI domain-device relationship */
372 struct device_domain_info {
373         struct list_head link;  /* link to domain siblings */
374         struct list_head global; /* link to global list */
375         int segment;            /* PCI domain */
376         u8 bus;                 /* PCI bus number */
377         u8 devfn;               /* PCI devfn number */
378         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
379         struct intel_iommu *iommu; /* IOMMU used by this device */
380         struct dmar_domain *domain; /* pointer to domain */
381 };
382
383 static void flush_unmaps_timeout(unsigned long data);
384
385 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
386
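/*
 * Deferred ("lazy") IOTLB flushing: unless intel_iommu_strict is set,
 * freed IOVAs are queued in deferred_flush[] and released from
 * flush_unmaps_timeout(), so a single IOTLB flush can cover many
 * unmaps.  HIGH_WATER_MARK bounds how many entries may be queued.
 */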
387 #define HIGH_WATER_MARK 250
388 struct deferred_flush_tables {
389         int next;
390         struct iova *iova[HIGH_WATER_MARK];
391         struct dmar_domain *domain[HIGH_WATER_MARK];
392 };
393
394 static struct deferred_flush_tables *deferred_flush;
395
396 /* number of IOMMUs in the system; also sizes the g_iommus array */
397 static int g_num_of_iommus;
398
399 static DEFINE_SPINLOCK(async_umap_flush_lock);
400 static LIST_HEAD(unmaps_to_do);
401
402 static int timer_on;
403 static long list_size;
404
405 static void domain_remove_dev_info(struct dmar_domain *domain);
406
407 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
408 int dmar_disabled = 0;
409 #else
410 int dmar_disabled = 1;
411 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
412
413 int intel_iommu_enabled = 0;
414 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
415
416 static int dmar_map_gfx = 1;
417 static int dmar_forcedac;
418 static int intel_iommu_strict;
419 static int intel_iommu_superpage = 1;
420
421 int intel_iommu_gfx_mapped;
422 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
423
424 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
425 static DEFINE_SPINLOCK(device_domain_lock);
426 static LIST_HEAD(device_domain_list);
427
428 static struct iommu_ops intel_iommu_ops;
429
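/*
 * Parse the "intel_iommu=" kernel command line parameter.  Options are
 * comma separated, e.g. "intel_iommu=on,strict,sp_off": on/off enables
 * or disables DMAR, igfx_off disables GFX device mapping, forcedac
 * forces DAC for PCI devices, strict disables the batched IOTLB flush,
 * and sp_off disables super page support.
 */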
430 static int __init intel_iommu_setup(char *str)
431 {
432         if (!str)
433                 return -EINVAL;
434         while (*str) {
435                 if (!strncmp(str, "on", 2)) {
436                         dmar_disabled = 0;
437                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
438                 } else if (!strncmp(str, "off", 3)) {
439                         dmar_disabled = 1;
440                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
441                 } else if (!strncmp(str, "igfx_off", 8)) {
442                         dmar_map_gfx = 0;
443                         printk(KERN_INFO
444                                 "Intel-IOMMU: disable GFX device mapping\n");
445                 } else if (!strncmp(str, "forcedac", 8)) {
446                         printk(KERN_INFO
447                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
448                         dmar_forcedac = 1;
449                 } else if (!strncmp(str, "strict", 6)) {
450                         printk(KERN_INFO
451                                 "Intel-IOMMU: disable batched IOTLB flush\n");
452                         intel_iommu_strict = 1;
453                 } else if (!strncmp(str, "sp_off", 6)) {
454                         printk(KERN_INFO
455                                 "Intel-IOMMU: disable supported super page\n");
456                         intel_iommu_superpage = 0;
457                 }
458
459                 str += strcspn(str, ",");
460                 while (*str == ',')
461                         str++;
462         }
463         return 0;
464 }
465 __setup("intel_iommu=", intel_iommu_setup);
466
467 static struct kmem_cache *iommu_domain_cache;
468 static struct kmem_cache *iommu_devinfo_cache;
469 static struct kmem_cache *iommu_iova_cache;
470
471 static inline void *alloc_pgtable_page(int node)
472 {
473         struct page *page;
474         void *vaddr = NULL;
475
476         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
477         if (page)
478                 vaddr = page_address(page);
479         return vaddr;
480 }
481
482 static inline void free_pgtable_page(void *vaddr)
483 {
484         free_page((unsigned long)vaddr);
485 }
486
487 static inline void *alloc_domain_mem(void)
488 {
489         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
490 }
491
492 static void free_domain_mem(void *vaddr)
493 {
494         kmem_cache_free(iommu_domain_cache, vaddr);
495 }
496
497 static inline void * alloc_devinfo_mem(void)
498 {
499         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
500 }
501
502 static inline void free_devinfo_mem(void *vaddr)
503 {
504         kmem_cache_free(iommu_devinfo_cache, vaddr);
505 }
506
507 struct iova *alloc_iova_mem(void)
508 {
509         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
510 }
511
512 void free_iova_mem(struct iova *iova)
513 {
514         kmem_cache_free(iommu_iova_cache, iova);
515 }
516
517
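/*
 * Return the largest agaw not exceeding width_to_agaw(max_gaw) whose bit
 * is set in this iommu's SAGAW capability field (bit n set means an
 * (n + 2)-level page table is supported), or -1 if none matches.
 */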
518 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
519 {
520         unsigned long sagaw;
521         int agaw = -1;
522
523         sagaw = cap_sagaw(iommu->cap);
524         for (agaw = width_to_agaw(max_gaw);
525              agaw >= 0; agaw--) {
526                 if (test_bit(agaw, &sagaw))
527                         break;
528         }
529
530         return agaw;
531 }
532
533 /*
534  * Calculate max SAGAW for each iommu.
535  */
536 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
537 {
538         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
539 }
540
541 /*
542  * calculate agaw for each iommu.
543  * "SAGAW" may be different across iommus, use a default agaw, and
544  * get a smaller supported agaw for iommus that don't support the default agaw.
545  */
546 int iommu_calculate_agaw(struct intel_iommu *iommu)
547 {
548         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
549 }
550
551 /* This function only returns a single iommu in a domain */
552 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
553 {
554         int iommu_id;
555
556         /* si_domain and vm domain should not get here. */
557         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
558         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
559
560         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
561         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
562                 return NULL;
563
564         return g_iommus[iommu_id];
565 }
566
567 static void domain_update_iommu_coherency(struct dmar_domain *domain)
568 {
569         int i;
570
571         i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
572
573         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
574
575         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
576                 if (!ecap_coherent(g_iommus[i]->ecap)) {
577                         domain->iommu_coherency = 0;
578                         break;
579                 }
580         }
581 }
582
583 static void domain_update_iommu_snooping(struct dmar_domain *domain)
584 {
585         int i;
586
587         domain->iommu_snooping = 1;
588
589         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
590                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
591                         domain->iommu_snooping = 0;
592                         break;
593                 }
594         }
595 }
596
597 static void domain_update_iommu_superpage(struct dmar_domain *domain)
598 {
599         struct dmar_drhd_unit *drhd;
600         struct intel_iommu *iommu = NULL;
601         int mask = 0xf;
602
603         if (!intel_iommu_superpage) {
604                 domain->iommu_superpage = 0;
605                 return;
606         }
607
608         /* set iommu_superpage to the smallest common denominator */
609         for_each_active_iommu(iommu, drhd) {
610                 mask &= cap_super_page_val(iommu->cap);
611                 if (!mask) {
612                         break;
613                 }
614         }
615         domain->iommu_superpage = fls(mask);
616 }
617
618 /* Some capabilities may be different across iommus */
619 static void domain_update_iommu_cap(struct dmar_domain *domain)
620 {
621         domain_update_iommu_coherency(domain);
622         domain_update_iommu_snooping(domain);
623         domain_update_iommu_superpage(domain);
624 }
625
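/*
 * Find the DMAR unit that covers (@segment, @bus, @devfn): either a unit
 * whose device scope lists the device (or a bridge whose subordinate bus
 * range contains it), or the segment's catch-all (include_all) unit.
 * Returns NULL if no unit matches.
 */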
626 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
627 {
628         struct dmar_drhd_unit *drhd = NULL;
629         int i;
630
631         for_each_active_drhd_unit(drhd) {
632                 if (segment != drhd->segment)
633                         continue;
634
635                 for (i = 0; i < drhd->devices_cnt; i++) {
636                         if (drhd->devices[i] &&
637                             drhd->devices[i]->bus->number == bus &&
638                             drhd->devices[i]->devfn == devfn)
639                                 return drhd->iommu;
640                         if (drhd->devices[i] &&
641                             drhd->devices[i]->subordinate &&
642                             drhd->devices[i]->subordinate->number <= bus &&
643                             drhd->devices[i]->subordinate->busn_res.end >= bus)
644                                 return drhd->iommu;
645                 }
646
647                 if (drhd->include_all)
648                         return drhd->iommu;
649         }
650
651         return NULL;
652 }
653
654 static void domain_flush_cache(struct dmar_domain *domain,
655                                void *addr, int size)
656 {
657         if (!domain->iommu_coherency)
658                 clflush_cache_range(addr, size);
659 }
660
661 /* Gets context entry for a given bus and devfn */
662 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
663                 u8 bus, u8 devfn)
664 {
665         struct root_entry *root;
666         struct context_entry *context;
667         unsigned long phy_addr;
668         unsigned long flags;
669
670         spin_lock_irqsave(&iommu->lock, flags);
671         root = &iommu->root_entry[bus];
672         context = get_context_addr_from_root(root);
673         if (!context) {
674                 context = (struct context_entry *)
675                                 alloc_pgtable_page(iommu->node);
676                 if (!context) {
677                         spin_unlock_irqrestore(&iommu->lock, flags);
678                         return NULL;
679                 }
680                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
681                 phy_addr = virt_to_phys((void *)context);
682                 set_root_value(root, phy_addr);
683                 set_root_present(root);
684                 __iommu_flush_cache(iommu, root, sizeof(*root));
685         }
686         spin_unlock_irqrestore(&iommu->lock, flags);
687         return &context[devfn];
688 }
689
690 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
691 {
692         struct root_entry *root;
693         struct context_entry *context;
694         int ret;
695         unsigned long flags;
696
697         spin_lock_irqsave(&iommu->lock, flags);
698         root = &iommu->root_entry[bus];
699         context = get_context_addr_from_root(root);
700         if (!context) {
701                 ret = 0;
702                 goto out;
703         }
704         ret = context_present(&context[devfn]);
705 out:
706         spin_unlock_irqrestore(&iommu->lock, flags);
707         return ret;
708 }
709
710 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
711 {
712         struct root_entry *root;
713         struct context_entry *context;
714         unsigned long flags;
715
716         spin_lock_irqsave(&iommu->lock, flags);
717         root = &iommu->root_entry[bus];
718         context = get_context_addr_from_root(root);
719         if (context) {
720                 context_clear_entry(&context[devfn]);
721                 __iommu_flush_cache(iommu, &context[devfn], \
722                         sizeof(*context));
723         }
724         spin_unlock_irqrestore(&iommu->lock, flags);
725 }
726
727 static void free_context_table(struct intel_iommu *iommu)
728 {
729         struct root_entry *root;
730         int i;
731         unsigned long flags;
732         struct context_entry *context;
733
734         spin_lock_irqsave(&iommu->lock, flags);
735         if (!iommu->root_entry) {
736                 goto out;
737         }
738         for (i = 0; i < ROOT_ENTRY_NR; i++) {
739                 root = &iommu->root_entry[i];
740                 context = get_context_addr_from_root(root);
741                 if (context)
742                         free_pgtable_page(context);
743         }
744         free_pgtable_page(iommu->root_entry);
745         iommu->root_entry = NULL;
746 out:
747         spin_unlock_irqrestore(&iommu->lock, flags);
748 }
749
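/*
 * Walk (and, if necessary, build) the page table down to target_level
 * and return the pte that maps @pfn there.  Intermediate tables are
 * allocated lazily; cmpxchg64() makes the walk safe against a concurrent
 * walker installing the same intermediate table.  With target_level == 0
 * the walk stops at the first superpage or non-present pte it meets.
 */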
750 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
751                                       unsigned long pfn, int target_level)
752 {
753         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
754         struct dma_pte *parent, *pte = NULL;
755         int level = agaw_to_level(domain->agaw);
756         int offset;
757
758         BUG_ON(!domain->pgd);
759
760         if (addr_width < BITS_PER_LONG && pfn >> addr_width)
761                 /* Address beyond IOMMU's addressing capabilities. */
762                 return NULL;
763
764         parent = domain->pgd;
765
766         while (level > 0) {
767                 void *tmp_page;
768
769                 offset = pfn_level_offset(pfn, level);
770                 pte = &parent[offset];
771                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
772                         break;
773                 if (level == target_level)
774                         break;
775
776                 if (!dma_pte_present(pte)) {
777                         uint64_t pteval;
778
779                         tmp_page = alloc_pgtable_page(domain->nid);
780
781                         if (!tmp_page)
782                                 return NULL;
783
784                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
785                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
786                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
787                                 /* Someone else set it while we were thinking; use theirs. */
788                                 free_pgtable_page(tmp_page);
789                         } else {
790                                 dma_pte_addr(pte);
791                                 domain_flush_cache(domain, pte, sizeof(*pte));
792                         }
793                 }
794                 parent = phys_to_virt(dma_pte_addr(pte));
795                 level--;
796         }
797
798         return pte;
799 }
800
801
802 /* return address's pte at specific level */
803 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
804                                          unsigned long pfn,
805                                          int level, int *large_page)
806 {
807         struct dma_pte *parent, *pte = NULL;
808         int total = agaw_to_level(domain->agaw);
809         int offset;
810
811         parent = domain->pgd;
812         while (level <= total) {
813                 offset = pfn_level_offset(pfn, total);
814                 pte = &parent[offset];
815                 if (level == total)
816                         return pte;
817
818                 if (!dma_pte_present(pte)) {
819                         *large_page = total;
820                         break;
821                 }
822
823                 if (pte->val & DMA_PTE_LARGE_PAGE) {
824                         *large_page = total;
825                         return pte;
826                 }
827
828                 parent = phys_to_virt(dma_pte_addr(pte));
829                 total--;
830         }
831         return NULL;
832 }
833
834 /* clear last level ptes; a tlb flush should follow */
835 static int dma_pte_clear_range(struct dmar_domain *domain,
836                                 unsigned long start_pfn,
837                                 unsigned long last_pfn)
838 {
839         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
840         unsigned int large_page = 1;
841         struct dma_pte *first_pte, *pte;
842
843         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
844         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
845         BUG_ON(start_pfn > last_pfn);
846
847         /* we don't need lock here; nobody else touches the iova range */
848         do {
849                 large_page = 1;
850                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
851                 if (!pte) {
852                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
853                         continue;
854                 }
855                 do {
856                         dma_clear_pte(pte);
857                         start_pfn += lvl_to_nr_pages(large_page);
858                         pte++;
859                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
860
861                 domain_flush_cache(domain, first_pte,
862                                    (void *)pte - (void *)first_pte);
863
864         } while (start_pfn && start_pfn <= last_pfn);
865
866         return min_t(int, (large_page - 1) * 9, MAX_AGAW_PFN_WIDTH);
867 }
868
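/*
 * Recursively free page-table pages backing [start_pfn, last_pfn].
 * A table page is freed (and its parent pte cleared) only when the
 * range covers everything the page maps; partially covered tables are
 * left in place.
 */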
869 static void dma_pte_free_level(struct dmar_domain *domain, int level,
870                                struct dma_pte *pte, unsigned long pfn,
871                                unsigned long start_pfn, unsigned long last_pfn)
872 {
873         pfn = max(start_pfn, pfn);
874         pte = &pte[pfn_level_offset(pfn, level)];
875
876         do {
877                 unsigned long level_pfn;
878                 struct dma_pte *level_pte;
879
880                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
881                         goto next;
882
883                 level_pfn = pfn & level_mask(level - 1);
884                 level_pte = phys_to_virt(dma_pte_addr(pte));
885
886                 if (level > 2)
887                         dma_pte_free_level(domain, level - 1, level_pte,
888                                            level_pfn, start_pfn, last_pfn);
889
890                 /* If range covers entire pagetable, free it */
891                 if (!(start_pfn > level_pfn ||
892                       last_pfn < level_pfn + level_size(level))) {
893                         dma_clear_pte(pte);
894                         domain_flush_cache(domain, pte, sizeof(*pte));
895                         free_pgtable_page(level_pte);
896                 }
897 next:
898                 pfn += level_size(level);
899         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
900 }
901
902 /* free page table pages. last level pte should already be cleared */
903 static void dma_pte_free_pagetable(struct dmar_domain *domain,
904                                    unsigned long start_pfn,
905                                    unsigned long last_pfn)
906 {
907         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
908
909         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
910         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
911         BUG_ON(start_pfn > last_pfn);
912
913         /* We don't need lock here; nobody else touches the iova range */
914         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
915                            domain->pgd, 0, start_pfn, last_pfn);
916
917         /* free pgd */
918         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
919                 free_pgtable_page(domain->pgd);
920                 domain->pgd = NULL;
921         }
922 }
923
924 /* iommu handling */
925 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
926 {
927         struct root_entry *root;
928         unsigned long flags;
929
930         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
931         if (!root)
932                 return -ENOMEM;
933
934         __iommu_flush_cache(iommu, root, ROOT_SIZE);
935
936         spin_lock_irqsave(&iommu->lock, flags);
937         iommu->root_entry = root;
938         spin_unlock_irqrestore(&iommu->lock, flags);
939
940         return 0;
941 }
942
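/*
 * Program the root table address into DMAR_RTADDR_REG and issue the
 * "set root table pointer" command, polling the global status register
 * until the hardware acknowledges it.
 */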
943 static void iommu_set_root_entry(struct intel_iommu *iommu)
944 {
945         void *addr;
946         u32 sts;
947         unsigned long flag;
948
949         addr = iommu->root_entry;
950
951         raw_spin_lock_irqsave(&iommu->register_lock, flag);
952         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
953
954         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
955
956                 /* Make sure the hardware completes it */
957         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
958                       readl, (sts & DMA_GSTS_RTPS), sts);
959
960         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
961 }
962
963 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
964 {
965         u32 val;
966         unsigned long flag;
967
968         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
969                 return;
970
971         raw_spin_lock_irqsave(&iommu->register_lock, flag);
972         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
973
974                 /* Make sure the hardware completes it */
975         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
976                       readl, (!(val & DMA_GSTS_WBFS)), val);
977
978         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
979 }
980
981 /* Invalidate context-cache entries of the requested granularity */
982 static void __iommu_flush_context(struct intel_iommu *iommu,
983                                   u16 did, u16 source_id, u8 function_mask,
984                                   u64 type)
985 {
986         u64 val = 0;
987         unsigned long flag;
988
989         switch (type) {
990         case DMA_CCMD_GLOBAL_INVL:
991                 val = DMA_CCMD_GLOBAL_INVL;
992                 break;
993         case DMA_CCMD_DOMAIN_INVL:
994                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
995                 break;
996         case DMA_CCMD_DEVICE_INVL:
997                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
998                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
999                 break;
1000         default:
1001                 BUG();
1002         }
1003         val |= DMA_CCMD_ICC;
1004
1005         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1006         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1007
1008         /* Make sure the hardware completes it */
1009         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1010                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1011
1012         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1013 }
1014
1015 /* Invalidate IOTLB entries of the requested granularity */
1016 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1017                                 u64 addr, unsigned int size_order, u64 type)
1018 {
1019         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1020         u64 val = 0, val_iva = 0;
1021         unsigned long flag;
1022
1023         switch (type) {
1024         case DMA_TLB_GLOBAL_FLUSH:
1025                 /* global flush doesn't need to set IVA_REG */
1026                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1027                 break;
1028         case DMA_TLB_DSI_FLUSH:
1029                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1030                 break;
1031         case DMA_TLB_PSI_FLUSH:
1032                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1033                 /* Note: always flush non-leaf currently */
1034                 val_iva = size_order | addr;
1035                 break;
1036         default:
1037                 BUG();
1038         }
1039         /* Note: set drain read/write */
1040 #if 0
1041         /*
1042          * This is probably only there to be extra safe; it looks like we
1043          * can ignore it without any impact.
1044          */
1045         if (cap_read_drain(iommu->cap))
1046                 val |= DMA_TLB_READ_DRAIN;
1047 #endif
1048         if (cap_write_drain(iommu->cap))
1049                 val |= DMA_TLB_WRITE_DRAIN;
1050
1051         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1052         /* Note: Only uses first TLB reg currently */
1053         if (val_iva)
1054                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1055         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1056
1057         /* Make sure the hardware completes it */
1058         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1059                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1060
1061         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1062
1063         /* check IOTLB invalidation granularity */
1064         if (DMA_TLB_IAIG(val) == 0)
1065                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1066         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1067                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1068                         (unsigned long long)DMA_TLB_IIRG(type),
1069                         (unsigned long long)DMA_TLB_IAIG(val));
1070 }
1071
1072 static struct device_domain_info *iommu_support_dev_iotlb(
1073         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1074 {
1075         int found = 0;
1076         unsigned long flags;
1077         struct device_domain_info *info;
1078         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1079
1080         if (!ecap_dev_iotlb_support(iommu->ecap))
1081                 return NULL;
1082
1083         if (!iommu->qi)
1084                 return NULL;
1085
1086         spin_lock_irqsave(&device_domain_lock, flags);
1087         list_for_each_entry(info, &domain->devices, link)
1088                 if (info->bus == bus && info->devfn == devfn) {
1089                         found = 1;
1090                         break;
1091                 }
1092         spin_unlock_irqrestore(&device_domain_lock, flags);
1093
1094         if (!found || !info->dev)
1095                 return NULL;
1096
1097         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1098                 return NULL;
1099
1100         if (!dmar_find_matched_atsr_unit(info->dev))
1101                 return NULL;
1102
1103         info->iommu = iommu;
1104
1105         return info;
1106 }
1107
1108 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1109 {
1110         if (!info)
1111                 return;
1112
1113         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1114 }
1115
1116 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1117 {
1118         if (!info->dev || !pci_ats_enabled(info->dev))
1119                 return;
1120
1121         pci_disable_ats(info->dev);
1122 }
1123
1124 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1125                                   u64 addr, unsigned mask)
1126 {
1127         u16 sid, qdep;
1128         unsigned long flags;
1129         struct device_domain_info *info;
1130
1131         spin_lock_irqsave(&device_domain_lock, flags);
1132         list_for_each_entry(info, &domain->devices, link) {
1133                 if (!info->dev || !pci_ats_enabled(info->dev))
1134                         continue;
1135
1136                 sid = info->bus << 8 | info->devfn;
1137                 qdep = pci_ats_queue_depth(info->dev);
1138                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1139         }
1140         spin_unlock_irqrestore(&device_domain_lock, flags);
1141 }
1142
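/*
 * Page-selective IOTLB flush of @pages pages starting at @pfn.  The
 * request is rounded up to a power of two, e.g. pages == 3 yields
 * mask == 2, i.e. an aligned 4-page region is invalidated.
 */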
1143 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1144                                   unsigned long pfn, unsigned int pages, int map)
1145 {
1146         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1147         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1148
1149         BUG_ON(pages == 0);
1150
1151         /*
1152          * Fall back to a domain-selective flush if there is no PSI support or
1153          * the size is too big.
1154          * PSI requires the page size to be 2 ^ x, and the base address to be
1155          * naturally aligned to the size.
1156          */
1157         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1158                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1159                                                 DMA_TLB_DSI_FLUSH);
1160         else
1161                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1162                                                 DMA_TLB_PSI_FLUSH);
1163
1164         /*
1165          * In caching mode, changes of pages from non-present to present require
1166          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1167          */
1168         if (!cap_caching_mode(iommu->cap) || !map)
1169                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1170 }
1171
1172 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1173 {
1174         u32 pmen;
1175         unsigned long flags;
1176
1177         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1178         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1179         pmen &= ~DMA_PMEN_EPM;
1180         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1181
1182         /* wait for the protected region status bit to clear */
1183         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1184                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1185
1186         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1187 }
1188
1189 static int iommu_enable_translation(struct intel_iommu *iommu)
1190 {
1191         u32 sts;
1192         unsigned long flags;
1193
1194         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1195         iommu->gcmd |= DMA_GCMD_TE;
1196         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1197
1198         /* Make sure the hardware completes it */
1199         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1200                       readl, (sts & DMA_GSTS_TES), sts);
1201
1202         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1203         return 0;
1204 }
1205
1206 static int iommu_disable_translation(struct intel_iommu *iommu)
1207 {
1208         u32 sts;
1209         unsigned long flag;
1210
1211         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1212         iommu->gcmd &= ~DMA_GCMD_TE;
1213         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1214
1215         /* Make sure the hardware completes it */
1216         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1217                       readl, (!(sts & DMA_GSTS_TES)), sts);
1218
1219         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220         return 0;
1221 }
1222
1223
1224 static int iommu_init_domains(struct intel_iommu *iommu)
1225 {
1226         unsigned long ndomains;
1227         unsigned long nlongs;
1228
1229         ndomains = cap_ndoms(iommu->cap);
1230         pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1231                  iommu->seq_id, ndomains);
1232         nlongs = BITS_TO_LONGS(ndomains);
1233
1234         spin_lock_init(&iommu->lock);
1235
1236         /* TBD: there might be 64K domains,
1237          * consider a different allocation scheme for future chips
1238          */
1239         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1240         if (!iommu->domain_ids) {
1241                 pr_err("IOMMU%d: allocating domain id array failed\n",
1242                        iommu->seq_id);
1243                 return -ENOMEM;
1244         }
1245         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1246                         GFP_KERNEL);
1247         if (!iommu->domains) {
1248                 pr_err("IOMMU%d: allocating domain array failed\n",
1249                        iommu->seq_id);
1250                 kfree(iommu->domain_ids);
1251                 iommu->domain_ids = NULL;
1252                 return -ENOMEM;
1253         }
1254
1255         /*
1256          * if Caching mode is set, then invalid translations are tagged
1257          * with domain id 0. Hence we need to pre-allocate it.
1258          */
1259         if (cap_caching_mode(iommu->cap))
1260                 set_bit(0, iommu->domain_ids);
1261         return 0;
1262 }
1263
1264
1265 static void domain_exit(struct dmar_domain *domain);
1266 static void vm_domain_exit(struct dmar_domain *domain);
1267
1268 static void free_dmar_iommu(struct intel_iommu *iommu)
1269 {
1270         struct dmar_domain *domain;
1271         int i;
1272         unsigned long flags;
1273
1274         if ((iommu->domains) && (iommu->domain_ids)) {
1275                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1276                         domain = iommu->domains[i];
1277                         clear_bit(i, iommu->domain_ids);
1278
1279                         spin_lock_irqsave(&domain->iommu_lock, flags);
1280                         if (--domain->iommu_count == 0) {
1281                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1282                                         vm_domain_exit(domain);
1283                                 else
1284                                         domain_exit(domain);
1285                         }
1286                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1287                 }
1288         }
1289
1290         if (iommu->gcmd & DMA_GCMD_TE)
1291                 iommu_disable_translation(iommu);
1292
1293         kfree(iommu->domains);
1294         kfree(iommu->domain_ids);
1295         iommu->domains = NULL;
1296         iommu->domain_ids = NULL;
1297
1298         g_iommus[iommu->seq_id] = NULL;
1299
1300         /* if all iommus are freed, free g_iommus */
1301         for (i = 0; i < g_num_of_iommus; i++) {
1302                 if (g_iommus[i])
1303                         break;
1304         }
1305
1306         if (i == g_num_of_iommus)
1307                 kfree(g_iommus);
1308
1309         /* free context mapping */
1310         free_context_table(iommu);
1311 }
1312
1313 static struct dmar_domain *alloc_domain(void)
1314 {
1315         struct dmar_domain *domain;
1316
1317         domain = alloc_domain_mem();
1318         if (!domain)
1319                 return NULL;
1320
1321         domain->nid = -1;
1322         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1323         domain->flags = 0;
1324
1325         return domain;
1326 }
1327
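/*
 * Bind @domain to @iommu: claim the first free domain id on that iommu,
 * record the domain in iommu->domains[] and mark the iommu in the
 * domain's iommu_bmp.
 */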
1328 static int iommu_attach_domain(struct dmar_domain *domain,
1329                                struct intel_iommu *iommu)
1330 {
1331         int num;
1332         unsigned long ndomains;
1333         unsigned long flags;
1334
1335         ndomains = cap_ndoms(iommu->cap);
1336
1337         spin_lock_irqsave(&iommu->lock, flags);
1338
1339         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1340         if (num >= ndomains) {
1341                 spin_unlock_irqrestore(&iommu->lock, flags);
1342                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1343                 return -ENOMEM;
1344         }
1345
1346         domain->id = num;
1347         set_bit(num, iommu->domain_ids);
1348         set_bit(iommu->seq_id, domain->iommu_bmp);
1349         iommu->domains[num] = domain;
1350         spin_unlock_irqrestore(&iommu->lock, flags);
1351
1352         return 0;
1353 }
1354
1355 static void iommu_detach_domain(struct dmar_domain *domain,
1356                                 struct intel_iommu *iommu)
1357 {
1358         unsigned long flags;
1359         int num, ndomains;
1360         int found = 0;
1361
1362         spin_lock_irqsave(&iommu->lock, flags);
1363         ndomains = cap_ndoms(iommu->cap);
1364         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1365                 if (iommu->domains[num] == domain) {
1366                         found = 1;
1367                         break;
1368                 }
1369         }
1370
1371         if (found) {
1372                 clear_bit(num, iommu->domain_ids);
1373                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1374                 iommu->domains[num] = NULL;
1375         }
1376         spin_unlock_irqrestore(&iommu->lock, flags);
1377 }
1378
1379 static struct iova_domain reserved_iova_list;
1380 static struct lock_class_key reserved_rbtree_key;
1381
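/*
 * Reserve IOVA ranges that must never be handed out for DMA: the IOAPIC
 * MMIO window and every PCI MMIO resource (to avoid peer-to-peer
 * accesses).  The list is later copied into each domain by
 * domain_reserve_special_ranges().
 */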
1382 static int dmar_init_reserved_ranges(void)
1383 {
1384         struct pci_dev *pdev = NULL;
1385         struct iova *iova;
1386         int i;
1387
1388         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1389
1390         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1391                 &reserved_rbtree_key);
1392
1393         /* IOAPIC ranges shouldn't be accessed by DMA */
1394         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1395                 IOVA_PFN(IOAPIC_RANGE_END));
1396         if (!iova) {
1397                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1398                 return -ENODEV;
1399         }
1400
1401         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1402         for_each_pci_dev(pdev) {
1403                 struct resource *r;
1404
1405                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1406                         r = &pdev->resource[i];
1407                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1408                                 continue;
1409                         iova = reserve_iova(&reserved_iova_list,
1410                                             IOVA_PFN(r->start),
1411                                             IOVA_PFN(r->end));
1412                         if (!iova) {
1413                                 printk(KERN_ERR "Reserve iova failed\n");
1414                                 return -ENODEV;
1415                         }
1416                 }
1417         }
1418         return 0;
1419 }
1420
1421 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1422 {
1423         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1424 }
1425
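/*
 * Round a guest address width up to the next width the page-table format
 * can represent (12 plus a multiple of 9 bits), e.g. 40 -> 48, clamped
 * to 64.
 */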
1426 static inline int guestwidth_to_adjustwidth(int gaw)
1427 {
1428         int agaw;
1429         int r = (gaw - 12) % 9;
1430
1431         if (r == 0)
1432                 agaw = gaw;
1433         else
1434                 agaw = gaw + 9 - r;
1435         if (agaw > 64)
1436                 agaw = 64;
1437         return agaw;
1438 }
1439
1440 static int domain_init(struct dmar_domain *domain, int guest_width)
1441 {
1442         struct intel_iommu *iommu;
1443         int adjust_width, agaw;
1444         unsigned long sagaw;
1445
1446         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1447         spin_lock_init(&domain->iommu_lock);
1448
1449         domain_reserve_special_ranges(domain);
1450
1451         /* calculate AGAW */
1452         iommu = domain_get_iommu(domain);
1453         if (guest_width > cap_mgaw(iommu->cap))
1454                 guest_width = cap_mgaw(iommu->cap);
1455         domain->gaw = guest_width;
1456         adjust_width = guestwidth_to_adjustwidth(guest_width);
1457         agaw = width_to_agaw(adjust_width);
1458         sagaw = cap_sagaw(iommu->cap);
1459         if (!test_bit(agaw, &sagaw)) {
1460                 /* hardware doesn't support it, choose a bigger one */
1461                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1462                 agaw = find_next_bit(&sagaw, 5, agaw);
1463                 if (agaw >= 5)
1464                         return -ENODEV;
1465         }
1466         domain->agaw = agaw;
1467         INIT_LIST_HEAD(&domain->devices);
1468
1469         if (ecap_coherent(iommu->ecap))
1470                 domain->iommu_coherency = 1;
1471         else
1472                 domain->iommu_coherency = 0;
1473
1474         if (ecap_sc_support(iommu->ecap))
1475                 domain->iommu_snooping = 1;
1476         else
1477                 domain->iommu_snooping = 0;
1478
1479         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1480         domain->iommu_count = 1;
1481         domain->nid = iommu->node;
1482
1483         /* always allocate the top pgd */
1484         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1485         if (!domain->pgd)
1486                 return -ENOMEM;
1487         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1488         return 0;
1489 }
1490
1491 static void domain_exit(struct dmar_domain *domain)
1492 {
1493         struct dmar_drhd_unit *drhd;
1494         struct intel_iommu *iommu;
1495
1496         /* Domain 0 is reserved, so don't process it */
1497         if (!domain)
1498                 return;
1499
1500         /* Flush any lazy unmaps that may reference this domain */
1501         if (!intel_iommu_strict)
1502                 flush_unmaps_timeout(0);
1503
1504         domain_remove_dev_info(domain);
1505         /* destroy iovas */
1506         put_iova_domain(&domain->iovad);
1507
1508         /* clear ptes */
1509         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1510
1511         /* free page tables */
1512         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1513
1514         for_each_active_iommu(iommu, drhd)
1515                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1516                         iommu_detach_domain(domain, iommu);
1517
1518         free_domain_mem(domain);
1519 }
1520
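/*
 * Install the context entry so that DMA from (@segment, @bus, @devfn) is
 * translated through @domain's page tables (or passed through when
 * @translation is CONTEXT_TT_PASS_THROUGH).  VM and static-identity
 * domains can span several iommus, so a per-iommu domain id is allocated
 * for them here.
 */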
1521 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1522                                  u8 bus, u8 devfn, int translation)
1523 {
1524         struct context_entry *context;
1525         unsigned long flags;
1526         struct intel_iommu *iommu;
1527         struct dma_pte *pgd;
1528         unsigned long num;
1529         unsigned long ndomains;
1530         int id;
1531         int agaw;
1532         struct device_domain_info *info = NULL;
1533
1534         pr_debug("Set context mapping for %02x:%02x.%d\n",
1535                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1536
1537         BUG_ON(!domain->pgd);
1538         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1539                translation != CONTEXT_TT_MULTI_LEVEL);
1540
1541         iommu = device_to_iommu(segment, bus, devfn);
1542         if (!iommu)
1543                 return -ENODEV;
1544
1545         context = device_to_context_entry(iommu, bus, devfn);
1546         if (!context)
1547                 return -ENOMEM;
1548         spin_lock_irqsave(&iommu->lock, flags);
1549         if (context_present(context)) {
1550                 spin_unlock_irqrestore(&iommu->lock, flags);
1551                 return 0;
1552         }
1553
1554         id = domain->id;
1555         pgd = domain->pgd;
1556
1557         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1558             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1559                 int found = 0;
1560
1561                 /* find an available domain id for this device in iommu */
1562                 ndomains = cap_ndoms(iommu->cap);
1563                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1564                         if (iommu->domains[num] == domain) {
1565                                 id = num;
1566                                 found = 1;
1567                                 break;
1568                         }
1569                 }
1570
1571                 if (found == 0) {
1572                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1573                         if (num >= ndomains) {
1574                                 spin_unlock_irqrestore(&iommu->lock, flags);
1575                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1576                                 return -EFAULT;
1577                         }
1578
1579                         set_bit(num, iommu->domain_ids);
1580                         iommu->domains[num] = domain;
1581                         id = num;
1582                 }
1583
1584                 /* Skip top levels of page tables for an
1585                  * iommu that has a smaller agaw than the default.
1586                  * Unnecessary for PT mode.
1587                  */
1588                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1589                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1590                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1591                                 if (!dma_pte_present(pgd)) {
1592                                         spin_unlock_irqrestore(&iommu->lock, flags);
1593                                         return -ENOMEM;
1594                                 }
1595                         }
1596                 }
1597         }
1598
1599         context_set_domain_id(context, id);
1600
1601         if (translation != CONTEXT_TT_PASS_THROUGH) {
1602                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1603                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1604                                      CONTEXT_TT_MULTI_LEVEL;
1605         }
1606         /*
1607          * In pass through mode, AW must be programmed to indicate the largest
1608          * AGAW value supported by hardware. And ASR is ignored by hardware.
1609          */
1610         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1611                 context_set_address_width(context, iommu->msagaw);
1612         else {
1613                 context_set_address_root(context, virt_to_phys(pgd));
1614                 context_set_address_width(context, iommu->agaw);
1615         }
1616
1617         context_set_translation_type(context, translation);
1618         context_set_fault_enable(context);
1619         context_set_present(context);
1620         domain_flush_cache(domain, context, sizeof(*context));
1621
1622         /*
1623          * It's a non-present to present mapping. If hardware doesn't cache
1624          * non-present entries, we only need to flush the write-buffer. If it
1625          * _does_ cache non-present entries, then it does so in the special
1626          * domain #0, which we have to flush:
1627          */
1628         if (cap_caching_mode(iommu->cap)) {
1629                 iommu->flush.flush_context(iommu, 0,
1630                                            (((u16)bus) << 8) | devfn,
1631                                            DMA_CCMD_MASK_NOBIT,
1632                                            DMA_CCMD_DEVICE_INVL);
1633                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1634         } else {
1635                 iommu_flush_write_buffer(iommu);
1636         }
1637         iommu_enable_dev_iotlb(info);
1638         spin_unlock_irqrestore(&iommu->lock, flags);
1639
1640         spin_lock_irqsave(&domain->iommu_lock, flags);
1641         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1642                 domain->iommu_count++;
1643                 if (domain->iommu_count == 1)
1644                         domain->nid = iommu->node;
1645                 domain_update_iommu_cap(domain);
1646         }
1647         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1648         return 0;
1649 }
1650
1651 static int
1652 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1653                         int translation)
1654 {
1655         int ret;
1656         struct pci_dev *tmp, *parent;
1657
1658         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1659                                          pdev->bus->number, pdev->devfn,
1660                                          translation);
1661         if (ret)
1662                 return ret;
1663
1664         /* dependent device mapping */
1665         tmp = pci_find_upstream_pcie_bridge(pdev);
1666         if (!tmp)
1667                 return 0;
1668         /* Secondary interface's bus number and devfn 0 */
1669         parent = pdev->bus->self;
1670         while (parent != tmp) {
1671                 ret = domain_context_mapping_one(domain,
1672                                                  pci_domain_nr(parent->bus),
1673                                                  parent->bus->number,
1674                                                  parent->devfn, translation);
1675                 if (ret)
1676                         return ret;
1677                 parent = parent->bus->self;
1678         }
1679         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1680                 return domain_context_mapping_one(domain,
1681                                         pci_domain_nr(tmp->subordinate),
1682                                         tmp->subordinate->number, 0,
1683                                         translation);
1684         else /* this is a legacy PCI bridge */
1685                 return domain_context_mapping_one(domain,
1686                                                   pci_domain_nr(tmp->bus),
1687                                                   tmp->bus->number,
1688                                                   tmp->devfn,
1689                                                   translation);
1690 }
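/*
 * An illustrative (hypothetical) call sequence for a conventional PCI
 * device 0000:06:01.0 sitting directly behind a PCIe-to-PCI bridge whose
 * secondary bus is 06.  Requests forwarded by such a bridge carry the
 * bridge's secondary bus number with devfn 0 as their source-id, so the
 * same domain must be programmed for both source-ids:
 *
 *        domain_context_mapping_one(domain, 0, 0x06,
 *                                   PCI_DEVFN(1, 0), translation);  <-- the device
 *        domain_context_mapping_one(domain, 0, 0x06,
 *                                   0, translation);                <-- bridge source-id
 *
 * Any intermediate P2P bridges between the device and the upstream PCIe
 * bridge get one additional call each via the parent walk above; a legacy
 * PCI bridge is instead mapped with its own bus number and devfn.
 */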
1691
1692 static int domain_context_mapped(struct pci_dev *pdev)
1693 {
1694         int ret;
1695         struct pci_dev *tmp, *parent;
1696         struct intel_iommu *iommu;
1697
1698         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1699                                 pdev->devfn);
1700         if (!iommu)
1701                 return -ENODEV;
1702
1703         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1704         if (!ret)
1705                 return ret;
1706         /* dependent device mapping */
1707         tmp = pci_find_upstream_pcie_bridge(pdev);
1708         if (!tmp)
1709                 return ret;
1710         /* Secondary interface's bus number and devfn 0 */
1711         parent = pdev->bus->self;
1712         while (parent != tmp) {
1713                 ret = device_context_mapped(iommu, parent->bus->number,
1714                                             parent->devfn);
1715                 if (!ret)
1716                         return ret;
1717                 parent = parent->bus->self;
1718         }
1719         if (pci_is_pcie(tmp))
1720                 return device_context_mapped(iommu, tmp->subordinate->number,
1721                                              0);
1722         else
1723                 return device_context_mapped(iommu, tmp->bus->number,
1724                                              tmp->devfn);
1725 }
1726
1727 /* Returns a number of VTD pages, but aligned to MM page size */
1728 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1729                                             size_t size)
1730 {
1731         host_addr &= ~PAGE_MASK;
1732         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1733 }
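/*
 * A worked example (4KiB pages assumed): for host_addr = 0x1234 and
 * size = 0x3000 the in-page offset is 0x234, PAGE_ALIGN(0x234 + 0x3000)
 * is 0x4000, so aligned_nrpages() returns 0x4000 >> VTD_PAGE_SHIFT = 4
 * VT-d pages even though the size alone only covers three pages.
 */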
1734
1735 /* Return largest possible superpage level for a given mapping */
1736 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1737                                           unsigned long iov_pfn,
1738                                           unsigned long phy_pfn,
1739                                           unsigned long pages)
1740 {
1741         int support, level = 1;
1742         unsigned long pfnmerge;
1743
1744         support = domain->iommu_superpage;
1745
1746         /* To use a large page, the virtual *and* physical addresses
1747            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1748            of them will mean we have to use smaller pages. So just
1749            merge them and check both at once. */
1750         pfnmerge = iov_pfn | phy_pfn;
1751
1752         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1753                 pages >>= VTD_STRIDE_SHIFT;
1754                 if (!pages)
1755                         break;
1756                 pfnmerge >>= VTD_STRIDE_SHIFT;
1757                 level++;
1758                 support--;
1759         }
1760         return level;
1761 }
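/*
 * A worked example, assuming VTD_STRIDE_SHIFT == 9 (512 PTEs per level)
 * and domain->iommu_superpage >= 1: for iov_pfn = 0x200, phy_pfn = 0x1400
 * and pages = 0x600, pfnmerge = 0x1600 has its low 9 bits clear and
 * 0x600 >> 9 = 3 is non-zero, so level 2 is returned and the caller can
 * cover the first 512 pages (2MiB) with a single large PTE.
 */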
1762
1763 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1764                             struct scatterlist *sg, unsigned long phys_pfn,
1765                             unsigned long nr_pages, int prot)
1766 {
1767         struct dma_pte *first_pte = NULL, *pte = NULL;
1768         phys_addr_t uninitialized_var(pteval);
1769         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1770         unsigned long sg_res;
1771         unsigned int largepage_lvl = 0;
1772         unsigned long lvl_pages = 0;
1773
1774         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1775
1776         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1777                 return -EINVAL;
1778
1779         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1780
1781         if (sg)
1782                 sg_res = 0;
1783         else {
1784                 sg_res = nr_pages + 1;
1785                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1786         }
1787
1788         while (nr_pages > 0) {
1789                 uint64_t tmp;
1790
1791                 if (!sg_res) {
1792                         sg_res = aligned_nrpages(sg->offset, sg->length);
1793                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1794                         sg->dma_length = sg->length;
1795                         pteval = page_to_phys(sg_page(sg)) | prot;
1796                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1797                 }
1798
1799                 if (!pte) {
1800                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1801
1802                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1803                         if (!pte)
1804                                 return -ENOMEM;
1805                         /* It is a large page */
1806                         if (largepage_lvl > 1) {
1807                                 pteval |= DMA_PTE_LARGE_PAGE;
1808                                 /* Ensure that old small page tables are removed to make room
1809                                    for superpage, if they exist. */
1810                                 dma_pte_clear_range(domain, iov_pfn,
1811                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1812                                 dma_pte_free_pagetable(domain, iov_pfn,
1813                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1814                         } else {
1815                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1816                         }
1817
1818                 }
1819                 /* We don't need a lock here; nobody else
1820                  * touches this iova range
1821                  */
1822                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1823                 if (tmp) {
1824                         static int dumps = 5;
1825                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1826                                iov_pfn, tmp, (unsigned long long)pteval);
1827                         if (dumps) {
1828                                 dumps--;
1829                                 debug_dma_dump_mappings(NULL);
1830                         }
1831                         WARN_ON(1);
1832                 }
1833
1834                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1835
1836                 BUG_ON(nr_pages < lvl_pages);
1837                 BUG_ON(sg_res < lvl_pages);
1838
1839                 nr_pages -= lvl_pages;
1840                 iov_pfn += lvl_pages;
1841                 phys_pfn += lvl_pages;
1842                 pteval += lvl_pages * VTD_PAGE_SIZE;
1843                 sg_res -= lvl_pages;
1844
1845                 /* If the next PTE would be the first in a new page, then we
1846                    need to flush the cache on the entries we've just written.
1847                    And then we'll need to recalculate 'pte', so clear it and
1848                    let it get set again in the if (!pte) block above.
1849
1850                    If we're done (!nr_pages) we need to flush the cache too.
1851
1852                    Also if we've been setting superpages, we may need to
1853                    recalculate 'pte' and switch back to smaller pages for the
1854                    end of the mapping, if the trailing size is not enough to
1855                    use another superpage (i.e. sg_res < lvl_pages). */
1856                 pte++;
1857                 if (!nr_pages || first_pte_in_page(pte) ||
1858                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1859                         domain_flush_cache(domain, first_pte,
1860                                            (void *)pte - (void *)first_pte);
1861                         pte = NULL;
1862                 }
1863
1864                 if (!sg_res && nr_pages)
1865                         sg = sg_next(sg);
1866         }
1867         return 0;
1868 }
1869
1870 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1871                                     struct scatterlist *sg, unsigned long nr_pages,
1872                                     int prot)
1873 {
1874         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1875 }
1876
1877 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1878                                      unsigned long phys_pfn, unsigned long nr_pages,
1879                                      int prot)
1880 {
1881         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1882 }
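/*
 * A minimal usage sketch (hypothetical values): to map a physically
 * contiguous 16-page region read/write,
 *
 *        domain_pfn_mapping(domain, iov_pfn, phys_pfn, 16,
 *                           DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * domain_sg_mapping() is the scatterlist variant: the physical pages come
 * from the sg elements themselves and the phys_pfn argument is unused.
 */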
1883
1884 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1885 {
1886         if (!iommu)
1887                 return;
1888
1889         clear_context_table(iommu, bus, devfn);
1890         iommu->flush.flush_context(iommu, 0, 0, 0,
1891                                            DMA_CCMD_GLOBAL_INVL);
1892         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1893 }
1894
1895 static inline void unlink_domain_info(struct device_domain_info *info)
1896 {
1897         assert_spin_locked(&device_domain_lock);
1898         list_del(&info->link);
1899         list_del(&info->global);
1900         if (info->dev)
1901                 info->dev->dev.archdata.iommu = NULL;
1902 }
1903
1904 static void domain_remove_dev_info(struct dmar_domain *domain)
1905 {
1906         struct device_domain_info *info;
1907         unsigned long flags;
1908         struct intel_iommu *iommu;
1909
1910         spin_lock_irqsave(&device_domain_lock, flags);
1911         while (!list_empty(&domain->devices)) {
1912                 info = list_entry(domain->devices.next,
1913                         struct device_domain_info, link);
1914                 unlink_domain_info(info);
1915                 spin_unlock_irqrestore(&device_domain_lock, flags);
1916
1917                 iommu_disable_dev_iotlb(info);
1918                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1919                 iommu_detach_dev(iommu, info->bus, info->devfn);
1920                 free_devinfo_mem(info);
1921
1922                 spin_lock_irqsave(&device_domain_lock, flags);
1923         }
1924         spin_unlock_irqrestore(&device_domain_lock, flags);
1925 }
1926
1927 /*
1928  * find_domain
1929  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1930  */
1931 static struct dmar_domain *
1932 find_domain(struct pci_dev *pdev)
1933 {
1934         struct device_domain_info *info;
1935
1936         /* No lock here, assumes no domain exit in normal case */
1937         info = pdev->dev.archdata.iommu;
1938         if (info)
1939                 return info->domain;
1940         return NULL;
1941 }
1942
1943 /* domain is initialized */
1944 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1945 {
1946         struct dmar_domain *domain, *found = NULL;
1947         struct intel_iommu *iommu;
1948         struct dmar_drhd_unit *drhd;
1949         struct device_domain_info *info, *tmp;
1950         struct pci_dev *dev_tmp;
1951         unsigned long flags;
1952         int bus = 0, devfn = 0;
1953         int segment;
1954         int ret;
1955
1956         domain = find_domain(pdev);
1957         if (domain)
1958                 return domain;
1959
1960         segment = pci_domain_nr(pdev->bus);
1961
1962         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1963         if (dev_tmp) {
1964                 if (pci_is_pcie(dev_tmp)) {
1965                         bus = dev_tmp->subordinate->number;
1966                         devfn = 0;
1967                 } else {
1968                         bus = dev_tmp->bus->number;
1969                         devfn = dev_tmp->devfn;
1970                 }
1971                 spin_lock_irqsave(&device_domain_lock, flags);
1972                 list_for_each_entry(info, &device_domain_list, global) {
1973                         if (info->segment == segment &&
1974                             info->bus == bus && info->devfn == devfn) {
1975                                 found = info->domain;
1976                                 break;
1977                         }
1978                 }
1979                 spin_unlock_irqrestore(&device_domain_lock, flags);
1980                 /* pcie-pci bridge already has a domain, use it */
1981                 if (found) {
1982                         domain = found;
1983                         goto found_domain;
1984                 }
1985         }
1986
1987         domain = alloc_domain();
1988         if (!domain)
1989                 goto error;
1990
1991         /* Allocate new domain for the device */
1992         drhd = dmar_find_matched_drhd_unit(pdev);
1993         if (!drhd) {
1994                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1995                         pci_name(pdev));
1996                 free_domain_mem(domain);
1997                 return NULL;
1998         }
1999         iommu = drhd->iommu;
2000
2001         ret = iommu_attach_domain(domain, iommu);
2002         if (ret) {
2003                 free_domain_mem(domain);
2004                 goto error;
2005         }
2006
2007         if (domain_init(domain, gaw)) {
2008                 domain_exit(domain);
2009                 goto error;
2010         }
2011
2012         /* register pcie-to-pci device */
2013         if (dev_tmp) {
2014                 info = alloc_devinfo_mem();
2015                 if (!info) {
2016                         domain_exit(domain);
2017                         goto error;
2018                 }
2019                 info->segment = segment;
2020                 info->bus = bus;
2021                 info->devfn = devfn;
2022                 info->dev = NULL;
2023                 info->domain = domain;
2024                 /* This domain is shared by devices under p2p bridge */
2025                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2026
2027                 /* pcie-to-pci bridge already has a domain, use it */
2028                 found = NULL;
2029                 spin_lock_irqsave(&device_domain_lock, flags);
2030                 list_for_each_entry(tmp, &device_domain_list, global) {
2031                         if (tmp->segment == segment &&
2032                             tmp->bus == bus && tmp->devfn == devfn) {
2033                                 found = tmp->domain;
2034                                 break;
2035                         }
2036                 }
2037                 if (found) {
2038                         spin_unlock_irqrestore(&device_domain_lock, flags);
2039                         free_devinfo_mem(info);
2040                         domain_exit(domain);
2041                         domain = found;
2042                 } else {
2043                         list_add(&info->link, &domain->devices);
2044                         list_add(&info->global, &device_domain_list);
2045                         spin_unlock_irqrestore(&device_domain_lock, flags);
2046                 }
2047         }
2048
2049 found_domain:
2050         info = alloc_devinfo_mem();
2051         if (!info)
2052                 goto error;
2053         info->segment = segment;
2054         info->bus = pdev->bus->number;
2055         info->devfn = pdev->devfn;
2056         info->dev = pdev;
2057         info->domain = domain;
2058         spin_lock_irqsave(&device_domain_lock, flags);
2059         /* somebody else beat us to it */
2060         found = find_domain(pdev);
2061         if (found != NULL) {
2062                 spin_unlock_irqrestore(&device_domain_lock, flags);
2063                 if (found != domain) {
2064                         domain_exit(domain);
2065                         domain = found;
2066                 }
2067                 free_devinfo_mem(info);
2068                 return domain;
2069         }
2070         list_add(&info->link, &domain->devices);
2071         list_add(&info->global, &device_domain_list);
2072         pdev->dev.archdata.iommu = info;
2073         spin_unlock_irqrestore(&device_domain_lock, flags);
2074         return domain;
2075 error:
2076         /* recheck it here; somebody else may have set it */
2077         return find_domain(pdev);
2078 }
2079
2080 static int iommu_identity_mapping;
2081 #define IDENTMAP_ALL            1
2082 #define IDENTMAP_GFX            2
2083 #define IDENTMAP_AZALIA         4
2084
2085 static int iommu_domain_identity_map(struct dmar_domain *domain,
2086                                      unsigned long long start,
2087                                      unsigned long long end)
2088 {
2089         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2090         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2091
2092         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2093                           dma_to_mm_pfn(last_vpfn))) {
2094                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2095                 return -ENOMEM;
2096         }
2097
2098         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2099                  start, end, domain->id);
2100         /*
2101          * The RMRR range might overlap with the physical memory range,
2102          * so clear it first
2103          */
2104         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2105
2106         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2107                                   last_vpfn - first_vpfn + 1,
2108                                   DMA_PTE_READ|DMA_PTE_WRITE);
2109 }
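/*
 * A worked example (4KiB pages assumed): for an RMRR covering
 * 0xa0000-0xbffff, first_vpfn = 0xa0 and last_vpfn = 0xbf, so 0x20 page
 * frames are reserved in the iova allocator and then mapped 1:1, i.e.
 * bus address == physical address across the whole range.
 */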
2110
2111 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2112                                       unsigned long long start,
2113                                       unsigned long long end)
2114 {
2115         struct dmar_domain *domain;
2116         int ret;
2117
2118         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2119         if (!domain)
2120                 return -ENOMEM;
2121
2122         /* For _hardware_ passthrough, don't bother. But for software
2123            passthrough, we do it anyway -- it may indicate a memory
2124            range which is reserved in E820, and so didn't get set
2125            up in si_domain to start with */
2126         if (domain == si_domain && hw_pass_through) {
2127                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2128                        pci_name(pdev), start, end);
2129                 return 0;
2130         }
2131
2132         printk(KERN_INFO
2133                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2134                pci_name(pdev), start, end);
2135         
2136         if (end < start) {
2137                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2138                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2139                         dmi_get_system_info(DMI_BIOS_VENDOR),
2140                         dmi_get_system_info(DMI_BIOS_VERSION),
2141                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2142                 ret = -EIO;
2143                 goto error;
2144         }
2145
2146         if (end >> agaw_to_width(domain->agaw)) {
2147                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2148                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2149                      agaw_to_width(domain->agaw),
2150                      dmi_get_system_info(DMI_BIOS_VENDOR),
2151                      dmi_get_system_info(DMI_BIOS_VERSION),
2152                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2153                 ret = -EIO;
2154                 goto error;
2155         }
2156
2157         ret = iommu_domain_identity_map(domain, start, end);
2158         if (ret)
2159                 goto error;
2160
2161         /* context entry init */
2162         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2163         if (ret)
2164                 goto error;
2165
2166         return 0;
2167
2168  error:
2169         domain_exit(domain);
2170         return ret;
2171 }
2172
2173 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2174         struct pci_dev *pdev)
2175 {
2176         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2177                 return 0;
2178         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2179                 rmrr->end_address);
2180 }
2181
2182 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2183 static inline void iommu_prepare_isa(void)
2184 {
2185         struct pci_dev *pdev;
2186         int ret;
2187
2188         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2189         if (!pdev)
2190                 return;
2191
2192         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2193         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2194
2195         if (ret)
2196                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2197                        "floppy might not work\n");
2198
2199 }
2200 #else
2201 static inline void iommu_prepare_isa(void)
2202 {
2203         return;
2204 }
2205 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2206
2207 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2208
2209 static int __init si_domain_init(int hw)
2210 {
2211         struct dmar_drhd_unit *drhd;
2212         struct intel_iommu *iommu;
2213         int nid, ret = 0;
2214
2215         si_domain = alloc_domain();
2216         if (!si_domain)
2217                 return -EFAULT;
2218
2219         for_each_active_iommu(iommu, drhd) {
2220                 ret = iommu_attach_domain(si_domain, iommu);
2221                 if (ret) {
2222                         domain_exit(si_domain);
2223                         return -EFAULT;
2224                 }
2225         }
2226
2227         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2228                 domain_exit(si_domain);
2229                 return -EFAULT;
2230         }
2231
2232         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2233         pr_debug("IOMMU: identity mapping domain is domain %d\n",
2234                  si_domain->id);
2235
2236         if (hw)
2237                 return 0;
2238
2239         for_each_online_node(nid) {
2240                 unsigned long start_pfn, end_pfn;
2241                 int i;
2242
2243                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2244                         ret = iommu_domain_identity_map(si_domain,
2245                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2246                         if (ret)
2247                                 return ret;
2248                 }
2249         }
2250
2251         return 0;
2252 }
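/*
 * An illustration of the software pass-through case (hw == 0) above, with
 * hypothetical numbers: for a node-0 memory range of PFNs [0x10, 0x80000)
 * the loop calls
 *
 *        iommu_domain_identity_map(si_domain, PFN_PHYS(0x10),
 *                                  PFN_PHYS(0x80000));
 *
 * so bus address equals physical address for that RAM.  With hardware
 * pass-through (hw != 0) the context entries alone provide the identity
 * translation and no page tables are populated here.
 */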
2253
2254 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2255                                           struct pci_dev *pdev);
2256 static int identity_mapping(struct pci_dev *pdev)
2257 {
2258         struct device_domain_info *info;
2259
2260         if (likely(!iommu_identity_mapping))
2261                 return 0;
2262
2263         info = pdev->dev.archdata.iommu;
2264         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2265                 return (info->domain == si_domain);
2266
2267         return 0;
2268 }
2269
2270 static int domain_add_dev_info(struct dmar_domain *domain,
2271                                struct pci_dev *pdev,
2272                                int translation)
2273 {
2274         struct device_domain_info *info;
2275         unsigned long flags;
2276         int ret;
2277
2278         info = alloc_devinfo_mem();
2279         if (!info)
2280                 return -ENOMEM;
2281
2282         info->segment = pci_domain_nr(pdev->bus);
2283         info->bus = pdev->bus->number;
2284         info->devfn = pdev->devfn;
2285         info->dev = pdev;
2286         info->domain = domain;
2287
2288         spin_lock_irqsave(&device_domain_lock, flags);
2289         list_add(&info->link, &domain->devices);
2290         list_add(&info->global, &device_domain_list);
2291         pdev->dev.archdata.iommu = info;
2292         spin_unlock_irqrestore(&device_domain_lock, flags);
2293
2294         ret = domain_context_mapping(domain, pdev, translation);
2295         if (ret) {
2296                 spin_lock_irqsave(&device_domain_lock, flags);
2297                 unlink_domain_info(info);
2298                 spin_unlock_irqrestore(&device_domain_lock, flags);
2299                 free_devinfo_mem(info);
2300                 return ret;
2301         }
2302
2303         return 0;
2304 }
2305
2306 static bool device_has_rmrr(struct pci_dev *dev)
2307 {
2308         struct dmar_rmrr_unit *rmrr;
2309         int i;
2310
2311         for_each_rmrr_units(rmrr) {
2312                 for (i = 0; i < rmrr->devices_cnt; i++) {
2313                         /*
2314                          * Return TRUE if this RMRR contains the device that
2315                          * is passed in.
2316                          */
2317                         if (rmrr->devices[i] == dev)
2318                                 return true;
2319                 }
2320         }
2321         return false;
2322 }
2323
2324 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2325 {
2326
2327         /*
2328          * We want to prevent any device associated with an RMRR from
2329          * getting placed into the SI Domain. This is done because
2330          * problems exist when devices are moved in and out of domains
2331          * and their respective RMRR info is lost. We exempt USB devices
2332          * from this process due to their usage of RMRRs that are known
2333          * to not be needed after BIOS hand-off to OS.
2334          */
2335         if (device_has_rmrr(pdev) &&
2336             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2337                 return 0;
2338
2339         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2340                 return 1;
2341
2342         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2343                 return 1;
2344
2345         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2346                 return 0;
2347
2348         /*
2349          * We want to start off with all devices in the 1:1 domain, and
2350          * take them out later if we find they can't access all of memory.
2351          *
2352          * However, we can't do this for PCI devices behind bridges,
2353          * because all PCI devices behind the same bridge will end up
2354          * with the same source-id on their transactions.
2355          *
2356          * Practically speaking, we can't change things around for these
2357          * devices at run-time, because we can't be sure there'll be no
2358          * DMA transactions in flight for any of their siblings.
2359          * 
2360          * So PCI devices (unless they're on the root bus) as well as
2361          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2362          * the 1:1 domain, just in _case_ one of their siblings turns out
2363          * not to be able to map all of memory.
2364          */
2365         if (!pci_is_pcie(pdev)) {
2366                 if (!pci_is_root_bus(pdev->bus))
2367                         return 0;
2368                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2369                         return 0;
2370         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2371                 return 0;
2372
2373         /* 
2374          * At boot time, we don't yet know if devices will be 64-bit capable.
2375          * Assume that they will -- if they turn out not to be, then we can 
2376          * take them out of the 1:1 domain later.
2377          */
2378         if (!startup) {
2379                 /*
2380                  * If the device's dma_mask is less than the system's memory
2381                  * size then this is not a candidate for identity mapping.
2382                  */
2383                 u64 dma_mask = pdev->dma_mask;
2384
2385                 if (pdev->dev.coherent_dma_mask &&
2386                     pdev->dev.coherent_dma_mask < dma_mask)
2387                         dma_mask = pdev->dev.coherent_dma_mask;
2388
2389                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2390         }
2391
2392         return 1;
2393 }
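/*
 * A worked example of the run-time (!startup) check above, with
 * hypothetical numbers: on a machine with 8GiB of RAM,
 * dma_get_required_mask() reports at least DMA_BIT_MASK(33), so a device
 * whose dma_mask (and coherent_dma_mask) is only DMA_BIT_MASK(32) fails
 * the comparison and is taken out of the 1:1 domain, while a 64-bit
 * capable device stays identity mapped.
 */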
2394
2395 static int __init iommu_prepare_static_identity_mapping(int hw)
2396 {
2397         struct pci_dev *pdev = NULL;
2398         int ret;
2399
2400         ret = si_domain_init(hw);
2401         if (ret)
2402                 return -EFAULT;
2403
2404         for_each_pci_dev(pdev) {
2405                 if (iommu_should_identity_map(pdev, 1)) {
2406                         ret = domain_add_dev_info(si_domain, pdev,
2407                                              hw ? CONTEXT_TT_PASS_THROUGH :
2408                                                   CONTEXT_TT_MULTI_LEVEL);
2409                         if (ret) {
2410                                 /* device not associated with an iommu */
2411                                 if (ret == -ENODEV)
2412                                         continue;
2413                                 return ret;
2414                         }
2415                         pr_info("IOMMU: %s identity mapping for device %s\n",
2416                                 hw ? "hardware" : "software", pci_name(pdev));
2417                 }
2418         }
2419
2420         return 0;
2421 }
2422
2423 static int __init init_dmars(void)
2424 {
2425         struct dmar_drhd_unit *drhd;
2426         struct dmar_rmrr_unit *rmrr;
2427         struct pci_dev *pdev;
2428         struct intel_iommu *iommu;
2429         int i, ret;
2430
2431         /*
2432          * for each drhd
2433          *    allocate root
2434          *    initialize and program root entry to not present
2435          * endfor
2436          */
2437         for_each_drhd_unit(drhd) {
2438                 /*
2439                  * lock not needed as this is only incremented in the single-
2440                  * threaded kernel __init code path; all other accesses are
2441                  * read-only
2442                  */
2443                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2444                         g_num_of_iommus++;
2445                         continue;
2446                 }
2447                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2448                           IOMMU_UNITS_SUPPORTED);
2449         }
2450
2451         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2452                         GFP_KERNEL);
2453         if (!g_iommus) {
2454                 printk(KERN_ERR "Allocating global iommu array failed\n");
2455                 ret = -ENOMEM;
2456                 goto error;
2457         }
2458
2459         deferred_flush = kzalloc(g_num_of_iommus *
2460                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2461         if (!deferred_flush) {
2462                 ret = -ENOMEM;
2463                 goto error;
2464         }
2465
2466         for_each_active_iommu(iommu, drhd) {
2467                 g_iommus[iommu->seq_id] = iommu;
2468
2469                 ret = iommu_init_domains(iommu);
2470                 if (ret)
2471                         goto error;
2472
2473                 /*
2474                  * TBD:
2475                  * we could share the same root & context tables
2476                  * among all IOMMUs. Need to split it out later.
2477                  */
2478                 ret = iommu_alloc_root_entry(iommu);
2479                 if (ret) {
2480                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2481                         goto error;
2482                 }
2483                 if (!ecap_pass_through(iommu->ecap))
2484                         hw_pass_through = 0;
2485         }
2486
2487         /*
2488          * Start from a sane IOMMU hardware state.
2489          */
2490         for_each_active_iommu(iommu, drhd) {
2491                 /*
2492                  * If the queued invalidation is already initialized by us
2493                  * (for example, while enabling interrupt-remapping) then
2494          * things are already rolling from a sane state.
2495                  */
2496                 if (iommu->qi)
2497                         continue;
2498
2499                 /*
2500                  * Clear any previous faults.
2501                  */
2502                 dmar_fault(-1, iommu);
2503                 /*
2504                  * Disable queued invalidation if supported and already enabled
2505                  * before OS handover.
2506                  */
2507                 dmar_disable_qi(iommu);
2508         }
2509
2510         for_each_active_iommu(iommu, drhd) {
2511                 if (dmar_enable_qi(iommu)) {
2512                         /*
2513                          * Queued Invalidate not enabled, use Register Based
2514                          * Invalidate
2515                          */
2516                         iommu->flush.flush_context = __iommu_flush_context;
2517                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2518                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2519                                "invalidation\n",
2520                                 iommu->seq_id,
2521                                (unsigned long long)drhd->reg_base_addr);
2522                 } else {
2523                         iommu->flush.flush_context = qi_flush_context;
2524                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2525                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2526                                "invalidation\n",
2527                                 iommu->seq_id,
2528                                (unsigned long long)drhd->reg_base_addr);
2529                 }
2530         }
2531
2532         if (iommu_pass_through)
2533                 iommu_identity_mapping |= IDENTMAP_ALL;
2534
2535 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2536         iommu_identity_mapping |= IDENTMAP_GFX;
2537 #endif
2538
2539         check_tylersburg_isoch();
2540
2541         /*
2542          * If pass-through is not set or not enabled, set up context entries
2543          * for identity mappings for RMRR, GFX and ISA, and possibly fall back
2544          * to static identity mapping if iommu_identity_mapping is set.
2545          */
2546         if (iommu_identity_mapping) {
2547                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2548                 if (ret) {
2549                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2550                         goto error;
2551                 }
2552         }
2553         /*
2554          * For each rmrr
2555          *   for each dev attached to rmrr
2556          *   do
2557          *     locate drhd for dev, alloc domain for dev
2558          *     allocate free domain
2559          *     allocate page table entries for rmrr
2560          *     if context not allocated for bus
2561          *           allocate and init context
2562          *           set present in root table for this bus
2563          *     init context with domain, translation etc
2564          *    endfor
2565          * endfor
2566          */
2567         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2568         for_each_rmrr_units(rmrr) {
2569                 for (i = 0; i < rmrr->devices_cnt; i++) {
2570                         pdev = rmrr->devices[i];
2571                         /*
2572                          * some BIOSes list non-existent devices in the
2573                          * DMAR table.
2574                          */
2575                         if (!pdev)
2576                                 continue;
2577                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2578                         if (ret)
2579                                 printk(KERN_ERR
2580                                        "IOMMU: mapping reserved region failed\n");
2581                 }
2582         }
2583
2584         iommu_prepare_isa();
2585
2586         /*
2587          * for each drhd
2588          *   enable fault log
2589          *   global invalidate context cache
2590          *   global invalidate iotlb
2591          *   enable translation
2592          */
2593         for_each_iommu(iommu, drhd) {
2594                 if (drhd->ignored) {
2595                         /*
2596                          * we always have to disable PMRs or DMA may fail on
2597                          * this device
2598                          */
2599                         if (force_on)
2600                                 iommu_disable_protect_mem_regions(iommu);
2601                         continue;
2602                 }
2603
2604                 iommu_flush_write_buffer(iommu);
2605
2606                 ret = dmar_set_interrupt(iommu);
2607                 if (ret)
2608                         goto error;
2609
2610                 iommu_set_root_entry(iommu);
2611
2612                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2613                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2614
2615                 ret = iommu_enable_translation(iommu);
2616                 if (ret)
2617                         goto error;
2618
2619                 iommu_disable_protect_mem_regions(iommu);
2620         }
2621
2622         return 0;
2623 error:
2624         for_each_active_iommu(iommu, drhd)
2625                 free_dmar_iommu(iommu);
2626         kfree(g_iommus);
2627         return ret;
2628 }
2629
2630 /* This takes a number of _MM_ pages, not VTD pages */
2631 static struct iova *intel_alloc_iova(struct device *dev,
2632                                      struct dmar_domain *domain,
2633                                      unsigned long nrpages, uint64_t dma_mask)
2634 {
2635         struct pci_dev *pdev = to_pci_dev(dev);
2636         struct iova *iova = NULL;
2637
2638         /* Restrict dma_mask to the width that the iommu can handle */
2639         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2640
2641         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2642                 /*
2643                  * First try to allocate an io virtual address in
2644                  * DMA_BIT_MASK(32) and if that fails then try allocating
2645                  * from higher range
2646                  */
2647                 iova = alloc_iova(&domain->iovad, nrpages,
2648                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2649                 if (iova)
2650                         return iova;
2651         }
2652         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2653         if (unlikely(!iova)) {
2654                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2655                        nrpages, pci_name(pdev));
2656                 return NULL;
2657         }
2658
2659         return iova;
2660 }
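/*
 * An illustrative allocation, assuming dmar_forcedac is not set: for a
 * device with a 64-bit dma_mask in a domain whose gaw is 48, the mask is
 * first clamped to DOMAIN_MAX_ADDR(48); the allocator then tries to place
 * the iova below 4GiB (pfn limit IOVA_PFN(DMA_BIT_MASK(32)) == 0xfffff)
 * and only falls back to the full range if that space is exhausted.
 */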
2661
2662 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2663 {
2664         struct dmar_domain *domain;
2665         int ret;
2666
2667         domain = get_domain_for_dev(pdev,
2668                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2669         if (!domain) {
2670                 printk(KERN_ERR
2671                         "Allocating domain for %s failed", pci_name(pdev));
2672                 return NULL;
2673         }
2674
2675         /* make sure context mapping is ok */
2676         if (unlikely(!domain_context_mapped(pdev))) {
2677                 ret = domain_context_mapping(domain, pdev,
2678                                              CONTEXT_TT_MULTI_LEVEL);
2679                 if (ret) {
2680                         printk(KERN_ERR
2681                                 "Domain context map for %s failed",
2682                                 pci_name(pdev));
2683                         return NULL;
2684                 }
2685         }
2686
2687         return domain;
2688 }
2689
2690 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2691 {
2692         struct device_domain_info *info;
2693
2694         /* No lock here, assumes no domain exit in normal case */
2695         info = dev->dev.archdata.iommu;
2696         if (likely(info))
2697                 return info->domain;
2698
2699         return __get_valid_domain_for_dev(dev);
2700 }
2701
2702 static int iommu_dummy(struct pci_dev *pdev)
2703 {
2704         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2705 }
2706
2707 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2708 static int iommu_no_mapping(struct device *dev)
2709 {
2710         struct pci_dev *pdev;
2711         int found;
2712
2713         if (unlikely(!dev_is_pci(dev)))
2714                 return 1;
2715
2716         pdev = to_pci_dev(dev);
2717         if (iommu_dummy(pdev))
2718                 return 1;
2719
2720         if (!iommu_identity_mapping)
2721                 return 0;
2722
2723         found = identity_mapping(pdev);
2724         if (found) {
2725                 if (iommu_should_identity_map(pdev, 0))
2726                         return 1;
2727                 else {
2728                         /*
2729                          * A 32-bit DMA device is removed from si_domain and
2730                          * falls back to non-identity mapping.
2731                          */
2732                         domain_remove_one_dev_info(si_domain, pdev);
2733                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2734                                pci_name(pdev));
2735                         return 0;
2736                 }
2737         } else {
2738                 /*
2739                  * When a 64-bit DMA device is detached from a VM, the device
2740                  * is put into si_domain for identity mapping.
2741                  */
2742                 if (iommu_should_identity_map(pdev, 0)) {
2743                         int ret;
2744                         ret = domain_add_dev_info(si_domain, pdev,
2745                                                   hw_pass_through ?
2746                                                   CONTEXT_TT_PASS_THROUGH :
2747                                                   CONTEXT_TT_MULTI_LEVEL);
2748                         if (!ret) {
2749                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2750                                        pci_name(pdev));
2751                                 return 1;
2752                         }
2753                 }
2754         }
2755
2756         return 0;
2757 }
2758
2759 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2760                                      size_t size, int dir, u64 dma_mask)
2761 {
2762         struct pci_dev *pdev = to_pci_dev(hwdev);
2763         struct dmar_domain *domain;
2764         phys_addr_t start_paddr;
2765         struct iova *iova;
2766         int prot = 0;
2767         int ret;
2768         struct intel_iommu *iommu;
2769         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2770
2771         BUG_ON(dir == DMA_NONE);
2772
2773         if (iommu_no_mapping(hwdev))
2774                 return paddr;
2775
2776         domain = get_valid_domain_for_dev(pdev);
2777         if (!domain)
2778                 return 0;
2779
2780         iommu = domain_get_iommu(domain);
2781         size = aligned_nrpages(paddr, size);
2782
2783         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2784         if (!iova)
2785                 goto error;
2786
2787         /*
2788          * Check if DMAR supports zero-length reads on write only
2789          * mappings.
2790          */
2791         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2792                         !cap_zlr(iommu->cap))
2793                 prot |= DMA_PTE_READ;
2794         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2795                 prot |= DMA_PTE_WRITE;
2796         /*
2797          * paddr .. (paddr + size) might span a partial page, so we should map
2798          * the whole page.  Note: if two parts of one page are mapped
2799          * separately, we might have two guest_addr values mapping to the same
2800          * host paddr, but this is not a big problem
2801          */
2802         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2803                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2804         if (ret)
2805                 goto error;
2806
2807         /* it's a non-present to present mapping. Only flush if caching mode */
2808         if (cap_caching_mode(iommu->cap))
2809                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2810         else
2811                 iommu_flush_write_buffer(iommu);
2812
2813         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2814         start_paddr += paddr & ~PAGE_MASK;
2815         return start_paddr;
2816
2817 error:
2818         if (iova)
2819                 __free_iova(&domain->iovad, iova);
2820         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2821                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2822         return 0;
2823 }
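/*
 * A worked example of the partial-page handling above (4KiB pages,
 * hypothetical addresses): mapping paddr = 0x12345678 with size = 0x10
 * gives aligned_nrpages() == 1, so one full page is mapped at the
 * allocated iova and the returned bus address is
 * ((phys_addr_t)iova->pfn_lo << PAGE_SHIFT) + 0x678.
 */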
2824
2825 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2826                                  unsigned long offset, size_t size,
2827                                  enum dma_data_direction dir,
2828                                  struct dma_attrs *attrs)
2829 {
2830         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2831                                   dir, to_pci_dev(dev)->dma_mask);
2832 }
2833
2834 static void flush_unmaps(void)
2835 {
2836         int i, j;
2837
2838         timer_on = 0;
2839
2840         /* just flush them all */
2841         for (i = 0; i < g_num_of_iommus; i++) {
2842                 struct intel_iommu *iommu = g_iommus[i];
2843                 if (!iommu)
2844                         continue;
2845
2846                 if (!deferred_flush[i].next)
2847                         continue;
2848
2849                 /* In caching mode, global flushes make emulation expensive */
2850                 if (!cap_caching_mode(iommu->cap))
2851                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2852                                          DMA_TLB_GLOBAL_FLUSH);
2853                 for (j = 0; j < deferred_flush[i].next; j++) {
2854                         unsigned long mask;
2855                         struct iova *iova = deferred_flush[i].iova[j];
2856                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2857
2858                         /* On real hardware multiple invalidations are expensive */
2859                         if (cap_caching_mode(iommu->cap))
2860                                 iommu_flush_iotlb_psi(iommu, domain->id,
2861                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2862                         else {
2863                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2864                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2865                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2866                         }
2867                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2868                 }
2869                 deferred_flush[i].next = 0;
2870         }
2871
2872         list_size = 0;
2873 }
2874
2875 static void flush_unmaps_timeout(unsigned long data)
2876 {
2877         unsigned long flags;
2878
2879         spin_lock_irqsave(&async_umap_flush_lock, flags);
2880         flush_unmaps();
2881         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2882 }
2883
2884 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2885 {
2886         unsigned long flags;
2887         int next, iommu_id;
2888         struct intel_iommu *iommu;
2889
2890         spin_lock_irqsave(&async_umap_flush_lock, flags);
2891         if (list_size == HIGH_WATER_MARK)
2892                 flush_unmaps();
2893
2894         iommu = domain_get_iommu(dom);
2895         iommu_id = iommu->seq_id;
2896
2897         next = deferred_flush[iommu_id].next;
2898         deferred_flush[iommu_id].domain[next] = dom;
2899         deferred_flush[iommu_id].iova[next] = iova;
2900         deferred_flush[iommu_id].next++;
2901
2902         if (!timer_on) {
2903                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2904                 timer_on = 1;
2905         }
2906         list_size++;
2907         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2908 }
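/*
 * In the non-strict (lazy) unmap path, intel_unmap_page() and
 * intel_unmap_sg() call add_unmap() instead of flushing immediately.
 * Entries are queued per IOMMU and the queue is drained either when
 * list_size reaches HIGH_WATER_MARK or when the 10ms unmap_timer fires,
 * so on real hardware a burst of N unmaps costs roughly one global IOTLB
 * flush per IOMMU instead of N individual flushes (in caching mode each
 * entry still gets a PSI flush, as noted in flush_unmaps() above).
 */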
2909
2910 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2911                              size_t size, enum dma_data_direction dir,
2912                              struct dma_attrs *attrs)
2913 {
2914         struct pci_dev *pdev = to_pci_dev(dev);
2915         struct dmar_domain *domain;
2916         unsigned long start_pfn, last_pfn;
2917         struct iova *iova;
2918         struct intel_iommu *iommu;
2919
2920         if (iommu_no_mapping(dev))
2921                 return;
2922
2923         domain = find_domain(pdev);
2924         BUG_ON(!domain);
2925
2926         iommu = domain_get_iommu(domain);
2927
2928         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2929         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2930                       (unsigned long long)dev_addr))
2931                 return;
2932
2933         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2934         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2935
2936         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2937                  pci_name(pdev), start_pfn, last_pfn);
2938
2939         /*  clear the whole page */
2940         dma_pte_clear_range(domain, start_pfn, last_pfn);
2941
2942         /* free page tables */
2943         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2944
2945         if (intel_iommu_strict) {
2946                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2947                                       last_pfn - start_pfn + 1, 0);
2948                 /* free iova */
2949                 __free_iova(&domain->iovad, iova);
2950         } else {
2951                 add_unmap(domain, iova);
2952                 /*
2953                  * queue up the release of the unmap to save the roughly 1/6 of
2954                  * the CPU time used up by the iotlb flush operation...
2955                  */
2956         }
2957 }
2958
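     /*
      * Coherent allocations are plain pages from __get_free_pages() mapped
      * through __intel_map_single().  GFP_DMA/GFP_DMA32 are only forced when
      * the device bypasses translation and its coherent mask cannot reach
      * all of the memory the platform may hand out.
      */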
2959 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2960                                   dma_addr_t *dma_handle, gfp_t flags,
2961                                   struct dma_attrs *attrs)
2962 {
2963         void *vaddr;
2964         int order;
2965
2966         size = PAGE_ALIGN(size);
2967         order = get_order(size);
2968
2969         if (!iommu_no_mapping(hwdev))
2970                 flags &= ~(GFP_DMA | GFP_DMA32);
2971         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2972                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2973                         flags |= GFP_DMA;
2974                 else
2975                         flags |= GFP_DMA32;
2976         }
2977
2978         vaddr = (void *)__get_free_pages(flags, order);
2979         if (!vaddr)
2980                 return NULL;
2981         memset(vaddr, 0, size);
2982
2983         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2984                                          DMA_BIDIRECTIONAL,
2985                                          hwdev->coherent_dma_mask);
2986         if (*dma_handle)
2987                 return vaddr;
2988         free_pages((unsigned long)vaddr, order);
2989         return NULL;
2990 }
2991
2992 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2993                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
2994 {
2995         int order;
2996
2997         size = PAGE_ALIGN(size);
2998         order = get_order(size);
2999
3000         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3001         free_pages((unsigned long)vaddr, order);
3002 }
3003
3004 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3005                            int nelems, enum dma_data_direction dir,
3006                            struct dma_attrs *attrs)
3007 {
3008         struct pci_dev *pdev = to_pci_dev(hwdev);
3009         struct dmar_domain *domain;
3010         unsigned long start_pfn, last_pfn;
3011         struct iova *iova;
3012         struct intel_iommu *iommu;
3013
3014         if (iommu_no_mapping(hwdev))
3015                 return;
3016
3017         domain = find_domain(pdev);
3018         BUG_ON(!domain);
3019
3020         iommu = domain_get_iommu(domain);
3021
3022         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3023         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at address %llx\n",
3024                       (unsigned long long)sglist[0].dma_address))
3025                 return;
3026
3027         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3028         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3029
3030         /*  clear the whole page */
3031         dma_pte_clear_range(domain, start_pfn, last_pfn);
3032
3033         /* free page tables */
3034         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3035
3036         if (intel_iommu_strict) {
3037                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3038                                       last_pfn - start_pfn + 1, 0);
3039                 /* free iova */
3040                 __free_iova(&domain->iovad, iova);
3041         } else {
3042                 add_unmap(domain, iova);
3043                 /*
3044                  * Queue up the IOVA release so the IOTLB flush can be
3045                  * batched, saving roughly 1/6th of the per-unmap flush CPU cost.
3046                  */
3047         }
3048 }
3049
3050 static int intel_nontranslate_map_sg(struct device *hddev,
3051         struct scatterlist *sglist, int nelems, int dir)
3052 {
3053         int i;
3054         struct scatterlist *sg;
3055
3056         for_each_sg(sglist, sg, nelems, i) {
3057                 BUG_ON(!sg_page(sg));
3058                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3059                 sg->dma_length = sg->length;
3060         }
3061         return nelems;
3062 }
3063
3064 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3065                         enum dma_data_direction dir, struct dma_attrs *attrs)
3066 {
3067         int i;
3068         struct pci_dev *pdev = to_pci_dev(hwdev);
3069         struct dmar_domain *domain;
3070         size_t size = 0;
3071         int prot = 0;
3072         struct iova *iova = NULL;
3073         int ret;
3074         struct scatterlist *sg;
3075         unsigned long start_vpfn;
3076         struct intel_iommu *iommu;
3077
3078         BUG_ON(dir == DMA_NONE);
3079         if (iommu_no_mapping(hwdev))
3080                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3081
3082         domain = get_valid_domain_for_dev(pdev);
3083         if (!domain)
3084                 return 0;
3085
3086         iommu = domain_get_iommu(domain);
3087
3088         for_each_sg(sglist, sg, nelems, i)
3089                 size += aligned_nrpages(sg->offset, sg->length);
3090
3091         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3092                                 pdev->dma_mask);
3093         if (!iova) {
3094                 sglist->dma_length = 0;
3095                 return 0;
3096         }
3097
3098         /*
3099          * Check if DMAR supports zero-length reads on write-only
3100          * mappings.
3101          */
3102         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3103                         !cap_zlr(iommu->cap))
3104                 prot |= DMA_PTE_READ;
3105         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3106                 prot |= DMA_PTE_WRITE;
3107
3108         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3109
3110         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3111         if (unlikely(ret)) {
3112                 /*  clear the page */
3113                 dma_pte_clear_range(domain, start_vpfn,
3114                                     start_vpfn + size - 1);
3115                 /* free page tables */
3116                 dma_pte_free_pagetable(domain, start_vpfn,
3117                                        start_vpfn + size - 1);
3118                 /* free iova */
3119                 __free_iova(&domain->iovad, iova);
3120                 return 0;
3121         }
3122
3123         /* it's a non-present to present mapping. Only flush if caching mode */
3124         if (cap_caching_mode(iommu->cap))
3125                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3126         else
3127                 iommu_flush_write_buffer(iommu);
3128
3129         return nelems;
3130 }
3131
3132 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3133 {
3134         return !dma_addr;
3135 }
3136
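     /*
      * These callbacks back the generic DMA API for devices behind VT-d once
      * intel_iommu_init() installs them as the global dma_ops; e.g. a driver's
      *
      *     dma_addr_t dma = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
      *
      * is routed to intel_map_page() and normally returns an IOVA translated
      * by the VT-d page tables (identity-mapped devices simply get the
      * physical address).
      */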
3137 struct dma_map_ops intel_dma_ops = {
3138         .alloc = intel_alloc_coherent,
3139         .free = intel_free_coherent,
3140         .map_sg = intel_map_sg,
3141         .unmap_sg = intel_unmap_sg,
3142         .map_page = intel_map_page,
3143         .unmap_page = intel_unmap_page,
3144         .mapping_error = intel_mapping_error,
3145 };
3146
3147 static inline int iommu_domain_cache_init(void)
3148 {
3149         int ret = 0;
3150
3151         iommu_domain_cache = kmem_cache_create("iommu_domain",
3152                                          sizeof(struct dmar_domain),
3153                                          0,
3154                                          SLAB_HWCACHE_ALIGN,
3155                                          NULL);
3156
3157         if (!iommu_domain_cache) {
3158                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3159                 ret = -ENOMEM;
3160         }
3161
3162         return ret;
3163 }
3164
3165 static inline int iommu_devinfo_cache_init(void)
3166 {
3167         int ret = 0;
3168
3169         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3170                                          sizeof(struct device_domain_info),
3171                                          0,
3172                                          SLAB_HWCACHE_ALIGN,
3173                                          NULL);
3174         if (!iommu_devinfo_cache) {
3175                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3176                 ret = -ENOMEM;
3177         }
3178
3179         return ret;
3180 }
3181
3182 static inline int iommu_iova_cache_init(void)
3183 {
3184         int ret = 0;
3185
3186         iommu_iova_cache = kmem_cache_create("iommu_iova",
3187                                          sizeof(struct iova),
3188                                          0,
3189                                          SLAB_HWCACHE_ALIGN,
3190                                          NULL);
3191         if (!iommu_iova_cache) {
3192                 printk(KERN_ERR "Couldn't create iova cache\n");
3193                 ret = -ENOMEM;
3194         }
3195
3196         return ret;
3197 }
3198
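     /*
      * iommu_init_mempool() creates the three slab caches used by the
      * allocation helpers in this file (iova, dmar_domain and
      * device_domain_info objects); on failure, the caches that were
      * already created are destroyed again in reverse order.
      */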
3199 static int __init iommu_init_mempool(void)
3200 {
3201         int ret;
3202         ret = iommu_iova_cache_init();
3203         if (ret)
3204                 return ret;
3205
3206         ret = iommu_domain_cache_init();
3207         if (ret)
3208                 goto domain_error;
3209
3210         ret = iommu_devinfo_cache_init();
3211         if (!ret)
3212                 return ret;
3213
3214         kmem_cache_destroy(iommu_domain_cache);
3215 domain_error:
3216         kmem_cache_destroy(iommu_iova_cache);
3217
3218         return -ENOMEM;
3219 }
3220
3221 static void __init iommu_exit_mempool(void)
3222 {
3223         kmem_cache_destroy(iommu_devinfo_cache);
3224         kmem_cache_destroy(iommu_domain_cache);
3225         kmem_cache_destroy(iommu_iova_cache);
3226
3227 }
3228
3229 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3230 {
3231         struct dmar_drhd_unit *drhd;
3232         u32 vtbar;
3233         int rc;
3234
3235         /* We know that this device on this chipset has its own IOMMU.
3236          * If we find it under a different IOMMU, then the BIOS is lying
3237          * to us. Hope that the IOMMU for this device is actually
3238          * disabled, and it needs no translation...
3239          */
3240         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3241         if (rc) {
3242                 /* "can't" happen */
3243                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3244                 return;
3245         }
3246         vtbar &= 0xffff0000;
3247
3248         /* we know that this device's IOMMU should be at offset 0xa000 from vtbar */
3249         drhd = dmar_find_matched_drhd_unit(pdev);
3250         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3251                             TAINT_FIRMWARE_WORKAROUND,
3252                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3253                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3254 }
3255 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3256
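     /*
      * Two passes over the DRHD units: units whose device scope contains no
      * present PCI devices are marked ignored, and units that cover only
      * graphics devices either set intel_iommu_gfx_mapped or, when
      * dmar_map_gfx is clear, are ignored with their devices marked as
      * DUMMY_DEVICE_DOMAIN_INFO.
      */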
3257 static void __init init_no_remapping_devices(void)
3258 {
3259         struct dmar_drhd_unit *drhd;
3260
3261         for_each_drhd_unit(drhd) {
3262                 if (!drhd->include_all) {
3263                         int i;
3264                         for (i = 0; i < drhd->devices_cnt; i++)
3265                                 if (drhd->devices[i] != NULL)
3266                                         break;
3267                         /* ignore DMAR unit if no pci devices exist */
3268                         if (i == drhd->devices_cnt)
3269                                 drhd->ignored = 1;
3270                 }
3271         }
3272
3273         for_each_active_drhd_unit(drhd) {
3274                 int i;
3275                 if (drhd->include_all)
3276                         continue;
3277
3278                 for (i = 0; i < drhd->devices_cnt; i++)
3279                         if (drhd->devices[i] &&
3280                             !IS_GFX_DEVICE(drhd->devices[i]))
3281                                 break;
3282
3283                 if (i < drhd->devices_cnt)
3284                         continue;
3285
3286                 /* This IOMMU has *only* gfx devices. Either bypass it or
3287                    set the gfx_mapped flag, as appropriate */
3288                 if (dmar_map_gfx) {
3289                         intel_iommu_gfx_mapped = 1;
3290                 } else {
3291                         drhd->ignored = 1;
3292                         for (i = 0; i < drhd->devices_cnt; i++) {
3293                                 if (!drhd->devices[i])
3294                                         continue;
3295                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3296                         }
3297                 }
3298         }
3299 }
3300
3301 #ifdef CONFIG_SUSPEND
3302 static int init_iommu_hw(void)
3303 {
3304         struct dmar_drhd_unit *drhd;
3305         struct intel_iommu *iommu = NULL;
3306
3307         for_each_active_iommu(iommu, drhd)
3308                 if (iommu->qi)
3309                         dmar_reenable_qi(iommu);
3310
3311         for_each_iommu(iommu, drhd) {
3312                 if (drhd->ignored) {
3313                         /*
3314                          * we always have to disable PMRs or DMA may fail on
3315                          * this device
3316                          */
3317                         if (force_on)
3318                                 iommu_disable_protect_mem_regions(iommu);
3319                         continue;
3320                 }
3321
3322                 iommu_flush_write_buffer(iommu);
3323
3324                 iommu_set_root_entry(iommu);
3325
3326                 iommu->flush.flush_context(iommu, 0, 0, 0,
3327                                            DMA_CCMD_GLOBAL_INVL);
3328                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3329                                          DMA_TLB_GLOBAL_FLUSH);
3330                 if (iommu_enable_translation(iommu))
3331                         return 1;
3332                 iommu_disable_protect_mem_regions(iommu);
3333         }
3334
3335         return 0;
3336 }
3337
3338 static void iommu_flush_all(void)
3339 {
3340         struct dmar_drhd_unit *drhd;
3341         struct intel_iommu *iommu;
3342
3343         for_each_active_iommu(iommu, drhd) {
3344                 iommu->flush.flush_context(iommu, 0, 0, 0,
3345                                            DMA_CCMD_GLOBAL_INVL);
3346                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3347                                          DMA_TLB_GLOBAL_FLUSH);
3348         }
3349 }
3350
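     /*
      * Suspend saves the four fault-event registers (FECTL, FEDATA, FEADDR,
      * FEUADDR) of every active IOMMU after disabling translation;
      * iommu_resume() re-runs init_iommu_hw() and then writes the saved
      * values back.
      */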
3351 static int iommu_suspend(void)
3352 {
3353         struct dmar_drhd_unit *drhd;
3354         struct intel_iommu *iommu = NULL;
3355         unsigned long flag;
3356
3357         for_each_active_iommu(iommu, drhd) {
3358                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3359                                                  GFP_ATOMIC);
3360                 if (!iommu->iommu_state)
3361                         goto nomem;
3362         }
3363
3364         iommu_flush_all();
3365
3366         for_each_active_iommu(iommu, drhd) {
3367                 iommu_disable_translation(iommu);
3368
3369                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3370
3371                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3372                         readl(iommu->reg + DMAR_FECTL_REG);
3373                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3374                         readl(iommu->reg + DMAR_FEDATA_REG);
3375                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3376                         readl(iommu->reg + DMAR_FEADDR_REG);
3377                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3378                         readl(iommu->reg + DMAR_FEUADDR_REG);
3379
3380                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3381         }
3382         return 0;
3383
3384 nomem:
3385         for_each_active_iommu(iommu, drhd)
3386                 kfree(iommu->iommu_state);
3387
3388         return -ENOMEM;
3389 }
3390
3391 static void iommu_resume(void)
3392 {
3393         struct dmar_drhd_unit *drhd;
3394         struct intel_iommu *iommu = NULL;
3395         unsigned long flag;
3396
3397         if (init_iommu_hw()) {
3398                 if (force_on)
3399                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3400                 else
3401                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3402                 return;
3403         }
3404
3405         for_each_active_iommu(iommu, drhd) {
3406
3407                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3408
3409                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3410                         iommu->reg + DMAR_FECTL_REG);
3411                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3412                         iommu->reg + DMAR_FEDATA_REG);
3413                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3414                         iommu->reg + DMAR_FEADDR_REG);
3415                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3416                         iommu->reg + DMAR_FEUADDR_REG);
3417
3418                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3419         }
3420
3421         for_each_active_iommu(iommu, drhd)
3422                 kfree(iommu->iommu_state);
3423 }
3424
3425 static struct syscore_ops iommu_syscore_ops = {
3426         .resume         = iommu_resume,
3427         .suspend        = iommu_suspend,
3428 };
3429
3430 static void __init init_iommu_pm_ops(void)
3431 {
3432         register_syscore_ops(&iommu_syscore_ops);
3433 }
3434
3435 #else
3436 static inline void init_iommu_pm_ops(void) {}
3437 #endif  /* CONFIG_SUSPEND */
3438
3439 LIST_HEAD(dmar_rmrr_units);
3440
3441 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3442 {
3443         list_add(&rmrr->list, &dmar_rmrr_units);
3444 }
3445
3446
3447 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3448 {
3449         struct acpi_dmar_reserved_memory *rmrr;
3450         struct dmar_rmrr_unit *rmrru;
3451
3452         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3453         if (!rmrru)
3454                 return -ENOMEM;
3455
3456         rmrru->hdr = header;
3457         rmrr = (struct acpi_dmar_reserved_memory *)header;
3458         rmrru->base_address = rmrr->base_address;
3459         rmrru->end_address = rmrr->end_address;
3460
3461         dmar_register_rmrr_unit(rmrru);
3462         return 0;
3463 }
3464
3465 static int __init
3466 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3467 {
3468         struct acpi_dmar_reserved_memory *rmrr;
3469         int ret;
3470
3471         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3472         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3473                 ((void *)rmrr) + rmrr->header.length,
3474                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3475
3476         if (ret || (rmrru->devices_cnt == 0)) {
3477                 list_del(&rmrru->list);
3478                 kfree(rmrru);
3479         }
3480         return ret;
3481 }
3482
3483 static LIST_HEAD(dmar_atsr_units);
3484
3485 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3486 {
3487         struct acpi_dmar_atsr *atsr;
3488         struct dmar_atsr_unit *atsru;
3489
3490         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3491         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3492         if (!atsru)
3493                 return -ENOMEM;
3494
3495         atsru->hdr = hdr;
3496         atsru->include_all = atsr->flags & 0x1;
3497
3498         list_add(&atsru->list, &dmar_atsr_units);
3499
3500         return 0;
3501 }
3502
3503 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3504 {
3505         int rc;
3506         struct acpi_dmar_atsr *atsr;
3507
3508         if (atsru->include_all)
3509                 return 0;
3510
3511         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3512         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3513                                 (void *)atsr + atsr->header.length,
3514                                 &atsru->devices_cnt, &atsru->devices,
3515                                 atsr->segment);
3516         if (rc || !atsru->devices_cnt) {
3517                 list_del(&atsru->list);
3518                 kfree(atsru);
3519         }
3520
3521         return rc;
3522 }
3523
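     /*
      * Returns non-zero when Address Translation Services (ATS) may be used
      * for @dev: the ATSR for the device's PCI segment is looked up, then the
      * bus hierarchy is walked upwards; the device qualifies if its root port
      * is listed in the ATSR scope, or if the ATSR is "include all".
      */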
3524 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3525 {
3526         int i;
3527         struct pci_bus *bus;
3528         struct acpi_dmar_atsr *atsr;
3529         struct dmar_atsr_unit *atsru;
3530
3531         dev = pci_physfn(dev);
3532
3533         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3534                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3535                 if (atsr->segment == pci_domain_nr(dev->bus))
3536                         goto found;
3537         }
3538
3539         return 0;
3540
3541 found:
3542         for (bus = dev->bus; bus; bus = bus->parent) {
3543                 struct pci_dev *bridge = bus->self;
3544
3545                 if (!bridge || !pci_is_pcie(bridge) ||
3546                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3547                         return 0;
3548
3549                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3550                         for (i = 0; i < atsru->devices_cnt; i++)
3551                                 if (atsru->devices[i] == bridge)
3552                                         return 1;
3553                         break;
3554                 }
3555         }
3556
3557         if (atsru->include_all)
3558                 return 1;
3559
3560         return 0;
3561 }
3562
3563 int __init dmar_parse_rmrr_atsr_dev(void)
3564 {
3565         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3566         struct dmar_atsr_unit *atsr, *atsr_n;
3567         int ret = 0;
3568
3569         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3570                 ret = rmrr_parse_dev(rmrr);
3571                 if (ret)
3572                         return ret;
3573         }
3574
3575         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3576                 ret = atsr_parse_dev(atsr);
3577                 if (ret)
3578                         return ret;
3579         }
3580
3581         return ret;
3582 }
3583
3584 /*
3585  * Here we only respond to a device being unbound from its driver.
3586  *
3587  * A newly added device is not attached to its DMAR domain here yet; that
3588  * happens when the device is first mapped to an iova.
3589  */
3590 static int device_notifier(struct notifier_block *nb,
3591                                   unsigned long action, void *data)
3592 {
3593         struct device *dev = data;
3594         struct pci_dev *pdev = to_pci_dev(dev);
3595         struct dmar_domain *domain;
3596
3597         if (iommu_no_mapping(dev))
3598                 return 0;
3599
3600         domain = find_domain(pdev);
3601         if (!domain)
3602                 return 0;
3603
3604         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3605                 domain_remove_one_dev_info(domain, pdev);
3606
3607                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3608                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3609                     list_empty(&domain->devices))
3610                         domain_exit(domain);
3611         }
3612
3613         return 0;
3614 }
3615
3616 static struct notifier_block device_nb = {
3617         .notifier_call = device_notifier,
3618 };
3619
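     /*
      * Boot-time entry point: parse the DMAR table and device scopes, bail
      * out if VT-d is disabled, set up the mempools and reserved IOVA
      * ranges, run init_dmars(), and finally install intel_dma_ops, the
      * IOMMU API ops and the bus notifier.
      */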
3620 int __init intel_iommu_init(void)
3621 {
3622         int ret = 0;
3623         struct dmar_drhd_unit *drhd;
3624         struct intel_iommu *iommu;
3625
3626         /* VT-d is required for a TXT/tboot launch, so enforce that */
3627         force_on = tboot_force_iommu();
3628
3629         if (dmar_table_init()) {
3630                 if (force_on)
3631                         panic("tboot: Failed to initialize DMAR table\n");
3632                 return  -ENODEV;
3633         }
3634
3635         /*
3636          * Disable translation if already enabled prior to OS handover.
3637          */
3638         for_each_active_iommu(iommu, drhd)
3639                 if (iommu->gcmd & DMA_GCMD_TE)
3640                         iommu_disable_translation(iommu);
3641
3642         if (dmar_dev_scope_init() < 0) {
3643                 if (force_on)
3644                         panic("tboot: Failed to initialize DMAR device scope\n");
3645                 return  -ENODEV;
3646         }
3647
3648         if (no_iommu || dmar_disabled)
3649                 return -ENODEV;
3650
3651         if (iommu_init_mempool()) {
3652                 if (force_on)
3653                         panic("tboot: Failed to initialize iommu memory\n");
3654                 return  -ENODEV;
3655         }
3656
3657         if (list_empty(&dmar_rmrr_units))
3658                 printk(KERN_INFO "DMAR: No RMRR found\n");
3659
3660         if (list_empty(&dmar_atsr_units))
3661                 printk(KERN_INFO "DMAR: No ATSR found\n");
3662
3663         if (dmar_init_reserved_ranges()) {
3664                 if (force_on)
3665                         panic("tboot: Failed to reserve iommu ranges\n");
3666                 return  -ENODEV;
3667         }
3668
3669         init_no_remapping_devices();
3670
3671         ret = init_dmars();
3672         if (ret) {
3673                 if (force_on)
3674                         panic("tboot: Failed to initialize DMARs\n");
3675                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3676                 put_iova_domain(&reserved_iova_list);
3677                 iommu_exit_mempool();
3678                 return ret;
3679         }
3680         printk(KERN_INFO
3681         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3682
3683         init_timer(&unmap_timer);
3684 #ifdef CONFIG_SWIOTLB
3685         swiotlb = 0;
3686 #endif
3687         dma_ops = &intel_dma_ops;
3688
3689         init_iommu_pm_ops();
3690
3691         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3692
3693         bus_register_notifier(&pci_bus_type, &device_nb);
3694
3695         intel_iommu_enabled = 1;
3696
3697         return 0;
3698 }
3699
3700 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3701                                            struct pci_dev *pdev)
3702 {
3703         struct pci_dev *tmp, *parent;
3704
3705         if (!iommu || !pdev)
3706                 return;
3707
3708         /* dependent device detach */
3709         tmp = pci_find_upstream_pcie_bridge(pdev);
3710         /* Secondary interface's bus number and devfn 0 */
3711         if (tmp) {
3712                 parent = pdev->bus->self;
3713                 while (parent != tmp) {
3714                         iommu_detach_dev(iommu, parent->bus->number,
3715                                          parent->devfn);
3716                         parent = parent->bus->self;
3717                 }
3718                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3719                         iommu_detach_dev(iommu,
3720                                 tmp->subordinate->number, 0);
3721                 else /* this is a legacy PCI bridge */
3722                         iommu_detach_dev(iommu, tmp->bus->number,
3723                                          tmp->devfn);
3724         }
3725 }
3726
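     /*
      * Detach @pdev from @domain.  If no other device behind the same IOMMU
      * is left in the domain, the IOMMU is cleared from the domain's bitmap
      * and, unless the domain is a VM or static-identity domain, the domain
      * id is released on that IOMMU as well.
      */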
3727 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3728                                           struct pci_dev *pdev)
3729 {
3730         struct device_domain_info *info, *tmp;
3731         struct intel_iommu *iommu;
3732         unsigned long flags;
3733         int found = 0;
3734
3735         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3736                                 pdev->devfn);
3737         if (!iommu)
3738                 return;
3739
3740         spin_lock_irqsave(&device_domain_lock, flags);
3741         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
3742                 if (info->segment == pci_domain_nr(pdev->bus) &&
3743                     info->bus == pdev->bus->number &&
3744                     info->devfn == pdev->devfn) {
3745                         unlink_domain_info(info);
3746                         spin_unlock_irqrestore(&device_domain_lock, flags);
3747
3748                         iommu_disable_dev_iotlb(info);
3749                         iommu_detach_dev(iommu, info->bus, info->devfn);
3750                         iommu_detach_dependent_devices(iommu, pdev);
3751                         free_devinfo_mem(info);
3752
3753                         spin_lock_irqsave(&device_domain_lock, flags);
3754
3755                         if (found)
3756                                 break;
3757                         else
3758                                 continue;
3759                 }
3760
3761                 /* if there are no other devices under the same iommu
3762                  * owned by this domain, clear this iommu from iommu_bmp
3763                  * and update the iommu count and coherency
3764                  */
3765                 if (iommu == device_to_iommu(info->segment, info->bus,
3766                                             info->devfn))
3767                         found = 1;
3768         }
3769
3770         spin_unlock_irqrestore(&device_domain_lock, flags);
3771
3772         if (found == 0) {
3773                 unsigned long tmp_flags;
3774                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3775                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3776                 domain->iommu_count--;
3777                 domain_update_iommu_cap(domain);
3778                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3779
3780                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3781                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3782                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3783                         clear_bit(domain->id, iommu->domain_ids);
3784                         iommu->domains[domain->id] = NULL;
3785                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3786                 }
3787         }
3788 }
3789
3790 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3791 {
3792         struct device_domain_info *info;
3793         struct intel_iommu *iommu;
3794         unsigned long flags1, flags2;
3795
3796         spin_lock_irqsave(&device_domain_lock, flags1);
3797         while (!list_empty(&domain->devices)) {
3798                 info = list_entry(domain->devices.next,
3799                         struct device_domain_info, link);
3800                 unlink_domain_info(info);
3801                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3802
3803                 iommu_disable_dev_iotlb(info);
3804                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3805                 iommu_detach_dev(iommu, info->bus, info->devfn);
3806                 iommu_detach_dependent_devices(iommu, info->dev);
3807
3808                 /* clear this iommu in iommu_bmp, update iommu count
3809                  * and capabilities
3810                  */
3811                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3812                 if (test_and_clear_bit(iommu->seq_id,
3813                                        domain->iommu_bmp)) {
3814                         domain->iommu_count--;
3815                         domain_update_iommu_cap(domain);
3816                 }
3817                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3818
3819                 free_devinfo_mem(info);
3820                 spin_lock_irqsave(&device_domain_lock, flags1);
3821         }
3822         spin_unlock_irqrestore(&device_domain_lock, flags1);
3823 }
3824
3825 /* domain id for virtual machine domains; it is never set in a context entry */
3826 static atomic_t vm_domid = ATOMIC_INIT(0);
3827
3828 static struct dmar_domain *iommu_alloc_vm_domain(void)
3829 {
3830         struct dmar_domain *domain;
3831
3832         domain = alloc_domain_mem();
3833         if (!domain)
3834                 return NULL;
3835
3836         domain->id = atomic_inc_return(&vm_domid);
3837         domain->nid = -1;
3838         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3839         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3840
3841         return domain;
3842 }
3843
3844 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3845 {
3846         int adjust_width;
3847
3848         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3849         spin_lock_init(&domain->iommu_lock);
3850
3851         domain_reserve_special_ranges(domain);
3852
3853         /* calculate AGAW */
3854         domain->gaw = guest_width;
3855         adjust_width = guestwidth_to_adjustwidth(guest_width);
3856         domain->agaw = width_to_agaw(adjust_width);
3857
3858         INIT_LIST_HEAD(&domain->devices);
3859
3860         domain->iommu_count = 0;
3861         domain->iommu_coherency = 0;
3862         domain->iommu_snooping = 0;
3863         domain->iommu_superpage = 0;
3864         domain->max_addr = 0;
3865         domain->nid = -1;
3866
3867         /* always allocate the top pgd */
3868         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3869         if (!domain->pgd)
3870                 return -ENOMEM;
3871         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3872         return 0;
3873 }
3874
3875 static void iommu_free_vm_domain(struct dmar_domain *domain)
3876 {
3877         unsigned long flags;
3878         struct dmar_drhd_unit *drhd;
3879         struct intel_iommu *iommu;
3880         unsigned long i;
3881         unsigned long ndomains;
3882
3883         for_each_active_iommu(iommu, drhd) {
3884                 ndomains = cap_ndoms(iommu->cap);
3885                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3886                         if (iommu->domains[i] == domain) {
3887                                 spin_lock_irqsave(&iommu->lock, flags);
3888                                 clear_bit(i, iommu->domain_ids);
3889                                 iommu->domains[i] = NULL;
3890                                 spin_unlock_irqrestore(&iommu->lock, flags);
3891                                 break;
3892                         }
3893                 }
3894         }
3895 }
3896
3897 static void vm_domain_exit(struct dmar_domain *domain)
3898 {
3899         /* Domain 0 is reserved, so don't process it */
3900         if (!domain)
3901                 return;
3902
3903         vm_domain_remove_all_dev_info(domain);
3904         /* destroy iovas */
3905         put_iova_domain(&domain->iovad);
3906
3907         /* clear ptes */
3908         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3909
3910         /* free page tables */
3911         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3912
3913         iommu_free_vm_domain(domain);
3914         free_domain_mem(domain);
3915 }
3916
3917 static int intel_iommu_domain_init(struct iommu_domain *domain)
3918 {
3919         struct dmar_domain *dmar_domain;
3920
3921         dmar_domain = iommu_alloc_vm_domain();
3922         if (!dmar_domain) {
3923                 printk(KERN_ERR
3924                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3925                 return -ENOMEM;
3926         }
3927         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3928                 printk(KERN_ERR
3929                         "intel_iommu_domain_init() failed\n");
3930                 vm_domain_exit(dmar_domain);
3931                 return -ENOMEM;
3932         }
3933         domain_update_iommu_cap(dmar_domain);
3934         domain->priv = dmar_domain;
3935
3936         domain->geometry.aperture_start = 0;
3937         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3938         domain->geometry.force_aperture = true;
3939
3940         return 0;
3941 }
3942
3943 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3944 {
3945         struct dmar_domain *dmar_domain = domain->priv;
3946
3947         domain->priv = NULL;
3948         vm_domain_exit(dmar_domain);
3949 }
3950
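     /*
      * IOMMU API attach: any previous DMA-API mapping of the device is torn
      * down first, then the domain's address width is clamped to what this
      * IOMMU can handle and surplus top-level page-table levels are dropped
      * before the device is added to the domain.
      */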
3951 static int intel_iommu_attach_device(struct iommu_domain *domain,
3952                                      struct device *dev)
3953 {
3954         struct dmar_domain *dmar_domain = domain->priv;
3955         struct pci_dev *pdev = to_pci_dev(dev);
3956         struct intel_iommu *iommu;
3957         int addr_width;
3958
3959         /* normally pdev is not mapped */
3960         if (unlikely(domain_context_mapped(pdev))) {
3961                 struct dmar_domain *old_domain;
3962
3963                 old_domain = find_domain(pdev);
3964                 if (old_domain) {
3965                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3966                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3967                                 domain_remove_one_dev_info(old_domain, pdev);
3968                         else
3969                                 domain_remove_dev_info(old_domain);
3970                 }
3971         }
3972
3973         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3974                                 pdev->devfn);
3975         if (!iommu)
3976                 return -ENODEV;
3977
3978         /* check if this iommu agaw is sufficient for max mapped address */
3979         addr_width = agaw_to_width(iommu->agaw);
3980         if (addr_width > cap_mgaw(iommu->cap))
3981                 addr_width = cap_mgaw(iommu->cap);
3982
3983         if (dmar_domain->max_addr > (1LL << addr_width)) {
3984                 printk(KERN_ERR "%s: iommu width (%d) is not "
3985                        "sufficient for the mapped address (%llx)\n",
3986                        __func__, addr_width, dmar_domain->max_addr);
3987                 return -EFAULT;
3988         }
3989         dmar_domain->gaw = addr_width;
3990
3991         /*
3992          * Knock out extra levels of page tables if necessary
3993          */
3994         while (iommu->agaw < dmar_domain->agaw) {
3995                 struct dma_pte *pte;
3996
3997                 pte = dmar_domain->pgd;
3998                 if (dma_pte_present(pte)) {
3999                         dmar_domain->pgd = (struct dma_pte *)
4000                                 phys_to_virt(dma_pte_addr(pte));
4001                         free_pgtable_page(pte);
4002                 }
4003                 dmar_domain->agaw--;
4004         }
4005
4006         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4007 }
4008
4009 static void intel_iommu_detach_device(struct iommu_domain *domain,
4010                                       struct device *dev)
4011 {
4012         struct dmar_domain *dmar_domain = domain->priv;
4013         struct pci_dev *pdev = to_pci_dev(dev);
4014
4015         domain_remove_one_dev_info(dmar_domain, pdev);
4016 }
4017
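     /*
      * IOMMU API map: IOMMU_READ/WRITE/CACHE are translated into DMA PTE
      * bits, the mapping is checked against the domain's address width, and
      * the size is rounded up to whole VT-d pages before domain_pfn_mapping()
      * writes the page-table entries.
      */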
4018 static int intel_iommu_map(struct iommu_domain *domain,
4019                            unsigned long iova, phys_addr_t hpa,
4020                            size_t size, int iommu_prot)
4021 {
4022         struct dmar_domain *dmar_domain = domain->priv;
4023         u64 max_addr;
4024         int prot = 0;
4025         int ret;
4026
4027         if (iommu_prot & IOMMU_READ)
4028                 prot |= DMA_PTE_READ;
4029         if (iommu_prot & IOMMU_WRITE)
4030                 prot |= DMA_PTE_WRITE;
4031         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4032                 prot |= DMA_PTE_SNP;
4033
4034         max_addr = iova + size;
4035         if (dmar_domain->max_addr < max_addr) {
4036                 u64 end;
4037
4038                 /* check if minimum agaw is sufficient for mapped address */
4039                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4040                 if (end < max_addr) {
4041                         printk(KERN_ERR "%s: iommu width (%d) is not "
4042                                "sufficient for the mapped address (%llx)\n",
4043                                __func__, dmar_domain->gaw, max_addr);
4044                         return -EFAULT;
4045                 }
4046                 dmar_domain->max_addr = max_addr;
4047         }
4048         /* Round up size to next multiple of PAGE_SIZE, if it and
4049            the low bits of hpa would take us onto the next page */
4050         size = aligned_nrpages(hpa, size);
4051         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4052                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4053         return ret;
4054 }
4055
4056 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4057                              unsigned long iova, size_t size)
4058 {
4059         struct dmar_domain *dmar_domain = domain->priv;
4060         int order;
4061
4062         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4063                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4064
4065         if (dmar_domain->max_addr == iova + size)
4066                 dmar_domain->max_addr = iova;
4067
4068         return PAGE_SIZE << order;
4069 }
4070
4071 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4072                                             dma_addr_t iova)
4073 {
4074         struct dmar_domain *dmar_domain = domain->priv;
4075         struct dma_pte *pte;
4076         u64 phys = 0;
4077
4078         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4079         if (pte)
4080                 phys = dma_pte_addr(pte);
4081
4082         return phys;
4083 }
4084
4085 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4086                                       unsigned long cap)
4087 {
4088         struct dmar_domain *dmar_domain = domain->priv;
4089
4090         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4091                 return dmar_domain->iommu_snooping;
4092         if (cap == IOMMU_CAP_INTR_REMAP)
4093                 return irq_remapping_enabled;
4094
4095         return 0;
4096 }
4097
4098 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4099
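     /*
      * Work out which device actually issues the DMA the IOMMU will see
      * (an upstream PCIe-to-PCI bridge, a quirked DMA source, a non-ACS
      * multifunction sibling, or the first upstream device without full ACS
      * isolation) and place @dev in the same iommu_group as that device.
      */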
4100 static int intel_iommu_add_device(struct device *dev)
4101 {
4102         struct pci_dev *pdev = to_pci_dev(dev);
4103         struct pci_dev *bridge, *dma_pdev = NULL;
4104         struct iommu_group *group;
4105         int ret;
4106
4107         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4108                              pdev->bus->number, pdev->devfn))
4109                 return -ENODEV;
4110
4111         bridge = pci_find_upstream_pcie_bridge(pdev);
4112         if (bridge) {
4113                 if (pci_is_pcie(bridge))
4114                         dma_pdev = pci_get_domain_bus_and_slot(
4115                                                 pci_domain_nr(pdev->bus),
4116                                                 bridge->subordinate->number, 0);
4117                 if (!dma_pdev)
4118                         dma_pdev = pci_dev_get(bridge);
4119         } else
4120                 dma_pdev = pci_dev_get(pdev);
4121
4122         /* Account for quirked devices */
4123         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4124
4125         /*
4126          * If it's a multifunction device that does not support our required
4127          * ACS flags, add it to the same group as the lowest-numbered
4128          * function that also does not support the required ACS flags.
4129          */
4130         if (dma_pdev->multifunction &&
4131             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4132                 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4133
4134                 for (i = 0; i < 8; i++) {
4135                         struct pci_dev *tmp;
4136
4137                         tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4138                         if (!tmp)
4139                                 continue;
4140
4141                         if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4142                                 swap_pci_ref(&dma_pdev, tmp);
4143                                 break;
4144                         }
4145                         pci_dev_put(tmp);
4146                 }
4147         }
4148
4149         /*
4150          * Devices on the root bus go through the iommu.  If that's not us,
4151          * find the next upstream device and test ACS up to the root bus.
4152          * Finding the next device may require skipping virtual buses.
4153          */
4154         while (!pci_is_root_bus(dma_pdev->bus)) {
4155                 struct pci_bus *bus = dma_pdev->bus;
4156
4157                 while (!bus->self) {
4158                         if (!pci_is_root_bus(bus))
4159                                 bus = bus->parent;
4160                         else
4161                                 goto root_bus;
4162                 }
4163
4164                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4165                         break;
4166
4167                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4168         }
4169
4170 root_bus:
4171         group = iommu_group_get(&dma_pdev->dev);
4172         pci_dev_put(dma_pdev);
4173         if (!group) {
4174                 group = iommu_group_alloc();
4175                 if (IS_ERR(group))
4176                         return PTR_ERR(group);
4177         }
4178
4179         ret = iommu_group_add_device(group, dev);
4180
4181         iommu_group_put(group);
4182         return ret;
4183 }
4184
4185 static void intel_iommu_remove_device(struct device *dev)
4186 {
4187         iommu_group_remove_device(dev);
4188 }
4189
4190 static struct iommu_ops intel_iommu_ops = {
4191         .domain_init    = intel_iommu_domain_init,
4192         .domain_destroy = intel_iommu_domain_destroy,
4193         .attach_dev     = intel_iommu_attach_device,
4194         .detach_dev     = intel_iommu_detach_device,
4195         .map            = intel_iommu_map,
4196         .unmap          = intel_iommu_unmap,
4197         .iova_to_phys   = intel_iommu_iova_to_phys,
4198         .domain_has_cap = intel_iommu_domain_has_cap,
4199         .add_device     = intel_iommu_add_device,
4200         .remove_device  = intel_iommu_remove_device,
4201         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4202 };
4203
4204 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4205 {
4206         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4207         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4208         dmar_map_gfx = 0;
4209 }
4210
4211 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4212 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4213 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4214 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4215 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4216 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4217 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4218
4219 static void quirk_iommu_rwbf(struct pci_dev *dev)
4220 {
4221         /*
4222          * Mobile 4 Series Chipset neglects to set RWBF capability,
4223          * but needs it. Same seems to hold for the desktop versions.
4224          */
4225         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4226         rwbf_quirk = 1;
4227 }
4228
4229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4236
4237 #define GGC 0x52
4238 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4239 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4240 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4241 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4242 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4243 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4244 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4245 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4246
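     /*
      * The Calpella/Ironlake quirk below reads the GGC register from the
      * affected host bridge's config space: if the BIOS allocated no
      * VT-d-enabled GTT space, graphics translation cannot work and
      * dmar_map_gfx is cleared; otherwise batched IOTLB flushing is turned
      * off by forcing intel_iommu_strict.
      */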
4247 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4248 {
4249         unsigned short ggc;
4250
4251         if (pci_read_config_word(dev, GGC, &ggc))
4252                 return;
4253
4254         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4255                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4256                 dmar_map_gfx = 0;
4257         } else if (dmar_map_gfx) {
4258                 /* we have to ensure the gfx device is idle before we flush */
4259                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4260                 intel_iommu_strict = 1;
4261         }
4262 }
4263 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4264 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4267
4268 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4269    ISOCH DMAR unit for the Azalia sound device, but not give it any
4270    TLB entries, which causes it to deadlock. Check for that.  We do
4271    this in a function called from init_dmars(), instead of in a PCI
4272    quirk, because we don't want to print the obnoxious "BIOS broken"
4273    message if VT-d is actually disabled.
4274 */
4275 static void __init check_tylersburg_isoch(void)
4276 {
4277         struct pci_dev *pdev;
4278         uint32_t vtisochctrl;
4279
4280         /* If there's no Azalia in the system anyway, forget it. */
4281         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4282         if (!pdev)
4283                 return;
4284         pci_dev_put(pdev);
4285
4286         /* System Management Registers. Might be hidden, in which case
4287            we can't do the sanity check. But that's OK, because the
4288            known-broken BIOSes _don't_ actually hide it, so far. */
4289         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4290         if (!pdev)
4291                 return;
4292
4293         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4294                 pci_dev_put(pdev);
4295                 return;
4296         }
4297
4298         pci_dev_put(pdev);
4299
4300         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4301         if (vtisochctrl & 1)
4302                 return;
4303
4304         /* Drop all bits other than the number of TLB entries */
4305         vtisochctrl &= 0x1c;
4306
4307         /* If we have the recommended number of TLB entries (16), fine. */
4308         if (vtisochctrl == 0x10)
4309                 return;
4310
4311         /* Zero TLB entries? You get to ride the short bus to school. */
4312         if (!vtisochctrl) {
4313                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4314                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4315                      dmi_get_system_info(DMI_BIOS_VENDOR),
4316                      dmi_get_system_info(DMI_BIOS_VERSION),
4317                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4318                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4319                 return;
4320         }
4321
4322         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4323                vtisochctrl);
4324 }