2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
46 #define ROOT_SIZE VTD_PAGE_SIZE
47 #define CONTEXT_SIZE VTD_PAGE_SIZE
/*
 * Device classification helpers.  Each takes a device pointer expression;
 * the argument is parenthesised at every use so that compound expressions
 * (for example base + i) expand correctly, matching what IS_AZALIA already
 * did.
 */
#define IS_GFX_DEVICE(pdev) ((((pdev)->class) >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((((pdev)->class) >> 8) == PCI_CLASS_BRIDGE_ISA)
/* Matches vendor id 0x8086 with device id 0x3a3e. */
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
53 #define IOAPIC_RANGE_START (0xfee00000)
54 #define IOAPIC_RANGE_END (0xfeefffff)
55 #define IOVA_START_ADDR (0x1000)
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
59 #define MAX_AGAW_WIDTH 64
/*
 * Largest page frame number / largest address representable with a guest
 * address width of gaw bits.  The argument is parenthesised so that an
 * expression argument (for example a | b, or a conditional) is evaluated
 * as a whole before the subtraction and the shift.
 */
#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
67 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
70 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
74 /* page table handling */
75 #define LEVEL_STRIDE (9)
76 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
78 static inline int agaw_to_level(int agaw)
/* Convert an AGAW index to an address width in bits: 30 plus LEVEL_STRIDE bits per index step. */
83 static inline int agaw_to_width(int agaw)
85 return 30 + agaw * LEVEL_STRIDE;
/* Convert an address width in bits to an AGAW index (truncating integer division by LEVEL_STRIDE). */
88 static inline int width_to_agaw(int width)
90 return (width - 30) / LEVEL_STRIDE;
/* Number of PFN bits that lie below the index field of the given level (0 for level 1). */
93 static inline unsigned int level_to_offset_bits(int level)
95 return (level - 1) * LEVEL_STRIDE;
/* Extract the LEVEL_STRIDE-bit table index for the given level from a page frame number. */
98 static inline int pfn_level_offset(unsigned long pfn, int level)
100 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
/* Mask with all bits set except the PFN bits below the index field of the given level. */
103 static inline unsigned long level_mask(int level)
105 return -1UL << level_to_offset_bits(level);
/* Number of page frames covered by one step of the index at the given level (1 for level 1). */
108 static inline unsigned long level_size(int level)
110 return 1UL << level_to_offset_bits(level);
/* Round pfn up to the next multiple of level_size(level); an already aligned pfn is returned unchanged. */
113 static inline unsigned long align_to_level(unsigned long pfn, int level)
115 return (pfn + level_size(level) - 1) & level_mask(level);
118 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
119 are never going to work. */
/* Convert a VT-d page frame number to an MM page frame number by dropping the (PAGE_SHIFT - VTD_PAGE_SHIFT) extra low bits. */
120 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
122 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
/* Convert an MM page frame number to the VT-d page frame number of its first VT-d page. */
125 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
127 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
/* VT-d page frame number of a struct page: page_to_pfn() followed by mm_to_dma_pfn(). */
129 static inline unsigned long page_to_dma_pfn(struct page *pg)
131 return mm_to_dma_pfn(page_to_pfn(pg));
/* VT-d page frame number of the page that backs kernel virtual address p. */
133 static inline unsigned long virt_to_dma_pfn(void *p)
135 return page_to_dma_pfn(virt_to_page(p));
138 /* global iommu list, set NULL for ignored DMAR units */
139 static struct intel_iommu **g_iommus;
141 static void __init check_tylersburg_isoch(void);
142 static int rwbf_quirk;
147 * 12-63: Context Ptr (12 - (haw-1))
154 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/* True when bit 0 of the root entry value is set. */
155 static inline bool root_present(struct root_entry *root)
157 return (root->val & 1);
159 static inline void set_root_present(struct root_entry *root)
/* OR the page-aligned part of value (value & VTD_PAGE_MASK) into root->val; bits already set are not cleared first. */
163 static inline void set_root_value(struct root_entry *root, unsigned long value)
165 root->val |= value & VTD_PAGE_MASK;
/*
 * For a present root entry, return the kernel virtual address of the
 * context table it references: the page-aligned physical address held in
 * root->val, passed through phys_to_virt().
 * NOTE(review): the else-branch of the conditional is not visible in this
 * excerpt; presumably NULL for a non-present entry - confirm.
 */
168 static inline struct context_entry *
169 get_context_addr_from_root(struct root_entry *root)
171 return (struct context_entry *)
172 (root_present(root)?phys_to_virt(
173 root->val & VTD_PAGE_MASK) :
180 * 1: fault processing disable
181 * 2-3: translation type
182 * 12-63: address space root
188 struct context_entry {
/* True when bit 0 of the low word of the context entry is set. */
193 static inline bool context_present(struct context_entry *context)
195 return (context->lo & 1);
197 static inline void context_set_present(struct context_entry *context)
/* Clear bit 1 of context->lo while keeping bit 0 and bits 2-63; bit 1 is the fault processing disable bit, so clearing it enables fault processing. */
202 static inline void context_set_fault_enable(struct context_entry *context)
204 context->lo &= (((u64)-1) << 2) | 1;
/* Replace bits 2-3 of context->lo (the translation type field) with the low two bits of value. */
207 static inline void context_set_translation_type(struct context_entry *context,
210 context->lo &= (((u64)-1) << 4) | 3;
211 context->lo |= (value & 3) << 2;
/* OR the page-aligned part of value into context->lo (the address space root field); no bits are cleared first. */
214 static inline void context_set_address_root(struct context_entry *context,
217 context->lo |= value & VTD_PAGE_MASK;
/* OR the low three bits of value into bits 0-2 of context->hi. */
220 static inline void context_set_address_width(struct context_entry *context,
223 context->hi |= value & 7;
/* OR the low 16 bits of value into bits 8-23 of context->hi. */
226 static inline void context_set_domain_id(struct context_entry *context,
229 context->hi |= (value & ((1 << 16) - 1)) << 8;
232 static inline void context_clear_entry(struct context_entry *context)
245 * 12-63: Host physical address
251 static inline void dma_clear_pte(struct dma_pte *pte)
/* Set the DMA_PTE_READ flag in the page-table entry. */
256 static inline void dma_set_pte_readable(struct dma_pte *pte)
258 pte->val |= DMA_PTE_READ;
/* Set the DMA_PTE_WRITE flag in the page-table entry. */
261 static inline void dma_set_pte_writable(struct dma_pte *pte)
263 pte->val |= DMA_PTE_WRITE;
/* Set the DMA_PTE_SNP flag in the page-table entry. */
266 static inline void dma_set_pte_snp(struct dma_pte *pte)
268 pte->val |= DMA_PTE_SNP;
/* Replace the two low bits of the page-table entry with the two low bits of prot; all other bits are kept. */
271 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
273 pte->val = (pte->val & ~3) | (prot & 3);
/*
 * Return the page-aligned address stored in the entry (val & VTD_PAGE_MASK).
 * Two variants are visible: a plain read of pte->val, and one that reads
 * through __cmpxchg64(&pte->val, 0, 0) to obtain an atomic 64-bit read.
 * NOTE(review): the preprocessor condition that selects between them is
 * not visible in this excerpt.
 */
276 static inline u64 dma_pte_addr(struct dma_pte *pte)
279 return pte->val & VTD_PAGE_MASK;
281 /* Must have a full atomic 64-bit read */
282 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
/* OR the page frame number, shifted left by VTD_PAGE_SHIFT, into the entry; a previously stored address is not cleared. */
286 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
288 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
/* An entry counts as present when either of its two low bits is set. */
291 static inline bool dma_pte_present(struct dma_pte *pte)
293 return (pte->val & 3) != 0;
/* Non-zero when pte points at the start of a VT-d page, i.e. its address has no bits set below VTD_PAGE_MASK. */
296 static inline int first_pte_in_page(struct dma_pte *pte)
298 return !((unsigned long)pte & ~VTD_PAGE_MASK);
302 * This domain is a static identity mapping domain.
303 * 1. This domain creates a static 1:1 mapping to all usable memory.
304 * 2. It maps to each iommu if successful.
305 * 3. Each iommu maps to this domain if successful.
307 static struct dmar_domain *si_domain;
308 static int hw_pass_through = 1;
310 /* devices under the same p2p bridge are owned in one domain */
311 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
313 /* domain represents a virtual machine; more than one device
314 * across iommus may be owned by one domain, e.g. a kvm guest.
316 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
318 /* si_domain contains multiple devices */
319 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
322 int id; /* domain id */
323 int nid; /* node id */
324 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
326 struct list_head devices; /* all devices' list */
327 struct iova_domain iovad; /* iova's that belong to this domain */
329 struct dma_pte *pgd; /* virtual address */
330 int gaw; /* max guest address width */
332 /* adjusted guest address width, 0 is level 2 30-bit */
335 int flags; /* flags to find out type of domain */
337 int iommu_coherency;/* indicate coherency of iommu access */
338 int iommu_snooping; /* indicate snooping control feature*/
339 int iommu_count; /* reference count of iommu */
340 spinlock_t iommu_lock; /* protect iommu set in domain */
341 u64 max_addr; /* maximum mapped address */
344 /* PCI domain-device relationship */
345 struct device_domain_info {
346 struct list_head link; /* link to domain siblings */
347 struct list_head global; /* link to global list */
348 int segment; /* PCI domain */
349 u8 bus; /* PCI bus number */
350 u8 devfn; /* PCI devfn number */
351 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
352 struct intel_iommu *iommu; /* IOMMU used by this device */
353 struct dmar_domain *domain; /* pointer to domain */
356 static void flush_unmaps_timeout(unsigned long data);
358 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
360 #define HIGH_WATER_MARK 250
361 struct deferred_flush_tables {
363 struct iova *iova[HIGH_WATER_MARK];
364 struct dmar_domain *domain[HIGH_WATER_MARK];
367 static struct deferred_flush_tables *deferred_flush;
369 /* number of intel_iommus; used as the bit count of the per-domain iommu bitmap */
370 static int g_num_of_iommus;
372 static DEFINE_SPINLOCK(async_umap_flush_lock);
373 static LIST_HEAD(unmaps_to_do);
376 static long list_size;
378 static void domain_remove_dev_info(struct dmar_domain *domain);
380 #ifdef CONFIG_DMAR_DEFAULT_ON
381 int dmar_disabled = 0;
383 int dmar_disabled = 1;
384 #endif /*CONFIG_DMAR_DEFAULT_ON*/
386 static int dmar_map_gfx = 1;
387 static int dmar_forcedac;
388 static int intel_iommu_strict;
390 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
391 static DEFINE_SPINLOCK(device_domain_lock);
392 static LIST_HEAD(device_domain_list);
394 static struct iommu_ops intel_iommu_ops;
/*
 * Parse an option string for the driver.  Tokens are matched by prefix with
 * strncmp(): on, off, igfx_off, forcedac and strict, each logging an
 * informational message.  The cursor is then advanced to the next comma
 * with strcspn().
 * NOTE(review): only the assignment for strict (intel_iommu_strict = 1) is
 * visible in this excerpt; the variables changed by the other tokens, the
 * enclosing loop and the return value are not shown.
 */
396 static int __init intel_iommu_setup(char *str)
401 if (!strncmp(str, "on", 2)) {
403 printk(KERN_INFO "Intel-IOMMU: enabled\n");
404 } else if (!strncmp(str, "off", 3)) {
406 printk(KERN_INFO "Intel-IOMMU: disabled\n");
407 } else if (!strncmp(str, "igfx_off", 8)) {
410 "Intel-IOMMU: disable GFX device mapping\n");
411 } else if (!strncmp(str, "forcedac", 8)) {
413 "Intel-IOMMU: Forcing DAC for PCI devices\n");
415 } else if (!strncmp(str, "strict", 6)) {
417 "Intel-IOMMU: disable batched IOTLB flush\n");
418 intel_iommu_strict = 1;
421 str += strcspn(str, ",");
427 __setup("intel_iommu=", intel_iommu_setup);
429 static struct kmem_cache *iommu_domain_cache;
430 static struct kmem_cache *iommu_devinfo_cache;
431 static struct kmem_cache *iommu_iova_cache;
/*
 * Allocate one zeroed page (order 0, GFP_ATOMIC | __GFP_ZERO) on the given
 * node and take its kernel virtual address with page_address().
 * NOTE(review): the failure check and the return statement are not visible
 * in this excerpt.
 */
433 static inline void *alloc_pgtable_page(int node)
438 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
440 vaddr = page_address(page);
/* Release a single page identified by its kernel virtual address. */
444 static inline void free_pgtable_page(void *vaddr)
446 free_page((unsigned long)vaddr);
/* Allocate one object from iommu_domain_cache with GFP_ATOMIC; the kmem_cache_alloc() result is returned unchecked. */
449 static inline void *alloc_domain_mem(void)
451 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
/* Return an object to iommu_domain_cache. */
454 static void free_domain_mem(void *vaddr)
456 kmem_cache_free(iommu_domain_cache, vaddr);
/* Allocate one object from iommu_devinfo_cache with GFP_ATOMIC; the kmem_cache_alloc() result is returned unchecked. */
459 static inline void * alloc_devinfo_mem(void)
461 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
/* Return an object to iommu_devinfo_cache. */
464 static inline void free_devinfo_mem(void *vaddr)
466 kmem_cache_free(iommu_devinfo_cache, vaddr);
/* Allocate one struct iova from iommu_iova_cache with GFP_ATOMIC; not static, so visible outside this file. */
469 struct iova *alloc_iova_mem(void)
471 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
/* Return a struct iova to iommu_iova_cache; not static, so visible outside this file. */
474 void free_iova_mem(struct iova *iova)
476 kmem_cache_free(iommu_iova_cache, iova);
/*
 * Search the SAGAW capability bits of the iommu for a supported AGAW,
 * starting from the index that corresponds to max_gaw.
 * NOTE(review): the loop bound, the loop direction and the return statement
 * are not visible in this excerpt.
 */
480 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
485 sagaw = cap_sagaw(iommu->cap);
486 for (agaw = width_to_agaw(max_gaw);
488 if (test_bit(agaw, &sagaw))
496 * Calculate max SAGAW for each iommu.
/* Run the AGAW search with MAX_AGAW_WIDTH as the starting width. */
498 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
500 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
504 * calculate agaw for each iommu.
505 * "SAGAW" may be different across iommus, use a default agaw, and
506 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
/* Run the AGAW search with DEFAULT_DOMAIN_ADDRESS_WIDTH as the starting width. */
508 int iommu_calculate_agaw(struct intel_iommu *iommu)
510 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
/*
 * Look up the first bit set in domain->iommu_bmp and return the matching
 * entry of g_iommus[].  Triggers BUG_ON for virtual-machine and
 * static-identity domains.
 * NOTE(review): the statement executed when the index is out of range is
 * not visible in this excerpt (presumably returns NULL).
 */
513 /* This function only returns a single iommu in a domain */
514 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
518 /* si_domain and vm domain should not get here. */
519 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
520 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
522 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
523 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
526 return g_iommus[iommu_id];
/* Recompute domain->iommu_coherency: it starts at 1 and drops to 0 when an iommu set in domain->iommu_bmp does not report ecap_coherent(). */
529 static void domain_update_iommu_coherency(struct dmar_domain *domain)
533 domain->iommu_coherency = 1;
535 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
536 if (!ecap_coherent(g_iommus[i]->ecap)) {
537 domain->iommu_coherency = 0;
/* Recompute domain->iommu_snooping: it starts at 1 and drops to 0 when an iommu set in domain->iommu_bmp does not report ecap_sc_support(). */
543 static void domain_update_iommu_snooping(struct dmar_domain *domain)
547 domain->iommu_snooping = 1;
549 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
550 if (!ecap_sc_support(g_iommus[i]->ecap)) {
551 domain->iommu_snooping = 0;
557 /* Some capabilities may be different across iommus */
/* Refresh both per-domain capability summaries: coherency first, then snooping. */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
560 domain_update_iommu_coherency(domain);
561 domain_update_iommu_snooping(domain);
/*
 * Find the iommu that covers the PCI device segment:bus:devfn.  DRHD units
 * of a different segment are skipped.  Within a unit, a device-scope entry
 * matches either exactly (same bus number and devfn) or as a bridge whose
 * subordinate bus range [number, subordinate] contains bus.  The
 * include_all flag of the unit is also tested.
 * NOTE(review): the statements taken on each match and the return are not
 * visible in this excerpt.
 */
564 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
566 struct dmar_drhd_unit *drhd = NULL;
569 for_each_drhd_unit(drhd) {
572 if (segment != drhd->segment)
575 for (i = 0; i < drhd->devices_cnt; i++) {
576 if (drhd->devices[i] &&
577 drhd->devices[i]->bus->number == bus &&
578 drhd->devices[i]->devfn == devfn)
580 if (drhd->devices[i] &&
581 drhd->devices[i]->subordinate &&
582 drhd->devices[i]->subordinate->number <= bus &&
583 drhd->devices[i]->subordinate->subordinate >= bus)
587 if (drhd->include_all)
/* Flush size bytes starting at addr with clflush_cache_range() when domain->iommu_coherency is 0; otherwise do nothing. */
594 static void domain_flush_cache(struct dmar_domain *domain,
595 void *addr, int size)
597 if (!domain->iommu_coherency)
598 clflush_cache_range(addr, size);
/*
 * Returns a pointer to entry devfn of the context table that the root
 * entry for bus refers to, working under iommu->lock.  On the allocation
 * path a new table page is obtained with alloc_pgtable_page(), flushed,
 * and linked into the root entry (physical address plus present bit),
 * after which the root entry itself is flushed.
 * NOTE(review): the conditions guarding the allocation and the value
 * returned when it fails are not visible in this excerpt.
 */
601 /* Gets context entry for a given bus and devfn */
602 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
605 struct root_entry *root;
606 struct context_entry *context;
607 unsigned long phy_addr;
610 spin_lock_irqsave(&iommu->lock, flags);
611 root = &iommu->root_entry[bus];
612 context = get_context_addr_from_root(root);
614 context = (struct context_entry *)
615 alloc_pgtable_page(iommu->node);
617 spin_unlock_irqrestore(&iommu->lock, flags);
620 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
621 phy_addr = virt_to_phys((void *)context);
622 set_root_value(root, phy_addr);
623 set_root_present(root);
624 __iommu_flush_cache(iommu, root, sizeof(*root));
626 spin_unlock_irqrestore(&iommu->lock, flags);
627 return &context[devfn];
/*
 * Report, under iommu->lock, whether the context entry for (bus, devfn) is
 * marked present.
 * NOTE(review): the handling of a missing context table and the return
 * statement are not visible in this excerpt.
 */
630 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
632 struct root_entry *root;
633 struct context_entry *context;
637 spin_lock_irqsave(&iommu->lock, flags);
638 root = &iommu->root_entry[bus];
639 context = get_context_addr_from_root(root);
644 ret = context_present(&context[devfn]);
646 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Clear the context entry for (bus, devfn) and flush it with
 * __iommu_flush_cache(), all under iommu->lock.
 * NOTE(review): the guard on the context pointer and the size argument of
 * the flush are not visible in this excerpt.
 */
650 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
652 struct root_entry *root;
653 struct context_entry *context;
656 spin_lock_irqsave(&iommu->lock, flags);
657 root = &iommu->root_entry[bus];
658 context = get_context_addr_from_root(root);
660 context_clear_entry(&context[devfn]);
661 __iommu_flush_cache(iommu, &context[devfn], \
664 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Tear down the root and context tables of an iommu under iommu->lock.
 * A NULL root_entry is checked first.  Otherwise all ROOT_ENTRY_NR root
 * entries are walked and the context table page found for an entry is
 * freed; finally the root table page is freed and iommu->root_entry is
 * set to NULL.
 */
667 static void free_context_table(struct intel_iommu *iommu)
669 struct root_entry *root;
672 struct context_entry *context;
674 spin_lock_irqsave(&iommu->lock, flags);
675 if (!iommu->root_entry) {
678 for (i = 0; i < ROOT_ENTRY_NR; i++) {
679 root = &iommu->root_entry[i];
680 context = get_context_addr_from_root(root);
682 free_pgtable_page(context);
684 free_pgtable_page(iommu->root_entry);
685 iommu->root_entry = NULL;
687 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Walk the page table of the domain from domain->pgd towards the entry for
 * pfn.  Where an entry on the way is not present, a new table page is
 * allocated on domain->nid, flushed, and installed with cmpxchg64() with
 * read and write permission; if the exchange finds the slot already
 * filled, the fresh page is freed and the existing one is used.
 * Triggers BUG_ON when the domain has no pgd or when pfn has bits set
 * above the address width of the domain.
 * NOTE(review): the loop structure, the level at which the walk stops and
 * the return value are not visible in this excerpt.
 */
690 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
693 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
694 struct dma_pte *parent, *pte = NULL;
695 int level = agaw_to_level(domain->agaw);
698 BUG_ON(!domain->pgd);
699 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
700 parent = domain->pgd;
705 offset = pfn_level_offset(pfn, level);
706 pte = &parent[offset];
710 if (!dma_pte_present(pte)) {
713 tmp_page = alloc_pgtable_page(domain->nid);
718 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
719 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
720 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
721 /* Someone else set it while we were thinking; use theirs. */
722 free_pgtable_page(tmp_page);
725 domain_flush_cache(domain, pte, sizeof(*pte));
728 parent = phys_to_virt(dma_pte_addr(pte));
735 /* return address's pte at specific level */
/*
 * Walk down from domain->pgd, starting at the top level given by
 * agaw_to_level(domain->agaw), following present entries through
 * phys_to_virt(dma_pte_addr()).  The visible body only reads the tables.
 * NOTE(review): the check that returns the entry at the requested level,
 * the action on a non-present entry and the update of the level counter
 * are not visible in this excerpt.
 */
736 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
740 struct dma_pte *parent, *pte = NULL;
741 int total = agaw_to_level(domain->agaw);
744 parent = domain->pgd;
745 while (level <= total) {
746 offset = pfn_level_offset(pfn, total);
747 pte = &parent[offset];
751 if (!dma_pte_present(pte))
753 parent = phys_to_virt(dma_pte_addr(pte));
/*
 * Operates on the inclusive range [start_pfn, last_pfn], one page-table
 * page at a time: the inner loop runs until the range ends or pte crosses
 * into the next table page, then the touched entries are flushed with
 * domain_flush_cache().  The start_pfn term in the outer loop condition
 * stops the loop if the page frame number wraps to 0.
 * Triggers BUG_ON when either bound has bits above the address width of
 * the domain or when start_pfn > last_pfn.
 * NOTE(review): the guard in front of the jump to the next level-2
 * boundary and the body of the inner loop are not visible in this excerpt.
 */
759 /* clear last level pte, a TLB flush should follow */
760 static void dma_pte_clear_range(struct dmar_domain *domain,
761 unsigned long start_pfn,
762 unsigned long last_pfn)
764 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
765 struct dma_pte *first_pte, *pte;
767 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
768 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
769 BUG_ON(start_pfn > last_pfn);
771 /* we don't need lock here; nobody else touches the iova range */
773 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
775 start_pfn = align_to_level(start_pfn + 1, 2);
782 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
784 domain_flush_cache(domain, first_pte,
785 (void *)pte - (void *)first_pte);
787 } while (start_pfn && start_pfn <= last_pfn);
/*
 * Works level by level (the loop runs while level <= total) over the
 * inclusive range [start_pfn, last_pfn].  At each level the start is
 * rounded up with align_to_level(), and processing stops when not even one
 * entry of that level fits inside the range.  For a present entry the table
 * page it references is freed; touched entries are flushed per table page
 * with domain_flush_cache().  When the range is the whole domain
 * (0 .. DOMAIN_MAX_PFN(domain->gaw)) the top-level pgd page is freed too.
 * Triggers BUG_ON on out-of-width bounds or start_pfn > last_pfn.
 * NOTE(review): the initial value of level and several loop statements are
 * not visible in this excerpt.
 */
790 /* free page table pages. last level pte should already be cleared */
791 static void dma_pte_free_pagetable(struct dmar_domain *domain,
792 unsigned long start_pfn,
793 unsigned long last_pfn)
795 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
796 struct dma_pte *first_pte, *pte;
797 int total = agaw_to_level(domain->agaw);
801 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
802 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
803 BUG_ON(start_pfn > last_pfn);
805 /* We don't need lock here; nobody else touches the iova range */
807 while (level <= total) {
808 tmp = align_to_level(start_pfn, level);
810 /* If we can't even clear one PTE at this level, we're done */
811 if (tmp + level_size(level) - 1 > last_pfn)
815 first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
817 tmp = align_to_level(tmp + 1, level + 1);
821 if (dma_pte_present(pte)) {
822 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
826 tmp += level_size(level);
827 } while (!first_pte_in_page(pte) &&
828 tmp + level_size(level) - 1 <= last_pfn);
830 domain_flush_cache(domain, first_pte,
831 (void *)pte - (void *)first_pte);
833 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
837 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
838 free_pgtable_page(domain->pgd);
/*
 * Allocate the root table page on iommu->node, flush ROOT_SIZE bytes of it,
 * and publish it in iommu->root_entry under iommu->lock.
 * NOTE(review): the allocation failure check and the return value are not
 * visible in this excerpt.
 */
844 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
846 struct root_entry *root;
849 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
853 __iommu_flush_cache(iommu, root, ROOT_SIZE);
855 spin_lock_irqsave(&iommu->lock, flags);
856 iommu->root_entry = root;
857 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Hand the root table to the hardware: write the physical address of
 * iommu->root_entry to DMAR_RTADDR_REG, issue DMA_GCMD_SRTP through the
 * global command register, and wait until DMA_GSTS_RTPS is set in the
 * status register.  All register accesses happen under register_lock.
 */
862 static void iommu_set_root_entry(struct intel_iommu *iommu)
868 addr = iommu->root_entry;
870 spin_lock_irqsave(&iommu->register_lock, flag);
871 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
873 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
875 /* Make sure hardware completes it */
876 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
877 readl, (sts & DMA_GSTS_RTPS), sts);
879 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Request a write-buffer flush by writing DMA_GCMD_WBF to the global
 * command register and wait until DMA_GSTS_WBFS clears, under
 * register_lock.  The check at the top covers the case where neither
 * rwbf_quirk is set nor the capability register reports cap_rwbf().
 */
882 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
887 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
890 spin_lock_irqsave(&iommu->register_lock, flag);
891 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
893 /* Make sure hardware completes it */
894 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
895 readl, (!(val & DMA_GSTS_WBFS)), val);
897 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Invalidate the context cache.  The command value is built per
 * granularity: global, domain-selective (did), or device-selective
 * (did, source_id and function_mask).  It is written to DMAR_CCMD_REG under
 * register_lock, and the register is polled until DMA_CCMD_ICC clears.
 */
900 /* NOTE: declared void, so no value is returned to the caller */
901 static void __iommu_flush_context(struct intel_iommu *iommu,
902 u16 did, u16 source_id, u8 function_mask,
909 case DMA_CCMD_GLOBAL_INVL:
910 val = DMA_CCMD_GLOBAL_INVL;
912 case DMA_CCMD_DOMAIN_INVL:
913 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
915 case DMA_CCMD_DEVICE_INVL:
916 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
917 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
924 spin_lock_irqsave(&iommu->register_lock, flag);
925 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
927 /* Make sure hardware completes it */
928 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
929 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
931 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Invalidate IOTLB entries.  Granularity is global, domain-selective (did)
 * or page-selective (did, with size_order | addr as the value for the
 * address register).  Read and write drain are requested when the
 * capability register reports support.  The values are written to the
 * register pair at ecap_iotlb_offset() under register_lock, and the command
 * register is polled until DMA_TLB_IVT clears.  Afterwards the granularity
 * reported back is checked: 0 is logged as a failure, and a difference from
 * the requested granularity is logged at debug level.
 */
934 /* NOTE: declared void, so no value is returned to the caller */
935 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
936 u64 addr, unsigned int size_order, u64 type)
938 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
939 u64 val = 0, val_iva = 0;
943 case DMA_TLB_GLOBAL_FLUSH:
944 /* global flush doesn't need set IVA_REG */
945 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
947 case DMA_TLB_DSI_FLUSH:
948 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
950 case DMA_TLB_PSI_FLUSH:
951 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
952 /* Note: always flush non-leaf currently */
953 val_iva = size_order | addr;
958 /* Note: set drain read/write */
961 * This is probably to be super secure.. Looks like we can
962 * ignore it without any impact.
964 if (cap_read_drain(iommu->cap))
965 val |= DMA_TLB_READ_DRAIN;
967 if (cap_write_drain(iommu->cap))
968 val |= DMA_TLB_WRITE_DRAIN;
970 spin_lock_irqsave(&iommu->register_lock, flag);
971 /* Note: Only uses first TLB reg currently */
973 dmar_writeq(iommu->reg + tlb_offset, val_iva);
974 dmar_writeq(iommu->reg + tlb_offset + 8, val);
976 /* Make sure hardware complete it */
977 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
978 dmar_readq, (!(val & DMA_TLB_IVT)), val);
980 spin_unlock_irqrestore(&iommu->register_lock, flag);
982 /* check IOTLB invalidation granularity */
983 if (DMA_TLB_IAIG(val) == 0)
984 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
985 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
986 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
987 (unsigned long long)DMA_TLB_IIRG(type),
988 (unsigned long long)DMA_TLB_IAIG(val));
/*
 * Decide whether the device (segment, bus, devfn) in the domain can use a
 * device IOTLB and record the iommu in info->iommu.  Checks visible here:
 * the iommu reports ecap_dev_iotlb_support(); the device is found on the
 * device list of the domain (searched under device_domain_lock) and has a
 * pci_dev; the device exposes the ATS extended capability; and
 * dmar_find_matched_atsr_unit() finds a unit for it.
 * NOTE(review): the return statements, and any further checks on dropped
 * lines, are not visible in this excerpt.
 */
991 static struct device_domain_info *iommu_support_dev_iotlb(
992 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
996 struct device_domain_info *info;
997 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
999 if (!ecap_dev_iotlb_support(iommu->ecap))
1005 spin_lock_irqsave(&device_domain_lock, flags);
1006 list_for_each_entry(info, &domain->devices, link)
1007 if (info->bus == bus && info->devfn == devfn) {
1011 spin_unlock_irqrestore(&device_domain_lock, flags);
1013 if (!found || !info->dev)
1016 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1019 if (!dmar_find_matched_atsr_unit(info->dev))
1022 info->iommu = iommu;
/* Enable ATS on info->dev with VTD_PAGE_SHIFT as the page shift. NOTE(review): any guard in front of the call is not visible in this excerpt. */
1027 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1032 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
/* Disable ATS on info->dev; the check in front covers a missing pci_dev and a device on which ATS is not enabled. */
1035 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1037 if (!info->dev || !pci_ats_enabled(info->dev))
1040 pci_disable_ats(info->dev);
/*
 * Walk the device list of the domain under device_domain_lock and issue
 * qi_flush_dev_iotlb() for (addr, mask) on each device that has a pci_dev
 * with ATS enabled.  The source id passed is bus << 8 | devfn and the
 * queue depth comes from pci_ats_queue_depth().
 */
1043 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1044 u64 addr, unsigned mask)
1047 unsigned long flags;
1048 struct device_domain_info *info;
1050 spin_lock_irqsave(&device_domain_lock, flags);
1051 list_for_each_entry(info, &domain->devices, link) {
1052 if (!info->dev || !pci_ats_enabled(info->dev))
1055 sid = info->bus << 8 | info->devfn;
1056 qdep = pci_ats_queue_depth(info->dev);
1057 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1059 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Page-selective IOTLB flush for domain id did, covering pages pages from
 * pfn.  The page count is rounded up to a power of two and expressed as an
 * address mask (its log2).  When the hardware lacks page-selective
 * invalidation or the mask exceeds cap_max_amask_val(), the flush callback
 * is invoked with address 0 and mask 0 instead.  The device IOTLB is
 * flushed as well, except when the iommu is in caching mode and map is
 * non-zero.
 */
1062 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1063 unsigned long pfn, unsigned int pages, int map)
1065 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1066 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1071 * Fallback to domain selective flush if no PSI support or the size is
1073 * PSI requires page size to be 2 ^ x, and the base address is naturally
1074 * aligned to the size
1076 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1077 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1080 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1084 * In caching mode, changes of pages from non-present to present require
1085 * flush. However, device IOTLB doesn't need to be flushed in this case.
1087 if (!cap_caching_mode(iommu->cap) || !map)
1088 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
/* Clear DMA_PMEN_EPM in DMAR_PMEN_REG and poll the register until DMA_PMEN_PRS clears, under register_lock. */
1091 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1094 unsigned long flags;
1096 spin_lock_irqsave(&iommu->register_lock, flags);
1097 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1098 pmen &= ~DMA_PMEN_EPM;
1099 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1101 /* wait for the protected region status bit to clear */
1102 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1103 readl, !(pmen & DMA_PMEN_PRS), pmen);
1105 spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Set DMA_GCMD_TE in the cached command value iommu->gcmd, write it to
 * DMAR_GCMD_REG and wait until DMA_GSTS_TES is set, under register_lock.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
1108 static int iommu_enable_translation(struct intel_iommu *iommu)
1111 unsigned long flags;
1113 spin_lock_irqsave(&iommu->register_lock, flags);
1114 iommu->gcmd |= DMA_GCMD_TE;
1115 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1117 /* Make sure hardware completes it */
1118 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1119 readl, (sts & DMA_GSTS_TES), sts);
1121 spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Clear DMA_GCMD_TE in the cached command value iommu->gcmd, write it to
 * DMAR_GCMD_REG and wait until DMA_GSTS_TES clears, under register_lock.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
1125 static int iommu_disable_translation(struct intel_iommu *iommu)
1130 spin_lock_irqsave(&iommu->register_lock, flag);
1131 iommu->gcmd &= ~DMA_GCMD_TE;
1132 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1134 /* Make sure hardware completes it */
1135 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1136 readl, (!(sts & DMA_GSTS_TES)), sts);
1138 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Set up the per-iommu domain bookkeeping: initialise iommu->lock, allocate
 * the zeroed domain id bitmap (one bit per cap_ndoms() domain) and the
 * zeroed array of domain pointers, and mark domain id 0 as used when the
 * iommu reports caching mode.  Allocation failures are logged.
 * NOTE(review): the error returns and the cleanup on the second failure are
 * not visible in this excerpt.
 */
1143 static int iommu_init_domains(struct intel_iommu *iommu)
1145 unsigned long ndomains;
1146 unsigned long nlongs;
1148 ndomains = cap_ndoms(iommu->cap);
1149 pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1151 nlongs = BITS_TO_LONGS(ndomains);
1153 spin_lock_init(&iommu->lock);
1155 /* TBD: there might be 64K domains,
1156 * consider other allocation for future chip
1158 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1159 if (!iommu->domain_ids) {
1160 printk(KERN_ERR "Allocating domain id array failed\n");
1163 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1165 if (!iommu->domains) {
1166 printk(KERN_ERR "Allocating domain array failed\n");
1171 * if Caching mode is set, then invalid translations are tagged
1172 * with domainid 0. Hence we need to pre-allocate it.
1174 if (cap_caching_mode(iommu->cap))
1175 set_bit(0, iommu->domain_ids);
1180 static void domain_exit(struct dmar_domain *domain);
1181 static void vm_domain_exit(struct dmar_domain *domain);
/*
 * Release what is attached to one iommu.  For every domain id bit that is
 * set, the id is cleared and the iommu_count of the domain is decremented
 * under domain->iommu_lock; at zero the domain is destroyed through
 * vm_domain_exit() or domain_exit().  Then translation is disabled if
 * DMA_GCMD_TE is set, the interrupt is released, the domain arrays are
 * freed, the g_iommus[] slot is cleared, and the context tables are freed.
 * The loop over g_num_of_iommus checks whether every iommu is gone.
 * NOTE(review): domain_exit() ends by freeing the domain, yet
 * domain->iommu_lock is released after that call; this looks like an
 * unlock of freed memory - confirm against the full source.
 */
1183 void free_dmar_iommu(struct intel_iommu *iommu)
1185 struct dmar_domain *domain;
1187 unsigned long flags;
1189 if ((iommu->domains) && (iommu->domain_ids)) {
1190 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1191 domain = iommu->domains[i];
1192 clear_bit(i, iommu->domain_ids);
1194 spin_lock_irqsave(&domain->iommu_lock, flags);
1195 if (--domain->iommu_count == 0) {
1196 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1197 vm_domain_exit(domain);
1199 domain_exit(domain);
1201 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1205 if (iommu->gcmd & DMA_GCMD_TE)
1206 iommu_disable_translation(iommu);
1209 set_irq_data(iommu->irq, NULL);
1210 /* This will mask the irq */
1211 free_irq(iommu->irq, iommu);
1212 destroy_irq(iommu->irq);
1215 kfree(iommu->domains);
1216 kfree(iommu->domain_ids);
1218 g_iommus[iommu->seq_id] = NULL;
1220 /* if all iommus are freed, free g_iommus */
1221 for (i = 0; i < g_num_of_iommus; i++) {
1226 if (i == g_num_of_iommus)
1229 /* free context mapping */
1230 free_context_table(iommu);
/*
 * Allocate a dmar_domain with alloc_domain_mem() and zero its iommu bitmap.
 * NOTE(review): the failure check, the remaining initialisation and the
 * return are not visible in this excerpt.
 */
1233 static struct dmar_domain *alloc_domain(void)
1235 struct dmar_domain *domain;
1237 domain = alloc_domain_mem();
1242 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
/*
 * Give the domain a free domain id on this iommu, under iommu->lock: take
 * the first zero bit in iommu->domain_ids, mark it used, set the bit of
 * this iommu (seq_id) in domain->iommu_bmp, and store the domain in
 * iommu->domains[].  When all cap_ndoms() ids are taken the lock is dropped
 * and an error is logged.
 * NOTE(review): the return values are not visible in this excerpt.
 */
1248 static int iommu_attach_domain(struct dmar_domain *domain,
1249 struct intel_iommu *iommu)
1252 unsigned long ndomains;
1253 unsigned long flags;
1255 ndomains = cap_ndoms(iommu->cap);
1257 spin_lock_irqsave(&iommu->lock, flags);
1259 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1260 if (num >= ndomains) {
1261 spin_unlock_irqrestore(&iommu->lock, flags);
1262 printk(KERN_ERR "IOMMU: no free domain ids\n");
1267 set_bit(num, iommu->domain_ids);
1268 set_bit(iommu->seq_id, &domain->iommu_bmp);
1269 iommu->domains[num] = domain;
1270 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Reverse of the attach step, under iommu->lock: search the used domain ids
 * for the slot of iommu->domains[] that points at this domain, then clear
 * the id bit, clear the bit of this iommu in domain->iommu_bmp, and reset
 * the slot to NULL.
 * NOTE(review): the statements that record a successful search and guard
 * the clearing are not visible in this excerpt.
 */
1275 static void iommu_detach_domain(struct dmar_domain *domain,
1276 struct intel_iommu *iommu)
1278 unsigned long flags;
1282 spin_lock_irqsave(&iommu->lock, flags);
1283 ndomains = cap_ndoms(iommu->cap);
1284 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1285 if (iommu->domains[num] == domain) {
1292 clear_bit(num, iommu->domain_ids);
1293 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1294 iommu->domains[num] = NULL;
1296 spin_unlock_irqrestore(&iommu->lock, flags);
1299 static struct iova_domain reserved_iova_list;
1300 static struct lock_class_key reserved_rbtree_key;
/*
 * Build the global reserved IOVA list (reserved_iova_list, limited to
 * DMA_32BIT_PFN): the IOAPIC range IOAPIC_RANGE_START..IOAPIC_RANGE_END,
 * plus the memory resources (IORESOURCE_MEM) of every PCI device.  A
 * separate lockdep class is set for the rbtree lock of the list.  Failed
 * reservations are logged.
 */
1302 static void dmar_init_reserved_ranges(void)
1304 struct pci_dev *pdev = NULL;
1308 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1310 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1311 &reserved_rbtree_key);
1313 /* IOAPIC ranges shouldn't be accessed by DMA */
1314 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1315 IOVA_PFN(IOAPIC_RANGE_END));
1317 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1319 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1320 for_each_pci_dev(pdev) {
1323 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1324 r = &pdev->resource[i];
1325 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1327 iova = reserve_iova(&reserved_iova_list,
1331 printk(KERN_ERR "Reserve iova failed\n");
/* Copy the ranges held in reserved_iova_list into the IOVA allocator of the domain. */
1337 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1339 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * Derive an adjusted width from the guest address width gaw; r is the
 * remainder of (gaw - 12) divided by 9.
 * NOTE(review): the rest of the computation and the return are not visible
 * in this excerpt.
 */
1342 static inline int guestwidth_to_adjustwidth(int gaw)
1345 int r = (gaw - 12) % 9;
/*
 * Initialise a domain that is already attached to one iommu:
 *  - IOVA allocator limited to DMA_32BIT_PFN, with the reserved ranges
 *    copied in;
 *  - guest address width clamped to cap_mgaw() of the iommu;
 *  - AGAW derived from the adjusted width, replaced by the next supported
 *    SAGAW bit at or above it when the hardware lacks the exact value;
 *  - coherency and snooping flags taken from the extended capabilities;
 *  - iommu_count set to 1 and nid set to the node of the iommu;
 *  - top-level page directory allocated and flushed.
 * NOTE(review): error returns are not visible in this excerpt.
 */
1356 static int domain_init(struct dmar_domain *domain, int guest_width)
1358 struct intel_iommu *iommu;
1359 int adjust_width, agaw;
1360 unsigned long sagaw;
1362 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1363 spin_lock_init(&domain->iommu_lock);
1365 domain_reserve_special_ranges(domain);
1367 /* calculate AGAW */
1368 iommu = domain_get_iommu(domain);
1369 if (guest_width > cap_mgaw(iommu->cap))
1370 guest_width = cap_mgaw(iommu->cap);
1371 domain->gaw = guest_width;
1372 adjust_width = guestwidth_to_adjustwidth(guest_width);
1373 agaw = width_to_agaw(adjust_width);
1374 sagaw = cap_sagaw(iommu->cap);
1375 if (!test_bit(agaw, &sagaw)) {
1376 /* hardware doesn't support it, choose a bigger one */
1377 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1378 agaw = find_next_bit(&sagaw, 5, agaw);
1382 domain->agaw = agaw;
1383 INIT_LIST_HEAD(&domain->devices);
1385 if (ecap_coherent(iommu->ecap))
1386 domain->iommu_coherency = 1;
1388 domain->iommu_coherency = 0;
1390 if (ecap_sc_support(iommu->ecap))
1391 domain->iommu_snooping = 1;
1393 domain->iommu_snooping = 0;
1395 domain->iommu_count = 1;
1396 domain->nid = iommu->node;
1398 /* always allocate the top pgd */
1399 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1402 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
/*
 * domain_exit - tear down a domain and release its memory.
 * @domain: domain to destroy.
 *
 * In order: detaches all devices (domain_remove_dev_info()), releases
 * the IOVA allocator, clears every PTE in 0..DOMAIN_MAX_PFN(gaw), frees
 * the page tables for the same range, detaches the domain from every
 * active IOMMU whose seq_id bit is set in domain->iommu_bmp, and finally
 * frees the domain structure itself.
 */
1406 static void domain_exit(struct dmar_domain *domain)
1408 struct dmar_drhd_unit *drhd;
1409 struct intel_iommu *iommu;
1411 /* Domain 0 is reserved, so dont process it */
1415 domain_remove_dev_info(domain);
1417 put_iova_domain(&domain->iovad);
1420 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1422 /* free page tables */
1423 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1425 for_each_active_iommu(iommu, drhd)
1426 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1427 iommu_detach_domain(domain, iommu);
1429 free_domain_mem(domain);
/*
 * domain_context_mapping_one - program the context entry of one
 * (segment, bus, devfn) so that it refers to @domain.
 * @domain:      domain to attach; its pgd must be allocated (BUG_ON).
 * @segment:     PCI segment used to look up the IOMMU.
 * @bus:         PCI bus number.
 * @devfn:       PCI device/function number.
 * @translation: CONTEXT_TT_PASS_THROUGH or CONTEXT_TT_MULTI_LEVEL
 *               (anything else hits BUG_ON).  For non-pass-through it
 *               is replaced by CONTEXT_TT_DEV_IOTLB when
 *               iommu_support_dev_iotlb() returns device info.
 *
 * With iommu->lock held the function
 *  - drops the lock again if the context entry is already present;
 *  - for DOMAIN_FLAG_VIRTUAL_MACHINE / DOMAIN_FLAG_STATIC_IDENTITY
 *    domains, searches iommu->domains[] for @domain and otherwise
 *    claims the first zero bit in iommu->domain_ids, logging
 *    "IOMMU: no free domain ids" when the bitmap is full;
 *  - when not in pass-through mode, walks down the page-table levels
 *    until the domain's agaw equals iommu->agaw;
 *  - writes domain id, address width (iommu->msagaw for pass-through,
 *    otherwise address root plus iommu->agaw), translation type, fault
 *    enable and the present bit, then hands the entry to
 *    domain_flush_cache();
 *  - on caching-mode hardware invalidates the context cache for this
 *    source-id in domain 0 and the IOTLB for domain->id; the call to
 *    iommu_flush_write_buffer() is presumably the non-caching path;
 *  - enables the device IOTLB through iommu_enable_dev_iotlb(info).
 *
 * Afterwards, under domain->iommu_lock, the IOMMU is recorded in
 * domain->iommu_bmp.  If the bit was not set before, iommu_count is
 * incremented, the NUMA node is taken from this IOMMU when it is the
 * only one, and domain_update_iommu_cap() is called.
 *
 * Return: an int status; callers in this file treat non-zero as failure.
 */
1432 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1433 u8 bus, u8 devfn, int translation)
1435 struct context_entry *context;
1436 unsigned long flags;
1437 struct intel_iommu *iommu;
1438 struct dma_pte *pgd;
1440 unsigned long ndomains;
1443 struct device_domain_info *info = NULL;
1445 pr_debug("Set context mapping for %02x:%02x.%d\n",
1446 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1448 BUG_ON(!domain->pgd);
1449 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1450 translation != CONTEXT_TT_MULTI_LEVEL);
1452 iommu = device_to_iommu(segment, bus, devfn);
1456 context = device_to_context_entry(iommu, bus, devfn);
1459 spin_lock_irqsave(&iommu->lock, flags);
1460 if (context_present(context)) {
1461 spin_unlock_irqrestore(&iommu->lock, flags);
1468 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1469 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1472 /* find an available domain id for this device in iommu */
1473 ndomains = cap_ndoms(iommu->cap);
1474 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1475 if (iommu->domains[num] == domain) {
1483 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1484 if (num >= ndomains) {
1485 spin_unlock_irqrestore(&iommu->lock, flags);
1486 printk(KERN_ERR "IOMMU: no free domain ids\n");
1490 set_bit(num, iommu->domain_ids);
1491 iommu->domains[num] = domain;
1495 /* Skip top levels of page tables for
1496 * iommu which has less agaw than default.
1497 * Unnecessary for PT mode.
1499 if (translation != CONTEXT_TT_PASS_THROUGH) {
1500 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1501 pgd = phys_to_virt(dma_pte_addr(pgd));
1502 if (!dma_pte_present(pgd)) {
1503 spin_unlock_irqrestore(&iommu->lock, flags);
1510 context_set_domain_id(context, id);
1512 if (translation != CONTEXT_TT_PASS_THROUGH) {
1513 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1514 translation = info ? CONTEXT_TT_DEV_IOTLB :
1515 CONTEXT_TT_MULTI_LEVEL;
1518 * In pass through mode, AW must be programmed to indicate the largest
1519 * AGAW value supported by hardware. And ASR is ignored by hardware.
1521 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1522 context_set_address_width(context, iommu->msagaw);
1524 context_set_address_root(context, virt_to_phys(pgd));
1525 context_set_address_width(context, iommu->agaw);
1528 context_set_translation_type(context, translation);
1529 context_set_fault_enable(context);
1530 context_set_present(context);
1531 domain_flush_cache(domain, context, sizeof(*context));
1534 * It's a non-present to present mapping. If hardware doesn't cache
1535 * non-present entry we only need to flush the write-buffer. If the
1536 * _does_ cache non-present entries, then it does so in the special
1537 * domain #0, which we have to flush:
1539 if (cap_caching_mode(iommu->cap)) {
1540 iommu->flush.flush_context(iommu, 0,
1541 (((u16)bus) << 8) | devfn,
1542 DMA_CCMD_MASK_NOBIT,
1543 DMA_CCMD_DEVICE_INVL);
1544 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1546 iommu_flush_write_buffer(iommu);
1548 iommu_enable_dev_iotlb(info);
1549 spin_unlock_irqrestore(&iommu->lock, flags);
1551 spin_lock_irqsave(&domain->iommu_lock, flags);
1552 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1553 domain->iommu_count++;
1554 if (domain->iommu_count == 1)
1555 domain->nid = iommu->node;
1556 domain_update_iommu_cap(domain);
1558 spin_unlock_irqrestore(&domain->iommu_lock, flags);
/*
 * domain_context_mapping - set up context entries for a PCI device and
 * for the bridges its transactions may appear to come from.
 * @domain: domain to attach.
 * @pdev:   PCI device being attached.
 *
 * Maps @pdev itself through domain_context_mapping_one(), then asks
 * pci_find_upstream_pcie_bridge() for an upstream bridge.  Every bridge
 * between @pdev and that upstream bridge is mapped in turn.  The
 * upstream bridge itself is mapped last: for a PCIe-to-PCI bridge the
 * subordinate bus number with devfn 0 is used, for a legacy PCI bridge
 * the bridge's own bus.
 *
 * The translation type given to this function is forwarded unchanged to
 * each domain_context_mapping_one() call.
 */
1563 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1567 struct pci_dev *tmp, *parent;
1569 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1570 pdev->bus->number, pdev->devfn,
1575 /* dependent device mapping */
1576 tmp = pci_find_upstream_pcie_bridge(pdev);
1579 /* Secondary interface's bus number and devfn 0 */
1580 parent = pdev->bus->self;
1581 while (parent != tmp) {
1582 ret = domain_context_mapping_one(domain,
1583 pci_domain_nr(parent->bus),
1584 parent->bus->number,
1585 parent->devfn, translation);
1588 parent = parent->bus->self;
1590 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1591 return domain_context_mapping_one(domain,
1592 pci_domain_nr(tmp->subordinate),
1593 tmp->subordinate->number, 0,
1595 else /* this is a legacy PCI bridge */
1596 return domain_context_mapping_one(domain,
1597 pci_domain_nr(tmp->bus),
/*
 * domain_context_mapped - check whether the context entries needed by a
 * PCI device are in place.
 * @pdev: PCI device to check.
 *
 * Follows the same path as domain_context_mapping(): the device itself,
 * each bridge up to the one returned by pci_find_upstream_pcie_bridge(),
 * and finally that bridge (subordinate bus for PCIe, own bus otherwise).
 * Each step is checked with device_context_mapped() on the IOMMU found
 * for @pdev.
 *
 * Return: result of the device_context_mapped() checks; callers in this
 * file treat zero as "not mapped".
 */
1603 static int domain_context_mapped(struct pci_dev *pdev)
1606 struct pci_dev *tmp, *parent;
1607 struct intel_iommu *iommu;
1609 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1614 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1617 /* dependent device mapping */
1618 tmp = pci_find_upstream_pcie_bridge(pdev);
1621 /* Secondary interface's bus number and devfn 0 */
1622 parent = pdev->bus->self;
1623 while (parent != tmp) {
1624 ret = device_context_mapped(iommu, parent->bus->number,
1628 parent = parent->bus->self;
1630 if (pci_is_pcie(tmp))
1631 return device_context_mapped(iommu, tmp->subordinate->number,
1634 return device_context_mapped(iommu, tmp->bus->number,
1638 /* Returns a number of VTD pages, but aligned to MM page size */
/*
 * aligned_nrpages - number of VT-d pages needed for a buffer.
 * @host_addr: start address of the buffer; only its offset within the
 *             MM page is used.
 *
 * Adds the in-page offset to the size, rounds the sum up to a multiple
 * of PAGE_SIZE with PAGE_ALIGN() and converts the result to VT-d pages
 * by shifting right by VTD_PAGE_SHIFT.
 */
1639 static inline unsigned long aligned_nrpages(unsigned long host_addr,
/* keep only the offset inside the MM page */
1642 host_addr &= ~PAGE_MASK;
1643 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
/*
 * __domain_mapping - write PTEs for a run of VT-d pages.
 * @domain:   domain whose page tables are filled in.
 * @iov_pfn:  first I/O virtual page frame number (VT-d page units).
 * @sg:       scatterlist to map, or NULL when @phys_pfn is used
 *            (see domain_sg_mapping() / domain_pfn_mapping()).
 * @phys_pfn: first physical page frame when @sg is NULL.
 * @nr_pages: number of VT-d pages to map.
 * @prot:     DMA_PTE_READ / DMA_PTE_WRITE / DMA_PTE_SNP bits; other
 *            bits are masked off.
 *
 * BUG()s if the last pfn does not fit in the domain's address width.
 * For each page the PTE is installed with cmpxchg64_local() against an
 * expected old value of 0; a PTE that is already set is reported with a
 * KERN_CRIT message (and a dump of the DMA mappings).  When a
 * scatterlist is used, each entry's dma_address and dma_length are
 * filled in as its first page is reached.  The written PTEs are passed
 * to domain_flush_cache() at the end of the run and whenever
 * first_pte_in_page() reports a page-table page boundary.
 *
 * Return: an int status; callers in this file treat non-zero as failure.
 */
1646 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1647 struct scatterlist *sg, unsigned long phys_pfn,
1648 unsigned long nr_pages, int prot)
1650 struct dma_pte *first_pte = NULL, *pte = NULL;
1651 phys_addr_t uninitialized_var(pteval);
1652 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1653 unsigned long sg_res;
1655 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1657 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1660 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1665 sg_res = nr_pages + 1;
1666 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1669 while (nr_pages--) {
1673 sg_res = aligned_nrpages(sg->offset, sg->length);
1674 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1675 sg->dma_length = sg->length;
1676 pteval = page_to_phys(sg_page(sg)) | prot;
1679 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1683 /* We don't need lock here, nobody else
1684 * touches the iova range
1686 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1688 static int dumps = 5;
1689 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1690 iov_pfn, tmp, (unsigned long long)pteval);
1693 debug_dma_dump_mappings(NULL);
1698 if (!nr_pages || first_pte_in_page(pte)) {
1699 domain_flush_cache(domain, first_pte,
1700 (void *)pte - (void *)first_pte);
1704 pteval += VTD_PAGE_SIZE;
/*
 * domain_sg_mapping - map a scatterlist into @domain.
 *
 * Thin wrapper around __domain_mapping() that supplies the scatterlist
 * and passes 0 as the physical pfn.
 */
1712 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1713 struct scatterlist *sg, unsigned long nr_pages,
1716 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
/*
 * domain_pfn_mapping - map a physically contiguous pfn range into
 * @domain.
 *
 * Thin wrapper around __domain_mapping() that passes NULL as the
 * scatterlist, so the mapping starts at @phys_pfn.
 */
1719 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1720 unsigned long phys_pfn, unsigned long nr_pages,
1723 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
/*
 * iommu_detach_dev - remove the context entry of one device.
 * @iommu: IOMMU that owns the context table.
 * @bus:   PCI bus number.
 * @devfn: PCI device/function number.
 *
 * Clears the context-table entry and then issues a global context-cache
 * invalidation followed by a global IOTLB flush.
 */
1726 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1731 clear_context_table(iommu, bus, devfn);
1732 iommu->flush.flush_context(iommu, 0, 0, 0,
1733 DMA_CCMD_GLOBAL_INVL);
1734 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
/*
 * domain_remove_dev_info - detach every device from a domain.
 * @domain: domain whose device list is emptied.
 *
 * Pops entries off domain->devices one at a time.  Each entry is
 * unlinked from both the per-domain and the global list and the
 * device's archdata.iommu pointer is cleared while device_domain_lock
 * is held.  The lock is then dropped for the hardware work (disable
 * device IOTLB, detach the context entry, free the info structure) and
 * re-taken before the next iteration.
 */
1737 static void domain_remove_dev_info(struct dmar_domain *domain)
1739 struct device_domain_info *info;
1740 unsigned long flags;
1741 struct intel_iommu *iommu;
1743 spin_lock_irqsave(&device_domain_lock, flags);
1744 while (!list_empty(&domain->devices)) {
1745 info = list_entry(domain->devices.next,
1746 struct device_domain_info, link);
1747 list_del(&info->link);
1748 list_del(&info->global);
1750 info->dev->dev.archdata.iommu = NULL;
1751 spin_unlock_irqrestore(&device_domain_lock, flags);
1753 iommu_disable_dev_iotlb(info);
1754 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1755 iommu_detach_dev(iommu, info->bus, info->devfn);
1756 free_devinfo_mem(info);
1758 spin_lock_irqsave(&device_domain_lock, flags);
1760 spin_unlock_irqrestore(&device_domain_lock, flags);
1765 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
/*
 * find_domain - look up the domain a PCI device is attached to.
 * @pdev: PCI device.
 *
 * Reads the device_domain_info pointer stored in
 * pdev->dev.archdata.iommu (without taking a lock) and returns its
 * domain member.  Callers in this file compare the result against NULL.
 */
1767 static struct dmar_domain *
1768 find_domain(struct pci_dev *pdev)
1770 struct device_domain_info *info;
1772 /* No lock here, assumes no domain exit in normal case */
1773 info = pdev->dev.archdata.iommu;
1775 return info->domain;
1779 /* domain is initialized */
/*
 * get_domain_for_dev - find or create the domain for a PCI device.
 * @pdev: PCI device.
 * @gaw:  guest address width handed to domain_init() for a new domain.
 *
 * Steps visible in the code:
 *  1. find_domain() is tried first.
 *  2. If the device sits behind a bridge returned by
 *     pci_find_upstream_pcie_bridge(), the bridge's identity is
 *     computed (subordinate bus for a PCIe bridge, otherwise the
 *     bridge's own bus and devfn) and device_domain_list is searched
 *     under device_domain_lock for an entry with the same
 *     segment/bus/devfn, so that devices behind one bridge share a
 *     domain.
 *  3. Otherwise a domain is allocated, attached to the IOMMU of the
 *     matching DRHD unit and initialised; each failure path calls
 *     domain_exit().
 *  4. For the bridge case a device_domain_info for the bridge is
 *     registered and the domain is flagged
 *     DOMAIN_FLAG_P2P_MULTIPLE_DEVICES.  The list is re-checked under
 *     the lock; if another CPU registered the bridge meanwhile, the new
 *     info and domain are discarded.
 *  5. A device_domain_info for @pdev is filled in.  Under the lock
 *     find_domain() is re-checked: if somebody else attached the device
 *     first, the new info is freed (and the new domain destroyed when
 *     it differs); otherwise the info is linked into both lists and
 *     stored in pdev->dev.archdata.iommu.
 *
 * Return: the result of a final find_domain(@pdev) lookup on the path
 * that reaches the end of the function.
 */
1780 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1782 struct dmar_domain *domain, *found = NULL;
1783 struct intel_iommu *iommu;
1784 struct dmar_drhd_unit *drhd;
1785 struct device_domain_info *info, *tmp;
1786 struct pci_dev *dev_tmp;
1787 unsigned long flags;
1788 int bus = 0, devfn = 0;
1792 domain = find_domain(pdev);
1796 segment = pci_domain_nr(pdev->bus);
1798 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1800 if (pci_is_pcie(dev_tmp)) {
1801 bus = dev_tmp->subordinate->number;
1804 bus = dev_tmp->bus->number;
1805 devfn = dev_tmp->devfn;
1807 spin_lock_irqsave(&device_domain_lock, flags);
1808 list_for_each_entry(info, &device_domain_list, global) {
1809 if (info->segment == segment &&
1810 info->bus == bus && info->devfn == devfn) {
1811 found = info->domain;
1815 spin_unlock_irqrestore(&device_domain_lock, flags);
1816 /* pcie-pci bridge already has a domain, uses it */
1823 domain = alloc_domain();
1827 /* Allocate new domain for the device */
1828 drhd = dmar_find_matched_drhd_unit(pdev);
1830 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1834 iommu = drhd->iommu;
1836 ret = iommu_attach_domain(domain, iommu);
1838 domain_exit(domain);
1842 if (domain_init(domain, gaw)) {
1843 domain_exit(domain);
1847 /* register pcie-to-pci device */
1849 info = alloc_devinfo_mem();
1851 domain_exit(domain);
1854 info->segment = segment;
1856 info->devfn = devfn;
1858 info->domain = domain;
1859 /* This domain is shared by devices under p2p bridge */
1860 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1862 /* pcie-to-pci bridge already has a domain, uses it */
1864 spin_lock_irqsave(&device_domain_lock, flags);
1865 list_for_each_entry(tmp, &device_domain_list, global) {
1866 if (tmp->segment == segment &&
1867 tmp->bus == bus && tmp->devfn == devfn) {
1868 found = tmp->domain;
1873 spin_unlock_irqrestore(&device_domain_lock, flags);
1874 free_devinfo_mem(info);
1875 domain_exit(domain);
1878 list_add(&info->link, &domain->devices);
1879 list_add(&info->global, &device_domain_list);
1880 spin_unlock_irqrestore(&device_domain_lock, flags);
1885 info = alloc_devinfo_mem();
1888 info->segment = segment;
1889 info->bus = pdev->bus->number;
1890 info->devfn = pdev->devfn;
1892 info->domain = domain;
1893 spin_lock_irqsave(&device_domain_lock, flags);
1894 /* somebody is fast */
1895 found = find_domain(pdev);
1896 if (found != NULL) {
1897 spin_unlock_irqrestore(&device_domain_lock, flags);
1898 if (found != domain) {
1899 domain_exit(domain);
1902 free_devinfo_mem(info);
1905 list_add(&info->link, &domain->devices);
1906 list_add(&info->global, &device_domain_list);
1907 pdev->dev.archdata.iommu = info;
1908 spin_unlock_irqrestore(&device_domain_lock, flags);
1911 /* recheck it here, maybe others set it */
1912 return find_domain(pdev);
/*
 * Bitmask selecting which devices get a 1:1 (identity) mapping.
 * IDENTMAP_ALL is set in init_dmars() when iommu_pass_through is set;
 * IDENTMAP_GFX and IDENTMAP_AZALIA are tested against graphics devices
 * and the Azalia device respectively in iommu_should_identity_map().
 */
1915 static int iommu_identity_mapping;
1916 #define IDENTMAP_ALL 1
1917 #define IDENTMAP_GFX 2
1918 #define IDENTMAP_AZALIA 4
/*
 * iommu_domain_identity_map - install a 1:1 mapping in a domain.
 * @domain: target domain.
 * @start:  first byte address of the region.
 * @end:    address whose page is the LAST page mapped; the range is
 *          treated as inclusive, since last_vpfn is end >> VTD_PAGE_SHIFT
 *          and last_vpfn - first_vpfn + 1 pages are mapped.
 *
 * Reserves the IOVA range in the domain's allocator (logging
 * "IOMMU: reserve iova failed" when that fails), clears any existing
 * PTEs for the range because it may overlap already-mapped memory, and
 * maps each VT-d page onto the physical page with the same number,
 * readable and writable.
 *
 * Return: the domain_pfn_mapping() result on the path that reaches it.
 */
1920 static int iommu_domain_identity_map(struct dmar_domain *domain,
1921 unsigned long long start,
1922 unsigned long long end)
1924 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1925 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1927 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1928 dma_to_mm_pfn(last_vpfn))) {
1929 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1933 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1934 start, end, domain->id);
1936 * RMRR range might have overlap with physical memory range,
1939 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1941 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1942 last_vpfn - first_vpfn + 1,
1943 DMA_PTE_READ|DMA_PTE_WRITE);
/*
 * iommu_prepare_identity_map - give one device a 1:1 mapping of a
 * physical region.
 * @pdev:  PCI device.
 * @start: first address of the region.
 * @end:   end address, forwarded to iommu_domain_identity_map().
 *
 * Obtains (or creates) the device's domain with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH.  When the device is in si_domain and
 * hardware pass-through is active, the request is only logged.
 * Otherwise the region is sanity-checked: a WARN with the DMI BIOS
 * details is emitted for a region that ends before it starts, or one
 * whose end exceeds the domain's address width.  A valid region is
 * identity-mapped and the device's context entries are then set up
 * with CONTEXT_TT_MULTI_LEVEL.  domain_exit() is called from the
 * function's cleanup code.
 */
1946 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1947 unsigned long long start,
1948 unsigned long long end)
1950 struct dmar_domain *domain;
1953 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1957 /* For _hardware_ passthrough, don't bother. But for software
1958 passthrough, we do it anyway -- it may indicate a memory
1959 range which is reserved in E820, so which didn't get set
1960 up to start with in si_domain */
1961 if (domain == si_domain && hw_pass_through) {
1962 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1963 pci_name(pdev), start, end);
1968 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1969 pci_name(pdev), start, end);
1972 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1973 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1974 dmi_get_system_info(DMI_BIOS_VENDOR),
1975 dmi_get_system_info(DMI_BIOS_VERSION),
1976 dmi_get_system_info(DMI_PRODUCT_VERSION));
1981 if (end >> agaw_to_width(domain->agaw)) {
1982 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1983 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1984 agaw_to_width(domain->agaw),
1985 dmi_get_system_info(DMI_BIOS_VENDOR),
1986 dmi_get_system_info(DMI_BIOS_VERSION),
1987 dmi_get_system_info(DMI_PRODUCT_VERSION));
1992 ret = iommu_domain_identity_map(domain, start, end);
1996 /* context entry init */
1997 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2004 domain_exit(domain);
/*
 * iommu_prepare_rmrr_dev - identity-map an RMRR region for one device.
 * @rmrr: reserved memory region reporting unit.
 * @pdev: PCI device listed for that region.
 *
 * Devices marked with DUMMY_DEVICE_DOMAIN_INFO are special-cased before
 * the mapping call; all others get the region
 * rmrr->base_address .. rmrr->end_address + 1 identity-mapped.
 *
 * NOTE(review): iommu_domain_identity_map() treats its end argument as
 * inclusive (last_vpfn = end >> VTD_PAGE_SHIFT, and last_vpfn is
 * mapped).  If rmrr->end_address is already the last byte of the
 * region, passing end_address + 1 maps one extra page whenever
 * end_address + 1 is page aligned -- confirm the intended semantics.
 */
2008 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2009 struct pci_dev *pdev)
2011 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2013 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2014 rmrr->end_address + 1);
/*
 * iommu_prepare_isa - floppy workaround: identity-map low memory for
 * the ISA/LPC bridge.
 *
 * With CONFIG_DMAR_FLOPPY_WA the first PCI_CLASS_BRIDGE_ISA device is
 * looked up and the range 0 .. 16*1024*1024 is identity-mapped for it;
 * a failure is logged with a hint that the floppy might not work.
 * A second definition of the function follows for the other
 * configuration.
 *
 * NOTE(review): iommu_domain_identity_map() treats the end address as
 * inclusive, so an end of 16*1024*1024 also maps the page starting at
 * 16MiB -- confirm whether 16*1024*1024 - 1 was intended.
 */
2017 #ifdef CONFIG_DMAR_FLOPPY_WA
2018 static inline void iommu_prepare_isa(void)
2020 struct pci_dev *pdev;
2023 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2027 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2028 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2031 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2032 "floppy might not work\n");
2036 static inline void iommu_prepare_isa(void)
2040 #endif /* !CONFIG_DMAR_FLOPPY_WA */
/* Forward declaration: used by si_domain_init() before its definition. */
2042 static int md_domain_init(struct dmar_domain *domain, int guest_width);
/*
 * si_domain_work_fn - callback used by si_domain_init() through
 * work_with_active_regions().
 * @start_pfn: first MM page frame of the active region.
 * @end_pfn:   end MM page frame of the active region.
 * @datax:     opaque pointer supplied by the caller; si_domain_init()
 *             passes the address of its result variable.
 *
 * Identity-maps the region into si_domain, converting both pfns to
 * byte addresses with PAGE_SHIFT, and stores the status through *ret.
 */
2044 static int __init si_domain_work_fn(unsigned long start_pfn,
2045 unsigned long end_pfn, void *datax)
2049 *ret = iommu_domain_identity_map(si_domain,
2050 (uint64_t)start_pfn << PAGE_SHIFT,
2051 (uint64_t)end_pfn << PAGE_SHIFT);
/*
 * si_domain_init - create the static identity ("si") domain.
 * @hw: hardware pass-through indicator supplied by the caller
 *      (iommu_prepare_static_identity_mapping() forwards its own
 *      argument).
 *
 * Allocates si_domain, attaches it to every active IOMMU, initialises
 * it with md_domain_init() at DEFAULT_DOMAIN_ADDRESS_WIDTH and marks it
 * DOMAIN_FLAG_STATIC_IDENTITY.  Attach and init failures call
 * domain_exit().  Finally the active memory regions of every online
 * node are walked with si_domain_work_fn().
 */
2056 static int __init si_domain_init(int hw)
2058 struct dmar_drhd_unit *drhd;
2059 struct intel_iommu *iommu;
2062 si_domain = alloc_domain();
2066 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2068 for_each_active_iommu(iommu, drhd) {
2069 ret = iommu_attach_domain(si_domain, iommu);
2071 domain_exit(si_domain);
2076 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2077 domain_exit(si_domain);
2081 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2086 for_each_online_node(nid) {
2087 work_with_active_regions(nid, si_domain_work_fn, &ret);
/* Forward declaration: called from iommu_no_mapping() before its definition. */
2095 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2096 struct pci_dev *pdev);
/*
 * identity_mapping - is @pdev currently attached to si_domain?
 * @pdev: PCI device.
 *
 * Takes a fast path when iommu_identity_mapping is zero (the common
 * case, marked likely()); otherwise walks si_domain->devices looking
 * for an entry whose dev pointer is @pdev.
 */
2097 static int identity_mapping(struct pci_dev *pdev)
2099 struct device_domain_info *info;
2101 if (likely(!iommu_identity_mapping))
2105 list_for_each_entry(info, &si_domain->devices, link)
2106 if (info->dev == pdev)
/*
 * domain_add_dev_info - attach a PCI device to an existing domain.
 * @domain: domain to attach to.
 * @pdev:   PCI device.
 *
 * Allocates a device_domain_info, sets up the device's context entries
 * with domain_context_mapping() (freeing the info again if that fails),
 * fills in segment/bus/devfn/domain and, under device_domain_lock,
 * links the info into the per-domain and global lists and stores it in
 * pdev->dev.archdata.iommu.
 */
2111 static int domain_add_dev_info(struct dmar_domain *domain,
2112 struct pci_dev *pdev,
2115 struct device_domain_info *info;
2116 unsigned long flags;
2119 info = alloc_devinfo_mem();
2123 ret = domain_context_mapping(domain, pdev, translation);
2125 free_devinfo_mem(info);
2129 info->segment = pci_domain_nr(pdev->bus);
2130 info->bus = pdev->bus->number;
2131 info->devfn = pdev->devfn;
2133 info->domain = domain;
2135 spin_lock_irqsave(&device_domain_lock, flags);
2136 list_add(&info->link, &domain->devices);
2137 list_add(&info->global, &device_domain_list);
2138 pdev->dev.archdata.iommu = info;
2139 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * iommu_should_identity_map - decide whether a device belongs in the
 * 1:1 domain.
 * @pdev:    PCI device.
 * @startup: callers pass 1 during boot-time setup and 0 at run time.
 *
 * Checks, in order: the Azalia and graphics special cases (when the
 * corresponding IDENTMAP_* bit is set), whether IDENTMAP_ALL is set at
 * all, and the bridge restrictions described in the comment below
 * (conventional PCI devices off the root bus, PCI-PCI bridges and
 * PCIe-to-PCI bridges).  The last visible test compares the device's
 * dma_mask against DMA_BIT_MASK(32); per the boot-time comment this
 * test is presumably skipped when @startup is set -- TODO confirm.
 */
2144 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2146 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2149 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2152 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2156 * We want to start off with all devices in the 1:1 domain, and
2157 * take them out later if we find they can't access all of memory.
2159 * However, we can't do this for PCI devices behind bridges,
2160 * because all PCI devices behind the same bridge will end up
2161 * with the same source-id on their transactions.
2163 * Practically speaking, we can't change things around for these
2164 * devices at run-time, because we can't be sure there'll be no
2165 * DMA transactions in flight for any of their siblings.
2167 * So PCI devices (unless they're on the root bus) as well as
2168 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2169 * the 1:1 domain, just in _case_ one of their siblings turns out
2170 * not to be able to map all of memory.
2172 if (!pci_is_pcie(pdev)) {
2173 if (!pci_is_root_bus(pdev->bus))
2175 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2177 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2181 * At boot time, we don't yet know if devices will be 64-bit capable.
2182 * Assume that they will -- if they turn out not to be, then we can
2183 * take them out of the 1:1 domain later.
2186 return pdev->dma_mask > DMA_BIT_MASK(32);
/*
 * iommu_prepare_static_identity_mapping - build si_domain and attach
 * the devices that should be identity-mapped.
 * @hw: non-zero selects hardware pass-through (CONTEXT_TT_PASS_THROUGH),
 *      zero selects software identity mapping (CONTEXT_TT_MULTI_LEVEL).
 *
 * Calls si_domain_init(), then walks all PCI devices and adds each one
 * accepted by iommu_should_identity_map(pdev, 1) to si_domain, logging
 * which kind of identity mapping is used.
 */
2191 static int __init iommu_prepare_static_identity_mapping(int hw)
2193 struct pci_dev *pdev = NULL;
2196 ret = si_domain_init(hw);
2200 for_each_pci_dev(pdev) {
2201 if (iommu_should_identity_map(pdev, 1)) {
2202 printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2203 hw ? "hardware" : "software", pci_name(pdev));
2205 ret = domain_add_dev_info(si_domain, pdev,
2206 hw ? CONTEXT_TT_PASS_THROUGH :
2207 CONTEXT_TT_MULTI_LEVEL);
/*
 * init_dmars - boot-time initialisation of all DMA-remapping units.
 *
 * Phases visible in the code:
 *  1. Walk the DRHD units to size the global tables, then allocate
 *     g_iommus[] and deferred_flush[] (one slot per IOMMU).
 *  2. For each unit: record the IOMMU in g_iommus[], initialise its
 *     domain bookkeeping and allocate its root entry.  hw_pass_through
 *     is cleared as soon as one IOMMU lacks pass-through support.
 *  3. Bring each unit to a known state: clear pending faults and
 *     disable queued invalidation left over from before OS handover.
 *  4. For each unit try dmar_enable_qi(); install the qi_* flush
 *     callbacks, or the register-based __iommu_flush_* callbacks as
 *     fallback, and log which one is used.
 *  5. Derive iommu_identity_mapping (IDENTMAP_ALL for
 *     iommu_pass_through, IDENTMAP_GFX under
 *     CONFIG_DMAR_BROKEN_GFX_WA), run check_tylersburg_isoch(), and set
 *     up the static identity mapping if any bit is set.
 *  6. Identity-map every RMRR region for each device it lists, then
 *     apply the ISA workaround via iommu_prepare_isa().
 *  7. For each unit: flush the write buffer, set up the fault
 *     interrupt, program the root entry, globally invalidate context
 *     cache and IOTLB, enable translation and disable the protected
 *     memory regions.
 *
 * A further loop over the DRHD units follows phase 7; its purpose is
 * not evident from the statements kept here (presumably error cleanup
 * -- TODO confirm).
 */
2216 int __init init_dmars(void)
2218 struct dmar_drhd_unit *drhd;
2219 struct dmar_rmrr_unit *rmrr;
2220 struct pci_dev *pdev;
2221 struct intel_iommu *iommu;
2227 * initialize and program root entry to not present
2230 for_each_drhd_unit(drhd) {
2233 * lock not needed as this is only incremented in the single
2234 * threaded kernel __init code path all other access are read
2239 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2242 printk(KERN_ERR "Allocating global iommu array failed\n");
2247 deferred_flush = kzalloc(g_num_of_iommus *
2248 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2249 if (!deferred_flush) {
2254 for_each_drhd_unit(drhd) {
2258 iommu = drhd->iommu;
2259 g_iommus[iommu->seq_id] = iommu;
2261 ret = iommu_init_domains(iommu);
2267 * we could share the same root & context tables
2268 * amoung all IOMMU's. Need to Split it later.
2270 ret = iommu_alloc_root_entry(iommu);
2272 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2275 if (!ecap_pass_through(iommu->ecap))
2276 hw_pass_through = 0;
2280 * Start from the sane iommu hardware state.
2282 for_each_drhd_unit(drhd) {
2286 iommu = drhd->iommu;
2289 * If the queued invalidation is already initialized by us
2290 * (for example, while enabling interrupt-remapping) then
2291 * we got the things already rolling from a sane state.
2297 * Clear any previous faults.
2299 dmar_fault(-1, iommu);
2301 * Disable queued invalidation if supported and already enabled
2302 * before OS handover.
2304 dmar_disable_qi(iommu);
2307 for_each_drhd_unit(drhd) {
2311 iommu = drhd->iommu;
2313 if (dmar_enable_qi(iommu)) {
2315 * Queued Invalidate not enabled, use Register Based
2318 iommu->flush.flush_context = __iommu_flush_context;
2319 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2320 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2323 (unsigned long long)drhd->reg_base_addr);
2325 iommu->flush.flush_context = qi_flush_context;
2326 iommu->flush.flush_iotlb = qi_flush_iotlb;
2327 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2330 (unsigned long long)drhd->reg_base_addr);
2334 if (iommu_pass_through)
2335 iommu_identity_mapping |= IDENTMAP_ALL;
2337 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2338 iommu_identity_mapping |= IDENTMAP_GFX;
2341 check_tylersburg_isoch();
2344 * If pass through is not set or not enabled, setup context entries for
2345 * identity mappings for rmrr, gfx, and isa and may fall back to static
2346 * identity mapping if iommu_identity_mapping is set.
2348 if (iommu_identity_mapping) {
2349 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2351 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2357 * for each dev attached to rmrr
2359 * locate drhd for dev, alloc domain for dev
2360 * allocate free domain
2361 * allocate page table entries for rmrr
2362 * if context not allocated for bus
2363 * allocate and init context
2364 * set present in root table for this bus
2365 * init context with domain, translation etc
2369 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2370 for_each_rmrr_units(rmrr) {
2371 for (i = 0; i < rmrr->devices_cnt; i++) {
2372 pdev = rmrr->devices[i];
2374 * some BIOS lists non-exist devices in DMAR
2379 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2382 "IOMMU: mapping reserved region failed\n");
2386 iommu_prepare_isa();
2391 * global invalidate context cache
2392 * global invalidate iotlb
2393 * enable translation
2395 for_each_drhd_unit(drhd) {
2398 iommu = drhd->iommu;
2400 iommu_flush_write_buffer(iommu);
2402 ret = dmar_set_interrupt(iommu);
2406 iommu_set_root_entry(iommu);
2408 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2409 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2411 ret = iommu_enable_translation(iommu);
2415 iommu_disable_protect_mem_regions(iommu);
2420 for_each_drhd_unit(drhd) {
2423 iommu = drhd->iommu;
2430 /* This takes a number of _MM_ pages, not VTD pages */
/*
 * intel_alloc_iova - allocate an I/O virtual address range for a device.
 * @dev:      device the allocation is for (used for the error message).
 * @domain:   domain whose IOVA allocator is used.
 * @nrpages:  number of MM pages to allocate.
 * @dma_mask: device DMA mask; clamped to DOMAIN_MAX_ADDR(domain->gaw).
 *
 * Unless dmar_forcedac is set, a device whose mask exceeds 32 bits
 * first gets an attempt below DMA_BIT_MASK(32); a second alloc_iova()
 * call uses the full (clamped) mask.  Failure of that call is logged
 * with KERN_ERR.
 *
 * NOTE(review): the "Allocating %ld-page iova for %s failed" message
 * has no trailing newline, so it may run into the next log line.
 */
2431 static struct iova *intel_alloc_iova(struct device *dev,
2432 struct dmar_domain *domain,
2433 unsigned long nrpages, uint64_t dma_mask)
2435 struct pci_dev *pdev = to_pci_dev(dev);
2436 struct iova *iova = NULL;
2438 /* Restrict dma_mask to the width that the iommu can handle */
2439 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2441 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2443 * First try to allocate an io virtual address in
2444 * DMA_BIT_MASK(32) and if that fails then try allocating
2447 iova = alloc_iova(&domain->iovad, nrpages,
2448 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2452 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2453 if (unlikely(!iova)) {
2454 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2455 nrpages, pci_name(pdev));
/*
 * __get_valid_domain_for_dev - slow path of get_valid_domain_for_dev().
 * @pdev: PCI device.
 *
 * Finds or creates the device's domain with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH and, if the device's context entries are
 * not yet mapped, installs them with CONTEXT_TT_MULTI_LEVEL.  Both
 * steps have a failure message.
 *
 * NOTE(review): neither failure message ends in a newline.
 */
2462 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2464 struct dmar_domain *domain;
2467 domain = get_domain_for_dev(pdev,
2468 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2471 "Allocating domain for %s failed", pci_name(pdev));
2475 /* make sure context mapping is ok */
2476 if (unlikely(!domain_context_mapped(pdev))) {
2477 ret = domain_context_mapping(domain, pdev,
2478 CONTEXT_TT_MULTI_LEVEL);
2481 "Domain context map for %s failed",
/*
 * get_valid_domain_for_dev - return a usable domain for a PCI device.
 * @dev: PCI device.
 *
 * Fast path: reads the device_domain_info cached in
 * dev->dev.archdata.iommu (no lock taken) and returns its domain.
 * Otherwise falls back to __get_valid_domain_for_dev().
 */
2490 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2492 struct device_domain_info *info;
2494 /* No lock here, assumes no domain exit in normal case */
2495 info = dev->dev.archdata.iommu;
2497 return info->domain;
2499 return __get_valid_domain_for_dev(dev);
/*
 * iommu_dummy - non-zero when @pdev carries the
 * DUMMY_DEVICE_DOMAIN_INFO marker in dev.archdata.iommu.
 */
2502 static int iommu_dummy(struct pci_dev *pdev)
2504 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2507 /* Check if the pdev needs to go through non-identity map and unmap process.*/
/*
 * iommu_no_mapping - decide whether DMA for @dev bypasses the
 * translated map/unmap path.
 * @dev: generic device; only PCI devices are examined further.
 *
 * Non-PCI devices and devices marked by iommu_dummy() are handled
 * first, followed by the case where identity mapping is disabled
 * altogether.  Otherwise the function also keeps si_domain membership
 * up to date:
 *  - a device currently in si_domain that no longer qualifies under
 *    iommu_should_identity_map(pdev, 0) is removed from si_domain and
 *    logged as using non-identity mapping;
 *  - a device not in si_domain that does qualify is added to it
 *    (pass-through or multi-level translation) and logged as using
 *    identity mapping.
 *
 * Callers in this file treat a non-zero return as "skip IOMMU mapping".
 */
2508 static int iommu_no_mapping(struct device *dev)
2510 struct pci_dev *pdev;
2513 if (unlikely(dev->bus != &pci_bus_type))
2516 pdev = to_pci_dev(dev);
2517 if (iommu_dummy(pdev))
2520 if (!iommu_identity_mapping)
2523 found = identity_mapping(pdev);
2525 if (iommu_should_identity_map(pdev, 0))
2529 * 32 bit DMA is removed from si_domain and fall back
2530 * to non-identity mapping.
2532 domain_remove_one_dev_info(si_domain, pdev);
2533 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2539 * In case of a detached 64 bit DMA device from vm, the device
2540 * is put into si_domain for identity mapping.
2542 if (iommu_should_identity_map(pdev, 0)) {
2544 ret = domain_add_dev_info(si_domain, pdev,
2546 CONTEXT_TT_PASS_THROUGH :
2547 CONTEXT_TT_MULTI_LEVEL);
2549 printk(KERN_INFO "64bit %s uses identity mapping\n",
/*
 * __intel_map_single - map one physically contiguous buffer for DMA.
 * @hwdev:    device performing the DMA (must be a PCI device).
 * @paddr:    physical start address of the buffer.
 * @size:     buffer length in bytes on entry.
 * @dir:      DMA direction; DMA_NONE triggers BUG_ON.
 * @dma_mask: addressing limit handed to intel_alloc_iova().
 *
 * After the iommu_no_mapping() check the function obtains the device's
 * domain, converts @size into a VT-d page count with aligned_nrpages()
 * (so from that point on `size` holds pages, not bytes), allocates an
 * IOVA range and maps whole pages with read and/or write permission
 * derived from @dir.  Read permission is also granted when the hardware
 * lacks zero-length-read support (!cap_zlr()).  On caching-mode
 * hardware the new range is invalidated with a page-selective IOTLB
 * flush; the write-buffer flush is presumably the alternative path.
 *
 * The returned DMA address is the IOVA of the first page plus the
 * offset of @paddr within its MM page.  The error path frees the IOVA
 * and logs the failed request.
 */
2559 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2560 size_t size, int dir, u64 dma_mask)
2562 struct pci_dev *pdev = to_pci_dev(hwdev);
2563 struct dmar_domain *domain;
2564 phys_addr_t start_paddr;
2568 struct intel_iommu *iommu;
2569 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2571 BUG_ON(dir == DMA_NONE);
2573 if (iommu_no_mapping(hwdev))
2576 domain = get_valid_domain_for_dev(pdev);
2580 iommu = domain_get_iommu(domain);
2581 size = aligned_nrpages(paddr, size);
2583 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2589 * Check if DMAR supports zero-length reads on write only
2592 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2593 !cap_zlr(iommu->cap))
2594 prot |= DMA_PTE_READ;
2595 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2596 prot |= DMA_PTE_WRITE;
2598 * paddr - (paddr + size) might be partial page, we should map the whole
2599 * page. Note: if two part of one page are separately mapped, we
2600 * might have two guest_addr mapping to the same host paddr, but this
2601 * is not a big problem
2603 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2604 mm_to_dma_pfn(paddr_pfn), size, prot);
2608 /* it's a non-present to present mapping. Only flush if caching mode */
2609 if (cap_caching_mode(iommu->cap))
2610 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2612 iommu_flush_write_buffer(iommu);
2614 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2615 start_paddr += paddr & ~PAGE_MASK;
2620 __free_iova(&domain->iovad, iova);
2621 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2622 pci_name(pdev), size, (unsigned long long)paddr, dir);
/*
 * intel_map_page - map part of a page for DMA.
 * @dev:    device performing the DMA (a PCI device).
 * @page:   page containing the buffer.
 * @offset: byte offset of the buffer inside @page.
 * @size:   buffer length in bytes.
 * @dir:    DMA direction.
 * @attrs:  DMA attributes; not used by this function.
 *
 * Wrapper around __intel_map_single() using the page's physical address
 * plus @offset and the PCI device's streaming dma_mask.
 */
2626 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2627 unsigned long offset, size_t size,
2628 enum dma_data_direction dir,
2629 struct dma_attrs *attrs)
2631 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2632 dir, to_pci_dev(dev)->dma_mask);
/*
 * flush_unmaps - process the deferred unmap queue of every IOMMU.
 *
 * For each IOMMU with queued entries: when the hardware is not in
 * caching mode a single global IOTLB flush is issued; in caching mode
 * each queued range gets its own page-selective flush instead.  Every
 * queued range is also passed to iommu_flush_dev_iotlb() with a mask
 * derived from its size, and its IOVA is released.  The per-IOMMU
 * queue index is then reset to 0.
 *
 * flush_unmaps_timeout() holds async_umap_flush_lock around its call.
 *
 * NOTE(review): the caching-mode call passes iova->pfn_lo and the MM
 * page count straight to iommu_flush_iotlb_psi(), whereas other callers
 * in this file convert with mm_to_dma_pfn() first.  This is equivalent
 * only if MM and VT-d page sizes match -- confirm.
 */
2635 static void flush_unmaps(void)
2641 /* just flush them all */
2642 for (i = 0; i < g_num_of_iommus; i++) {
2643 struct intel_iommu *iommu = g_iommus[i];
2647 if (!deferred_flush[i].next)
2650 /* In caching mode, global flushes turn emulation expensive */
2651 if (!cap_caching_mode(iommu->cap))
2652 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2653 DMA_TLB_GLOBAL_FLUSH);
2654 for (j = 0; j < deferred_flush[i].next; j++) {
2656 struct iova *iova = deferred_flush[i].iova[j];
2657 struct dmar_domain *domain = deferred_flush[i].domain[j];
2659 /* On real hardware multiple invalidations are expensive */
2660 if (cap_caching_mode(iommu->cap))
2661 iommu_flush_iotlb_psi(iommu, domain->id,
2662 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2664 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2665 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2666 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2668 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2670 deferred_flush[i].next = 0;
/*
 * flush_unmaps_timeout - timer-style callback for the deferred unmap
 * queue.
 * @data: callback argument; not referenced in the statements shown.
 *
 * Does its work with async_umap_flush_lock held and interrupts
 * disabled.
 */
2676 static void flush_unmaps_timeout(unsigned long data)
2678 unsigned long flags;
2680 spin_lock_irqsave(&async_umap_flush_lock, flags);
2682 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * add_unmap - queue an IOVA range for deferred IOTLB flush and release.
 * @dom:  domain the range belongs to.
 * @iova: range to release once it has been flushed.
 *
 * Under async_umap_flush_lock: the HIGH_WATER_MARK check guards the
 * queue size, then the (domain, iova) pair is appended to the
 * deferred_flush[] slot of the domain's IOMMU (indexed by seq_id).
 * unmap_timer is armed to fire 10 ms from now.
 */
2685 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2687 unsigned long flags;
2689 struct intel_iommu *iommu;
2691 spin_lock_irqsave(&async_umap_flush_lock, flags);
2692 if (list_size == HIGH_WATER_MARK)
2695 iommu = domain_get_iommu(dom);
2696 iommu_id = iommu->seq_id;
2698 next = deferred_flush[iommu_id].next;
2699 deferred_flush[iommu_id].domain[next] = dom;
2700 deferred_flush[iommu_id].iova[next] = iova;
2701 deferred_flush[iommu_id].next++;
2704 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2708 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * intel_unmap_page - undo a mapping made by intel_map_page().
 * @dev:      device the mapping belongs to (a PCI device).
 * @dev_addr: DMA address returned at map time.
 * @size:     mapping size; the whole IOVA allocation is torn down.
 * @dir:      DMA direction; not used in the statements shown.
 * @attrs:    DMA attributes; not used in the statements shown.
 *
 * After the iommu_no_mapping() check the IOVA allocation containing
 * @dev_addr is looked up; an unknown address triggers a one-time
 * warning.  The PTEs for the whole allocation are cleared and the
 * page-table pages covering it freed.  In strict mode
 * (intel_iommu_strict) the IOTLB is flushed page-selectively and the
 * IOVA released immediately; otherwise the release is queued with
 * add_unmap() to batch the flush cost.
 */
2711 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2712 size_t size, enum dma_data_direction dir,
2713 struct dma_attrs *attrs)
2715 struct pci_dev *pdev = to_pci_dev(dev);
2716 struct dmar_domain *domain;
2717 unsigned long start_pfn, last_pfn;
2719 struct intel_iommu *iommu;
2721 if (iommu_no_mapping(dev))
2724 domain = find_domain(pdev);
2727 iommu = domain_get_iommu(domain);
2729 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2730 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2731 (unsigned long long)dev_addr))
/* convert the MM-page allocation bounds into an inclusive VT-d pfn range */
2734 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2735 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2737 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2738 pci_name(pdev), start_pfn, last_pfn);
2740 /* clear the whole page */
2741 dma_pte_clear_range(domain, start_pfn, last_pfn);
2743 /* free page tables */
2744 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2746 if (intel_iommu_strict) {
2747 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2748 last_pfn - start_pfn + 1, 0);
2750 __free_iova(&domain->iovad, iova);
2752 add_unmap(domain, iova);
2754 * queue up the release of the unmap to save the 1/6th of the
2755 * cpu used up by the iotlb flush operation...
/*
 * intel_alloc_coherent - allocate and map a coherent DMA buffer.
 * @hwdev:      device the buffer is for.
 * @size:       requested size in bytes; rounded up to whole pages.
 * @dma_handle: receives the DMA address of the buffer.
 * @flags:      GFP flags for the page allocation.
 *
 * When the device goes through IOMMU translation, GFP_DMA and GFP_DMA32
 * are cleared from @flags.  When it bypasses translation and its
 * coherent mask is smaller than dma_get_required_mask(), the allocation
 * zone is chosen according to that mask.  The pages are zeroed and
 * mapped with __intel_map_single() using the coherent DMA mask; the
 * free_pages() call belongs to the failure path.
 */
2760 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2761 dma_addr_t *dma_handle, gfp_t flags)
2766 size = PAGE_ALIGN(size);
2767 order = get_order(size);
2769 if (!iommu_no_mapping(hwdev))
2770 flags &= ~(GFP_DMA | GFP_DMA32);
2771 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2772 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2778 vaddr = (void *)__get_free_pages(flags, order);
2781 memset(vaddr, 0, size);
2783 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2785 hwdev->coherent_dma_mask);
2788 free_pages((unsigned long)vaddr, order);
/*
 * intel_free_coherent - release a buffer from intel_alloc_coherent().
 * @hwdev:      device the buffer was allocated for.
 * @size:       size passed at allocation time; rounded up to pages the
 *              same way so the page order matches.
 * @vaddr:      kernel virtual address of the buffer.
 * @dma_handle: DMA address of the buffer.
 *
 * Unmaps the buffer (bidirectional) and frees the pages.
 */
2792 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2793 dma_addr_t dma_handle)
2797 size = PAGE_ALIGN(size);
2798 order = get_order(size);
2800 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2801 free_pages((unsigned long)vaddr, order);
/*
 * intel_unmap_sg - undo a scatterlist mapping.
 * @hwdev:  device the mapping belongs to (a PCI device).
 * @sglist: scatterlist that was mapped; only the DMA address of the
 *          first entry is used to find the IOVA allocation.
 * @nelems: number of entries; not used in the statements shown.
 * @dir:    DMA direction; not used in the statements shown.
 * @attrs:  DMA attributes; not used in the statements shown.
 *
 * Mirrors intel_unmap_page(): after the iommu_no_mapping() check the
 * allocation containing sglist[0].dma_address is looked up (one-time
 * warning if unknown), its PTEs are cleared and its page tables freed.
 * In strict mode the IOTLB is flushed and the IOVA released
 * immediately; otherwise the release is deferred through add_unmap().
 */
2804 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2805 int nelems, enum dma_data_direction dir,
2806 struct dma_attrs *attrs)
2808 struct pci_dev *pdev = to_pci_dev(hwdev);
2809 struct dmar_domain *domain;
2810 unsigned long start_pfn, last_pfn;
2812 struct intel_iommu *iommu;
2814 if (iommu_no_mapping(hwdev))
2817 domain = find_domain(pdev);
2820 iommu = domain_get_iommu(domain);
2822 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2823 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2824 (unsigned long long)sglist[0].dma_address))
2827 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2828 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2830 /* clear the whole page */
2831 dma_pte_clear_range(domain, start_pfn, last_pfn);
2833 /* free page tables */
2834 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2836 if (intel_iommu_strict) {
2837 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2838 last_pfn - start_pfn + 1, 0);
2840 __free_iova(&domain->iovad, iova);
2842 add_unmap(domain, iova);
2844 * queue up the release of the unmap to save the 1/6th of the
2845 * cpu used up by the iotlb flush operation...
/*
 * intel_nontranslate_map_sg - fill in a scatterlist for a device that is not
 * translated by the IOMMU.
 *
 * For every entry the DMA address is set to the physical address of the
 * entry's page plus its offset, and the DMA length to the entry's length.
 * An entry without a page triggers BUG_ON().  @hddev and @dir are not used by
 * the visible lines; the return statement is not visible in this extract.
 */
2850 static int intel_nontranslate_map_sg(struct device *hddev,
2851 struct scatterlist *sglist, int nelems, int dir)
2854 struct scatterlist *sg;
2856 for_each_sg(sglist, sg, nelems, i) {
2857 BUG_ON(!sg_page(sg));
2858 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2859 sg->dma_length = sg->length;
/*
 * intel_map_sg - map a scatterlist through the IOMMU as one IOVA range.
 * @hwdev:  device performing the DMA
 * @sglist: scatterlist to map
 * @nelems: number of entries in @sglist
 * @dir:    DMA direction; DMA_NONE triggers BUG_ON()
 * @attrs:  not referenced by the visible lines
 *
 * Devices that bypass translation are handed to intel_nontranslate_map_sg().
 * Otherwise "size" accumulates the page count of all entries (via
 * aligned_nrpages()), a single iova is allocated for that count, and
 * domain_sg_mapping() installs the PTEs starting at the iova's first pfn.
 * DMA_PTE_READ is set for to-device/bidirectional transfers and also whenever
 * the IOMMU lacks the zero-length-read capability (cap_zlr); DMA_PTE_WRITE is
 * set for from-device/bidirectional transfers.
 *
 * If domain_sg_mapping() fails, the PTEs and page tables for the range are
 * removed and the iova is released.  On success the IOTLB is flushed only in
 * caching mode; iommu_flush_write_buffer() is presumably the alternative
 * branch (its "else" line is not visible).  The "sglist->dma_length = 0"
 * assignment presumably belongs to the iova-allocation-failure path, whose
 * condition is likewise not visible in this extract.
 */
2864 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2865 enum dma_data_direction dir, struct dma_attrs *attrs)
2868 struct pci_dev *pdev = to_pci_dev(hwdev);
2869 struct dmar_domain *domain;
2872 struct iova *iova = NULL;
2874 struct scatterlist *sg;
2875 unsigned long start_vpfn;
2876 struct intel_iommu *iommu;
2878 BUG_ON(dir == DMA_NONE);
2879 if (iommu_no_mapping(hwdev))
2880 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2882 domain = get_valid_domain_for_dev(pdev);
2886 iommu = domain_get_iommu(domain);
2888 for_each_sg(sglist, sg, nelems, i)
2889 size += aligned_nrpages(sg->offset, sg->length);
2891 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2894 sglist->dma_length = 0;
2899 * Check if DMAR supports zero-length reads on write only
2902 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2903 !cap_zlr(iommu->cap))
2904 prot |= DMA_PTE_READ;
2905 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2906 prot |= DMA_PTE_WRITE;
2908 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2910 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2911 if (unlikely(ret)) {
2912 /* clear the page */
2913 dma_pte_clear_range(domain, start_vpfn,
2914 start_vpfn + size - 1);
2915 /* free page tables */
2916 dma_pte_free_pagetable(domain, start_vpfn,
2917 start_vpfn + size - 1);
2919 __free_iova(&domain->iovad, iova);
2923 /* it's a non-present to present mapping. Only flush if caching mode */
2924 if (cap_caching_mode(iommu->cap))
2925 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2927 iommu_flush_write_buffer(iommu);
/*
 * intel_mapping_error - .mapping_error callback of intel_dma_ops.
 * NOTE(review): only the signature is visible in this extract; the test
 * applied to @dma_addr cannot be documented from here.
 */
2932 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
/*
 * DMA operations table for devices behind the Intel IOMMU.  Each slot points
 * at the matching intel_* implementation in this file; intel_iommu_init()
 * installs the table with "dma_ops = &intel_dma_ops".
 */
2937 struct dma_map_ops intel_dma_ops = {
2938 .alloc_coherent = intel_alloc_coherent,
2939 .free_coherent = intel_free_coherent,
2940 .map_sg = intel_map_sg,
2941 .unmap_sg = intel_unmap_sg,
2942 .map_page = intel_map_page,
2943 .unmap_page = intel_unmap_page,
2944 .mapping_error = intel_mapping_error,
/*
 * iommu_domain_cache_init - create the "iommu_domain" slab cache, sized for
 * struct dmar_domain, and store it in iommu_domain_cache.  Logs a KERN_ERR
 * message when creation fails.  The remaining kmem_cache_create() arguments
 * and the return statements are not visible in this extract.
 */
2947 static inline int iommu_domain_cache_init(void)
2951 iommu_domain_cache = kmem_cache_create("iommu_domain",
2952 sizeof(struct dmar_domain),
2957 if (!iommu_domain_cache) {
2958 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
/*
 * iommu_devinfo_cache_init - create the "iommu_devinfo" slab cache, sized for
 * struct device_domain_info, and store it in iommu_devinfo_cache.  Logs a
 * KERN_ERR message when creation fails.  The remaining kmem_cache_create()
 * arguments and the return statements are not visible in this extract.
 */
2965 static inline int iommu_devinfo_cache_init(void)
2969 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2970 sizeof(struct device_domain_info),
2974 if (!iommu_devinfo_cache) {
2975 printk(KERN_ERR "Couldn't create devinfo cache\n");
/*
 * iommu_iova_cache_init - create the "iommu_iova" slab cache, sized for
 * struct iova, and store it in iommu_iova_cache.  Logs a KERN_ERR message
 * when creation fails.  The remaining kmem_cache_create() arguments and the
 * return statements are not visible in this extract.
 */
2982 static inline int iommu_iova_cache_init(void)
2986 iommu_iova_cache = kmem_cache_create("iommu_iova",
2987 sizeof(struct iova),
2991 if (!iommu_iova_cache) {
2992 printk(KERN_ERR "Couldn't create iova cache\n");
/*
 * iommu_init_mempool - create the three slab caches used by this driver, in
 * the order iova, domain, devinfo.
 *
 * The two kmem_cache_destroy() calls release the domain cache and then the
 * iova cache; presumably they are the unwind path taken when a later creation
 * step fails (the checks on "ret", the labels and the return are not visible
 * in this extract).
 */
2999 static int __init iommu_init_mempool(void)
3002 ret = iommu_iova_cache_init();
3006 ret = iommu_domain_cache_init();
3010 ret = iommu_devinfo_cache_init();
3014 kmem_cache_destroy(iommu_domain_cache);
3016 kmem_cache_destroy(iommu_iova_cache);
/*
 * iommu_exit_mempool - destroy the devinfo, domain and iova slab caches, i.e.
 * the reverse of the order in which iommu_init_mempool() creates them.
 */
3021 static void __init iommu_exit_mempool(void)
3023 kmem_cache_destroy(iommu_devinfo_cache);
3024 kmem_cache_destroy(iommu_domain_cache);
3025 kmem_cache_destroy(iommu_iova_cache);
/*
 * quirk_ioat_snb_local_iommu - PCI enable-time fixup registered for
 * PCI_VENDOR_ID_INTEL / PCI_DEVICE_ID_INTEL_IOAT_SNB.
 *
 * Reads the dword at config offset 0xb0 of device 0, function 0 on @pdev's
 * bus into "vtbar" and keeps only its upper 16 bits.  The DRHD unit matched
 * to @pdev is then expected to have reg_base_addr == vtbar + 0xa000.  When no
 * unit matches, or the address differs, a one-time warning tainted with
 * TAINT_FIRMWARE_WORKAROUND is emitted and the device is marked with
 * DUMMY_DEVICE_DOMAIN_INFO in dev.archdata.iommu.
 *
 * A failed config read is reported with dev_info(); the check on "rc" and the
 * early return are not visible in this extract.
 */
3029 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3031 struct dmar_drhd_unit *drhd;
3035 /* We know that this device on this chipset has its own IOMMU.
3036 * If we find it under a different IOMMU, then the BIOS is lying
3037 * to us. Hope that the IOMMU for this device is actually
3038 * disabled, and it needs no translation...
3040 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3042 /* "can't" happen */
3043 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3046 vtbar &= 0xffff0000;
3048 /* we know that the this iommu should be at offset 0xa000 from vtbar */
3049 drhd = dmar_find_matched_drhd_unit(pdev);
3050 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3051 TAINT_FIRMWARE_WORKAROUND,
3052 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3053 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3055 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
/*
 * init_no_remapping_devices - decide which DRHD units and devices are left
 * out of DMA remapping.  Makes two passes over all DRHD units.
 *
 * Pass 1: for units that are not include_all, scan devices[] for a non-NULL
 * entry; reaching devices_cnt means the unit has no PCI devices and, per the
 * existing comment, the unit is ignored.
 *
 * Pass 2: units already ignored or marked include_all are skipped.  The scan
 * looks for a present device that is not a graphics device (IS_GFX_DEVICE);
 * if one exists (i < devices_cnt) the unit is left alone.  Otherwise the unit
 * serves only graphics devices and each present device gets
 * DUMMY_DEVICE_DOMAIN_INFO in dev.archdata.iommu.
 *
 * NOTE(review): the loop "break"/"continue" statements, the assignments to
 * drhd->ignored and any extra condition on the gfx bypass are not visible in
 * this extract.
 */
3057 static void __init init_no_remapping_devices(void)
3059 struct dmar_drhd_unit *drhd;
3061 for_each_drhd_unit(drhd) {
3062 if (!drhd->include_all) {
3064 for (i = 0; i < drhd->devices_cnt; i++)
3065 if (drhd->devices[i] != NULL)
3067 /* ignore DMAR unit if no pci devices exist */
3068 if (i == drhd->devices_cnt)
3076 for_each_drhd_unit(drhd) {
3078 if (drhd->ignored || drhd->include_all)
3081 for (i = 0; i < drhd->devices_cnt; i++)
3082 if (drhd->devices[i] &&
3083 !IS_GFX_DEVICE(drhd->devices[i]))
3086 if (i < drhd->devices_cnt)
3089 /* bypass IOMMU if it is just for gfx devices */
3091 for (i = 0; i < drhd->devices_cnt; i++) {
3092 if (!drhd->devices[i])
3094 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3099 #ifdef CONFIG_SUSPEND
/*
 * init_iommu_hw - reprogram every active IOMMU; called from iommu_resume().
 *
 * First loop: dmar_reenable_qi() for each active IOMMU (a guarding condition,
 * if any, is not visible in this extract).
 * Second loop, per IOMMU: flush the write buffer, set the root entry, issue a
 * global context-cache invalidation and a global IOTLB flush, enable
 * translation, then disable the protected memory regions.
 */
3100 static int init_iommu_hw(void)
3102 struct dmar_drhd_unit *drhd;
3103 struct intel_iommu *iommu = NULL;
3105 for_each_active_iommu(iommu, drhd)
3107 dmar_reenable_qi(iommu);
3109 for_each_active_iommu(iommu, drhd) {
3110 iommu_flush_write_buffer(iommu);
3112 iommu_set_root_entry(iommu);
3114 iommu->flush.flush_context(iommu, 0, 0, 0,
3115 DMA_CCMD_GLOBAL_INVL);
3116 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3117 DMA_TLB_GLOBAL_FLUSH);
3118 iommu_enable_translation(iommu);
3119 iommu_disable_protect_mem_regions(iommu);
/*
 * iommu_flush_all - on every active IOMMU, issue a global context-cache
 * invalidation followed by a global IOTLB flush.
 */
3125 static void iommu_flush_all(void)
3127 struct dmar_drhd_unit *drhd;
3128 struct intel_iommu *iommu;
3130 for_each_active_iommu(iommu, drhd) {
3131 iommu->flush.flush_context(iommu, 0, 0, 0,
3132 DMA_CCMD_GLOBAL_INVL);
3133 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3134 DMA_TLB_GLOBAL_FLUSH);
/*
 * iommu_suspend - .suspend handler of iommu_sysclass.
 *
 * Step 1: allocate, per active IOMMU, a zeroed array of MAX_SR_DMAR_REGS u32
 * values in iommu->iommu_state.
 * Step 2: per active IOMMU, disable translation and, under register_lock,
 * save the four fault-event registers (FECTL, FEDATA, FEADDR, FEUADDR) into
 * that array.  iommu_resume() writes them back and frees the array.
 *
 * The trailing kfree() loop is presumably the unwind for an allocation
 * failure in step 1; the label, the allocation flags and the return values
 * are not visible in this extract.
 */
3138 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3140 struct dmar_drhd_unit *drhd;
3141 struct intel_iommu *iommu = NULL;
3144 for_each_active_iommu(iommu, drhd) {
3145 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3147 if (!iommu->iommu_state)
3153 for_each_active_iommu(iommu, drhd) {
3154 iommu_disable_translation(iommu);
3156 spin_lock_irqsave(&iommu->register_lock, flag);
3158 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3159 readl(iommu->reg + DMAR_FECTL_REG);
3160 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3161 readl(iommu->reg + DMAR_FEDATA_REG);
3162 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3163 readl(iommu->reg + DMAR_FEADDR_REG);
3164 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3165 readl(iommu->reg + DMAR_FEUADDR_REG);
3167 spin_unlock_irqrestore(&iommu->register_lock, flag);
3172 for_each_active_iommu(iommu, drhd)
3173 kfree(iommu->iommu_state);
/*
 * iommu_resume - .resume handler of iommu_sysclass.
 *
 * Re-initialises the hardware through init_iommu_hw() and issues a WARN if
 * that reports failure.  Then, per active IOMMU and under register_lock, the
 * four fault-event registers saved by iommu_suspend() (FECTL, FEDATA, FEADDR,
 * FEUADDR) are written back, and finally every iommu_state array is freed.
 * Return values are not visible in this extract.
 */
3178 static int iommu_resume(struct sys_device *dev)
3180 struct dmar_drhd_unit *drhd;
3181 struct intel_iommu *iommu = NULL;
3184 if (init_iommu_hw()) {
3185 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3189 for_each_active_iommu(iommu, drhd) {
3191 spin_lock_irqsave(&iommu->register_lock, flag);
3193 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3194 iommu->reg + DMAR_FECTL_REG);
3195 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3196 iommu->reg + DMAR_FEDATA_REG);
3197 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3198 iommu->reg + DMAR_FEADDR_REG);
3199 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3200 iommu->reg + DMAR_FEUADDR_REG);
3202 spin_unlock_irqrestore(&iommu->register_lock, flag);
3205 for_each_active_iommu(iommu, drhd)
3206 kfree(iommu->iommu_state);
/*
 * sysdev class whose suspend/resume hooks save and restore IOMMU state via
 * iommu_suspend() and iommu_resume().  Registered by init_iommu_sysfs().
 */
3211 static struct sysdev_class iommu_sysclass = {
3213 .resume = iommu_resume,
3214 .suspend = iommu_suspend,
/* The sys_device of class iommu_sysclass; registered by init_iommu_sysfs(). */
3217 static struct sys_device device_iommu = {
3218 .cls = &iommu_sysclass,
/*
 * init_iommu_sysfs - register iommu_sysclass and then device_iommu.
 *
 * The sysdev_class_unregister() call presumably undoes the class registration
 * when registering the device fails; the checks on "error" and the return
 * statements are not visible in this extract.
 */
3221 static int __init init_iommu_sysfs(void)
3225 error = sysdev_class_register(&iommu_sysclass);
3229 error = sysdev_register(&device_iommu);
3231 sysdev_class_unregister(&iommu_sysclass);
/*
 * Second definition of init_iommu_sysfs(); presumably the stub used when
 * CONFIG_SUSPEND is not set (the "#else" line and the body are not visible
 * in this extract).
 */
3237 static int __init init_iommu_sysfs(void)
3241 #endif /* CONFIG_SUSPEND */
/*
 * device_notifier - notifier callback installed through device_nb.
 * @nb:     notifier block (unused in the visible lines)
 * @action: bus notification code
 * @data:   the struct device the notification is about
 *
 * Devices that bypass the IOMMU (iommu_no_mapping()) are filtered out first,
 * then the device's domain is looked up.  Only BUS_NOTIFY_UNBOUND_DRIVER is
 * acted on, and only when iommu_pass_through is not set: the device is then
 * removed from its domain with domain_remove_one_dev_info().  The return
 * statements and the check on "domain" are not visible in this extract.
 */
3244 * Here we only respond to action of unbound device from driver.
3246 * Added device is not attached to its DMAR domain here yet. That will happen
3247 * when mapping the device to iova.
3249 static int device_notifier(struct notifier_block *nb,
3250 unsigned long action, void *data)
3252 struct device *dev = data;
3253 struct pci_dev *pdev = to_pci_dev(dev);
3254 struct dmar_domain *domain;
3256 if (iommu_no_mapping(dev))
3259 domain = find_domain(pdev);
3263 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3264 domain_remove_one_dev_info(domain, pdev);
/*
 * Notifier block wrapping device_notifier(); intel_iommu_init() registers it
 * on pci_bus_type with bus_register_notifier().
 */
3269 static struct notifier_block device_nb = {
3270 .notifier_call = device_notifier,
/*
 * intel_iommu_init - bring up Intel VT-d DMA remapping.
 *
 * Visible sequence:
 *  - force_on is taken from tboot_force_iommu(); the panic() calls are the
 *    tboot-enforced failure paths (their "if (force_on)" guards are
 *    presumably on lines not visible here).
 *  - dmar_table_init() and dmar_dev_scope_init() parse the DMAR data; per the
 *    existing comment this part is shared with interrupt remapping.
 *  - Nothing further is set up when no_iommu or dmar_disabled is set.
 *  - The slab caches and reserved IOVA ranges are initialised and devices
 *    exempt from remapping are marked (init_no_remapping_devices()).
 *  - On DMAR initialisation failure the reserved iova list and the slab
 *    caches are released again.
 *  - On success the unmap timer is initialised, intel_dma_ops becomes the
 *    active dma_ops, intel_iommu_ops is registered with register_iommu() and
 *    device_nb is registered on the PCI bus.
 *
 * NOTE(review): the return value of iommu_init_mempool() is discarded even
 * though the function returns int; a failed cache creation would go unnoticed
 * at this point.  Consider checking it before continuing.
 */
3273 int __init intel_iommu_init(void)
3278 /* VT-d is required for a TXT/tboot launch, so enforce that */
3279 force_on = tboot_force_iommu();
3281 if (dmar_table_init()) {
3283 panic("tboot: Failed to initialize DMAR table\n");
3287 if (dmar_dev_scope_init()) {
3289 panic("tboot: Failed to initialize DMAR device scope\n");
3294 * Check the need for DMA-remapping initialization now.
3295 * Above initialization will also be used by Interrupt-remapping.
3297 if (no_iommu || dmar_disabled)
3300 iommu_init_mempool();
3301 dmar_init_reserved_ranges();
3303 init_no_remapping_devices();
3308 panic("tboot: Failed to initialize DMARs\n");
3309 printk(KERN_ERR "IOMMU: dmar init failed\n");
3310 put_iova_domain(&reserved_iova_list);
3311 iommu_exit_mempool();
3315 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3317 init_timer(&unmap_timer);
3318 #ifdef CONFIG_SWIOTLB
3321 dma_ops = &intel_dma_ops;
3325 register_iommu(&intel_iommu_ops);
3327 bus_register_notifier(&pci_bus_type, &device_nb);
/*
 * iommu_detach_dependent_devices - detach the bridges that sit between @pdev
 * and its upstream PCIe bridge from @iommu.
 *
 * Does nothing useful without both arguments (the NULL check is visible, its
 * return is not).  "tmp" is the bridge returned by
 * pci_find_upstream_pcie_bridge().  Starting at @pdev's parent bridge, each
 * bridge below "tmp" is detached via iommu_detach_dev() while walking up
 * through bus->self.  Finally "tmp" itself is detached: a PCIe-to-PCI bridge
 * by its subordinate bus number with devfn 0, a legacy PCI bridge by its own
 * bus number.  The devfn arguments on the other calls and the check that
 * "tmp" is non-NULL are not visible in this extract.
 */
3332 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3333 struct pci_dev *pdev)
3335 struct pci_dev *tmp, *parent;
3337 if (!iommu || !pdev)
3340 /* dependent device detach */
3341 tmp = pci_find_upstream_pcie_bridge(pdev);
3342 /* Secondary interface's bus number and devfn 0 */
3344 parent = pdev->bus->self;
3345 while (parent != tmp) {
3346 iommu_detach_dev(iommu, parent->bus->number,
3348 parent = parent->bus->self;
3350 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3351 iommu_detach_dev(iommu,
3352 tmp->subordinate->number, 0);
3353 else /* this is a legacy PCI bridge */
3354 iommu_detach_dev(iommu, tmp->bus->number,
/*
 * domain_remove_one_dev_info - detach one PCI device from @domain.
 * @domain: domain the device currently belongs to
 * @pdev:   device to remove
 *
 * Walks domain->devices under device_domain_lock.  The entry whose bus and
 * devfn match @pdev is unlinked from both lists and the device's
 * archdata.iommu is cleared; the lock is dropped while the device IOTLB is
 * disabled, the device and its dependent bridges are detached from the IOMMU
 * and the info structure is freed, then the lock is re-taken.
 *
 * For the other entries the walk compares each device's IOMMU with @pdev's.
 * As the existing comment says, when no other device of this domain sits
 * behind the same IOMMU, that IOMMU's bit is cleared from domain->iommu_bmp,
 * iommu_count is decremented and the domain capabilities are recomputed,
 * all under domain->iommu_lock.  The flag carrying that result and the
 * surrounding conditions are not visible in this extract.
 */
3359 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3360 struct pci_dev *pdev)
3362 struct device_domain_info *info;
3363 struct intel_iommu *iommu;
3364 unsigned long flags;
3366 struct list_head *entry, *tmp;
3368 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3373 spin_lock_irqsave(&device_domain_lock, flags);
3374 list_for_each_safe(entry, tmp, &domain->devices) {
3375 info = list_entry(entry, struct device_domain_info, link);
3376 /* No need to compare PCI domain; it has to be the same */
3377 if (info->bus == pdev->bus->number &&
3378 info->devfn == pdev->devfn) {
3379 list_del(&info->link);
3380 list_del(&info->global);
3382 info->dev->dev.archdata.iommu = NULL;
3383 spin_unlock_irqrestore(&device_domain_lock, flags);
3385 iommu_disable_dev_iotlb(info);
3386 iommu_detach_dev(iommu, info->bus, info->devfn);
3387 iommu_detach_dependent_devices(iommu, pdev);
3388 free_devinfo_mem(info);
3390 spin_lock_irqsave(&device_domain_lock, flags);
3398 /* if there is no other devices under the same iommu
3399 * owned by this domain, clear this iommu in iommu_bmp
3400 * update iommu count and coherency
3402 if (iommu == device_to_iommu(info->segment, info->bus,
3408 unsigned long tmp_flags;
3409 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3410 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3411 domain->iommu_count--;
3412 domain_update_iommu_cap(domain);
3413 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3416 spin_unlock_irqrestore(&device_domain_lock, flags);
// vm_domain_remove_all_dev_info - detach every device from @domain.
//
// Repeatedly takes the first entry of domain->devices under
// device_domain_lock, unlinks it from both lists and clears the device's
// archdata.iommu.  The lock is dropped while the device IOTLB is disabled and
// the device plus its dependent bridges are detached from the IOMMU found by
// device_to_iommu().  Under domain->iommu_lock that IOMMU's bit is then
// cleared from domain->iommu_bmp; only if the bit was still set are
// iommu_count decremented and the domain capabilities recomputed.  The info
// structure is freed and device_domain_lock is re-taken for the next
// iteration.
3419 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3421 struct device_domain_info *info;
3422 struct intel_iommu *iommu;
3423 unsigned long flags1, flags2;
3425 spin_lock_irqsave(&device_domain_lock, flags1);
3426 while (!list_empty(&domain->devices)) {
3427 info = list_entry(domain->devices.next,
3428 struct device_domain_info, link);
3429 list_del(&info->link);
3430 list_del(&info->global);
3432 info->dev->dev.archdata.iommu = NULL;
3434 spin_unlock_irqrestore(&device_domain_lock, flags1);
3436 iommu_disable_dev_iotlb(info);
3437 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3438 iommu_detach_dev(iommu, info->bus, info->devfn);
3439 iommu_detach_dependent_devices(iommu, info->dev);
3441 /* clear this iommu in iommu_bmp, update iommu count
3444 spin_lock_irqsave(&domain->iommu_lock, flags2);
3445 if (test_and_clear_bit(iommu->seq_id,
3446 &domain->iommu_bmp)) {
3447 domain->iommu_count--;
3448 domain_update_iommu_cap(domain);
3450 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3452 free_devinfo_mem(info);
3453 spin_lock_irqsave(&device_domain_lock, flags1);
3455 spin_unlock_irqrestore(&device_domain_lock, flags1);
3458 /* domain id for virtual machine, it won't be set in context */
/* Handed out post-incremented by iommu_alloc_vm_domain() (domain->id = vm_domid++). */
3459 static unsigned long vm_domid;
/*
 * iommu_alloc_vm_domain - allocate a dmar_domain for virtual-machine use.
 *
 * The domain gets the next value of vm_domid as its id, its iommu_bmp is
 * zeroed (sizeof(unsigned long) bytes) and its flags are set to
 * DOMAIN_FLAG_VIRTUAL_MACHINE.  The allocation-failure check and the return
 * statements are not visible in this extract.
 */
3461 static struct dmar_domain *iommu_alloc_vm_domain(void)
3463 struct dmar_domain *domain;
3465 domain = alloc_domain_mem();
3469 domain->id = vm_domid++;
3471 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3472 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
/*
 * md_domain_init - initialise a freshly allocated domain.
 * @domain:      domain to set up
 * @guest_width: guest address width in bits, stored in domain->gaw
 *
 * Sets up the iova allocator (limit DMA_32BIT_PFN), the iommu_lock and the
 * reserved special ranges; derives agaw from the adjusted guest width;
 * initialises the device list; zeroes iommu_count, iommu_coherency,
 * iommu_snooping and max_addr; allocates the top-level page directory from
 * domain->nid and flushes PAGE_SIZE bytes of it with domain_flush_cache().
 * The allocation-failure check and the return values are not visible in this
 * extract.
 */
3477 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3481 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3482 spin_lock_init(&domain->iommu_lock);
3484 domain_reserve_special_ranges(domain);
3486 /* calculate AGAW */
3487 domain->gaw = guest_width;
3488 adjust_width = guestwidth_to_adjustwidth(guest_width);
3489 domain->agaw = width_to_agaw(adjust_width);
3491 INIT_LIST_HEAD(&domain->devices);
3493 domain->iommu_count = 0;
3494 domain->iommu_coherency = 0;
3495 domain->iommu_snooping = 0;
3496 domain->max_addr = 0;
3499 /* always allocate the top pgd */
3500 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3503 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
/*
 * iommu_free_vm_domain - remove @domain from the domain tables of all IOMMUs.
 *
 * For each DRHD unit's IOMMU, every set bit in iommu->domain_ids (up to
 * cap_ndoms()) is examined; where iommu->domains[i] is @domain, the bit is
 * cleared and the slot set to NULL under iommu->lock.  Any per-unit skip
 * condition and loop "break" are not visible in this extract.
 */
3507 static void iommu_free_vm_domain(struct dmar_domain *domain)
3509 unsigned long flags;
3510 struct dmar_drhd_unit *drhd;
3511 struct intel_iommu *iommu;
3513 unsigned long ndomains;
3515 for_each_drhd_unit(drhd) {
3518 iommu = drhd->iommu;
3520 ndomains = cap_ndoms(iommu->cap);
3521 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3522 if (iommu->domains[i] == domain) {
3523 spin_lock_irqsave(&iommu->lock, flags);
3524 clear_bit(i, iommu->domain_ids);
3525 iommu->domains[i] = NULL;
3526 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * vm_domain_exit - tear down and free a virtual-machine domain.
 *
 * Order of operations: detach all devices, release the iova allocator, clear
 * the PTEs and free the page tables for pfns 0 .. DOMAIN_MAX_PFN(gaw), drop
 * the domain from every IOMMU's domain table, then free the domain itself.
 * The guard referred to by the "Domain 0 is reserved" comment is not visible
 * in this extract.
 */
3533 static void vm_domain_exit(struct dmar_domain *domain)
3535 /* Domain 0 is reserved, so dont process it */
3539 vm_domain_remove_all_dev_info(domain);
3541 put_iova_domain(&domain->iovad);
3544 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3546 /* free page tables */
3547 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3549 iommu_free_vm_domain(domain);
3550 free_domain_mem(domain);
/*
 * intel_iommu_domain_init - .domain_init callback of intel_iommu_ops.
 *
 * Allocates a virtual-machine dmar_domain, initialises it with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH and stores it in domain->priv.  If
 * md_domain_init() fails, the partially built domain is destroyed with
 * vm_domain_exit().  The printk() calls, the NULL check and the return values
 * are only partly visible in this extract.
 */
3553 static int intel_iommu_domain_init(struct iommu_domain *domain)
3555 struct dmar_domain *dmar_domain;
3557 dmar_domain = iommu_alloc_vm_domain();
3560 "intel_iommu_domain_init: dmar_domain == NULL\n");
3563 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3565 "intel_iommu_domain_init() failed\n");
3566 vm_domain_exit(dmar_domain);
3569 domain->priv = dmar_domain;
/*
 * intel_iommu_domain_destroy - .domain_destroy callback of intel_iommu_ops.
 * Clears domain->priv first, then tears the dmar_domain down with
 * vm_domain_exit().
 */
3574 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3576 struct dmar_domain *dmar_domain = domain->priv;
3578 domain->priv = NULL;
3579 vm_domain_exit(dmar_domain);
/*
 * intel_iommu_attach_device - .attach_dev callback of intel_iommu_ops.
 *
 * 1. If the device already has a context mapping, it is removed from its old
 *    domain: one device only when the new domain is a VM or static-identity
 *    domain, otherwise the old domain's device info is dropped wholesale with
 *    domain_remove_dev_info().
 * 2. The device's IOMMU is looked up and its usable address width computed as
 *    agaw_to_width(iommu->agaw), capped by cap_mgaw().  If the domain already
 *    maps an address beyond 1 << width, an error is logged; otherwise the
 *    width becomes the domain's gaw.
 * 3. While the domain uses more page-table levels than the IOMMU supports,
 *    the top level is dropped: the pgd is replaced by the table its first
 *    entry points to and agaw is decremented.
 * 4. The device is added to the domain with CONTEXT_TT_MULTI_LEVEL.
 *
 * NOTE(review): in step 3 "pte" aliases dmar_domain->pgd, and
 * free_pgtable_page(dmar_domain->pgd) runs before dma_pte_addr(pte) is
 * evaluated, so the entry is read from a page that has already been released.
 * Reading the address first and freeing the old page afterwards would avoid
 * the use-after-free.
 *
 * The second parameter, several checks and the error returns are not visible
 * in this extract.
 */
3582 static int intel_iommu_attach_device(struct iommu_domain *domain,
3585 struct dmar_domain *dmar_domain = domain->priv;
3586 struct pci_dev *pdev = to_pci_dev(dev);
3587 struct intel_iommu *iommu;
3590 /* normally pdev is not mapped */
3591 if (unlikely(domain_context_mapped(pdev))) {
3592 struct dmar_domain *old_domain;
3594 old_domain = find_domain(pdev);
3596 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3597 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3598 domain_remove_one_dev_info(old_domain, pdev);
3600 domain_remove_dev_info(old_domain);
3604 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3609 /* check if this iommu agaw is sufficient for max mapped address */
3610 addr_width = agaw_to_width(iommu->agaw);
3611 if (addr_width > cap_mgaw(iommu->cap))
3612 addr_width = cap_mgaw(iommu->cap);
3614 if (dmar_domain->max_addr > (1LL << addr_width)) {
3615 printk(KERN_ERR "%s: iommu width (%d) is not "
3616 "sufficient for the mapped address (%llx)\n",
3617 __func__, addr_width, dmar_domain->max_addr);
3620 dmar_domain->gaw = addr_width;
3623 * Knock out extra levels of page tables if necessary
3625 while (iommu->agaw < dmar_domain->agaw) {
3626 struct dma_pte *pte;
3628 pte = dmar_domain->pgd;
3629 if (dma_pte_present(pte)) {
3630 free_pgtable_page(dmar_domain->pgd);
3631 dmar_domain->pgd = (struct dma_pte *)
3632 phys_to_virt(dma_pte_addr(pte));
3634 dmar_domain->agaw--;
3637 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
/*
 * intel_iommu_detach_device - .detach_dev callback of intel_iommu_ops.
 * Removes the PCI device from the dmar_domain stored in domain->priv via
 * domain_remove_one_dev_info().  The second parameter is not visible in this
 * extract.
 */
3640 static void intel_iommu_detach_device(struct iommu_domain *domain,
3643 struct dmar_domain *dmar_domain = domain->priv;
3644 struct pci_dev *pdev = to_pci_dev(dev);
3646 domain_remove_one_dev_info(dmar_domain, pdev);
/*
 * intel_iommu_map - .map callback of intel_iommu_ops.
 * @domain:     IOMMU-API domain; domain->priv holds the dmar_domain
 * @iova:       I/O virtual address to map at
 * @hpa:        host physical address to map to
 * @gfp_order:  mapping size is PAGE_SIZE << gfp_order bytes
 * @iommu_prot: IOMMU_READ / IOMMU_WRITE / IOMMU_CACHE flags
 *
 * Translates @iommu_prot into PTE bits; DMA_PTE_SNP is set only when
 * IOMMU_CACHE is requested and the domain supports snooping.  If the mapping
 * ends beyond the domain's current max_addr, the end is checked against the
 * domain's address width (__DOMAIN_MAX_ADDR(gaw) + 1): an error is logged
 * when it does not fit, otherwise max_addr is raised.  The byte size is then
 * converted to a page count with aligned_nrpages() and the range is mapped
 * with domain_pfn_mapping().  Error returns are not visible in this extract.
 */
3649 static int intel_iommu_map(struct iommu_domain *domain,
3650 unsigned long iova, phys_addr_t hpa,
3651 int gfp_order, int iommu_prot)
3653 struct dmar_domain *dmar_domain = domain->priv;
3659 if (iommu_prot & IOMMU_READ)
3660 prot |= DMA_PTE_READ;
3661 if (iommu_prot & IOMMU_WRITE)
3662 prot |= DMA_PTE_WRITE;
3663 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3664 prot |= DMA_PTE_SNP;
3666 size = PAGE_SIZE << gfp_order;
3667 max_addr = iova + size;
3668 if (dmar_domain->max_addr < max_addr) {
3671 /* check if minimum agaw is sufficient for mapped address */
3672 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3673 if (end < max_addr) {
3674 printk(KERN_ERR "%s: iommu width (%d) is not "
3675 "sufficient for the mapped address (%llx)\n",
3676 __func__, dmar_domain->gaw, max_addr);
3679 dmar_domain->max_addr = max_addr;
3681 /* Round up size to next multiple of PAGE_SIZE, if it and
3682 the low bits of hpa would take us onto the next page */
3683 size = aligned_nrpages(hpa, size);
3684 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3685 hpa >> VTD_PAGE_SHIFT, size, prot);
/*
 * intel_iommu_unmap - .unmap callback of intel_iommu_ops.
 * @iova:      start of the range to unmap
 * @gfp_order: range size is PAGE_SIZE << gfp_order bytes
 *
 * Clears the PTEs covering iova .. iova + size - 1.  If the range ended
 * exactly at the domain's max_addr, max_addr is pulled back to @iova.  The
 * return value is not visible in this extract.
 */
3689 static int intel_iommu_unmap(struct iommu_domain *domain,
3690 unsigned long iova, int gfp_order)
3692 struct dmar_domain *dmar_domain = domain->priv;
3693 size_t size = PAGE_SIZE << gfp_order;
3695 dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3696 (iova + size - 1) >> VTD_PAGE_SHIFT);
3698 if (dmar_domain->max_addr == iova + size)
3699 dmar_domain->max_addr = iova;
/*
 * intel_iommu_iova_to_phys - .iova_to_phys callback of intel_iommu_ops.
 * Looks up the PTE for the page containing the given iova with
 * pfn_to_dma_pte() and takes the physical address from it with
 * dma_pte_addr().  The iova parameter, the NULL-PTE check and the return are
 * not visible in this extract.
 */
3704 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3707 struct dmar_domain *dmar_domain = domain->priv;
3708 struct dma_pte *pte;
3711 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3713 phys = dma_pte_addr(pte);
/*
 * intel_iommu_domain_has_cap - .domain_has_cap callback of intel_iommu_ops.
 * IOMMU_CAP_CACHE_COHERENCY reports the domain's iommu_snooping value;
 * IOMMU_CAP_INTR_REMAP reports intr_remapping_enabled.  The cap parameter
 * and the fallback return are not visible in this extract.
 */
3718 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3721 struct dmar_domain *dmar_domain = domain->priv;
3723 if (cap == IOMMU_CAP_CACHE_COHERENCY)
3724 return dmar_domain->iommu_snooping;
3725 if (cap == IOMMU_CAP_INTR_REMAP)
3726 return intr_remapping_enabled;
/*
 * IOMMU-API operations table for VT-d.  Each slot points at the matching
 * intel_iommu_* implementation in this file; intel_iommu_init() registers the
 * table with register_iommu().
 */
3731 static struct iommu_ops intel_iommu_ops = {
3732 .domain_init = intel_iommu_domain_init,
3733 .domain_destroy = intel_iommu_domain_destroy,
3734 .attach_dev = intel_iommu_attach_device,
3735 .detach_dev = intel_iommu_detach_device,
3736 .map = intel_iommu_map,
3737 .unmap = intel_iommu_unmap,
3738 .iova_to_phys = intel_iommu_iova_to_phys,
3739 .domain_has_cap = intel_iommu_domain_has_cap,
/*
 * quirk_iommu_rwbf - PCI header fixup registered for Intel device 0x2a40.
 *
 * Logs that the write-buffer-flush capability is being forced (per the
 * existing comment, the Mobile 4 Series chipset neglects to advertise RWBF)
 * and, for device revision 0x07, logs that the IOMMU is disabled for
 * graphics.
 * NOTE(review): the statements that actually apply those two settings are
 * not visible in this extract.
 */
3742 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3745 * Mobile 4 Series Chipset neglects to set RWBF capability,
3748 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3751 /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3752 if (dev->revision == 0x07) {
3753 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3758 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
/*
 * Field values for the GGC config word read by
 * quirk_calpella_no_shadow_gtt().  All of them live in bits 8-11
 * (GGC_MEMORY_SIZE_MASK); GGC_MEMORY_VT_ENABLED is bit 11 of the word and is
 * set in each of the *_VT size encodings.
 */
3761 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
3762 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
3763 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
3764 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
3765 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
3766 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
3767 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
3768 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
/*
 * quirk_calpella_no_shadow_gtt - PCI header fixup registered for Intel
 * devices 0x0040, 0x0044, 0x0062 and 0x006a.
 *
 * Reads the GGC config word; if GGC_MEMORY_VT_ENABLED is clear, logs that the
 * BIOS allocated no shadow GTT and that the IOMMU is being disabled for
 * graphics.
 * NOTE(review): the statement that applies the setting and the handling of a
 * failed config read are not visible in this extract.
 */
3770 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3774 if (pci_read_config_word(dev, GGC, &ggc))
3777 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3778 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3787 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3788 ISOCH DMAR unit for the Azalia sound device, but not give it any
3789 TLB entries, which causes it to deadlock. Check for that. We do
3790 this in a function called from init_dmars(), instead of in a PCI
3791 quirk, because we don't want to print the obnoxious "BIOS broken"
3792 message if VT-d is actually disabled.
3794 static void __init check_tylersburg_isoch(void)
3796 struct pci_dev *pdev;
3797 uint32_t vtisochctrl;
3799 /* If there's no Azalia in the system anyway, forget it. */
3800 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3805 /* System Management Registers. Might be hidden, in which case
3806 we can't do the sanity check. But that's OK, because the
3807 known-broken BIOSes _don't_ actually hide it, so far. */
3808 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3812 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3819 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3820 if (vtisochctrl & 1)
3823 /* Drop all bits other than the number of TLB entries */
3824 vtisochctrl &= 0x1c;
3826 /* If we have the recommended number of TLB entries (16), fine. */
3827 if (vtisochctrl == 0x10)
3830 /* Zero TLB entries? You get to ride the short bus to school. */
3832 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3833 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3834 dmi_get_system_info(DMI_BIOS_VENDOR),
3835 dmi_get_system_info(DMI_BIOS_VERSION),
3836 dmi_get_system_info(DMI_PRODUCT_VERSION));
3837 iommu_identity_mapping |= IDENTMAP_AZALIA;
3841 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",