2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu is in this header on x86-64 */
39 #include <asm/cacheflush.h>
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
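/*
 * Example (illustrative, assuming the standard PCI class encoding): a VGA
 * controller has pdev->class == 0x030000, so class >> 16 == 0x03 ==
 * PCI_BASE_CLASS_DISPLAY and IS_GFX_DEVICE() is true; an ISA bridge has
 * pdev->class == 0x060100, so class >> 8 == 0x0601 == PCI_CLASS_BRIDGE_ISA.
 */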
46 #define IOAPIC_RANGE_START (0xfee00000)
47 #define IOAPIC_RANGE_END (0xfeefffff)
48 #define IOVA_START_ADDR (0x1000)
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
52 #define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1 minute */
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
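/*
 * Worked example: with the default 48-bit domain address width,
 * DOMAIN_MAX_ADDR(48) == ((u64)1 << 48) - 1 == 0xffffffffffff, and
 * DMAR_OPERATION_TIMEOUT is one minute worth of jiffies.
 */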
57 static void flush_unmaps_timeout(unsigned long data);
59 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
61 static struct intel_iommu *g_iommus;
62 /* bitmap for indexing intel_iommus */
63 static unsigned long *g_iommus_to_flush;
64 static int g_num_of_iommus;
66 static DEFINE_SPINLOCK(async_umap_flush_lock);
67 static LIST_HEAD(unmaps_to_do);
70 static long list_size;
71 static int high_watermark;
73 static struct dentry *intel_iommu_debug, *debug;
76 static void domain_remove_dev_info(struct dmar_domain *domain);
78 static int dmar_disabled;
79 static int __initdata dmar_map_gfx = 1;
80 static int dmar_forcedac;
81 static int intel_iommu_strict;
83 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
84 static DEFINE_SPINLOCK(device_domain_lock);
85 static LIST_HEAD(device_domain_list);
87 static int __init intel_iommu_setup(char *str)
92 if (!strncmp(str, "off", 3)) {
94 printk(KERN_INFO"Intel-IOMMU: disabled\n");
95 } else if (!strncmp(str, "igfx_off", 8)) {
98 "Intel-IOMMU: disable GFX device mapping\n");
99 } else if (!strncmp(str, "forcedac", 8)) {
101 "Intel-IOMMU: Forcing DAC for PCI devices\n");
103 } else if (!strncmp(str, "strict", 6)) {
105 "Intel-IOMMU: disable batched IOTLB flush\n");
106 intel_iommu_strict = 1;
109 str += strcspn(str, ",");
115 __setup("intel_iommu=", intel_iommu_setup);
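/*
 * Boot-time usage example for the parser above (options are comma
 * separated, per the strcspn(str, ",") walk): booting with
 * "intel_iommu=igfx_off,strict" skips the graphics 1:1 mapping and
 * disables the batched IOTLB flush, while "intel_iommu=off" disables
 * the driver entirely.
 */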
117 static struct kmem_cache *iommu_domain_cache;
118 static struct kmem_cache *iommu_devinfo_cache;
119 static struct kmem_cache *iommu_iova_cache;
121 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
126 /* trying to avoid low memory issues */
127 flags = current->flags & PF_MEMALLOC;
128 current->flags |= PF_MEMALLOC;
129 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
130 current->flags &= (~PF_MEMALLOC | flags);
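	/*
	 * Note on the PF_MEMALLOC restore above (the same pattern is used in
	 * alloc_pgtable_page() below): flags holds either 0 or PF_MEMALLOC,
	 * so ~PF_MEMALLOC | flags is all-ones when the bit was already set,
	 * and the mask therefore clears PF_MEMALLOC only if it was clear on
	 * entry.
	 */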
135 static inline void *alloc_pgtable_page(void)
140 /* trying to avoid low memory issues */
141 flags = current->flags & PF_MEMALLOC;
142 current->flags |= PF_MEMALLOC;
143 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
144 current->flags &= (~PF_MEMALLOC | flags);
148 static inline void free_pgtable_page(void *vaddr)
150 free_page((unsigned long)vaddr);
153 static inline void *alloc_domain_mem(void)
155 return iommu_kmem_cache_alloc(iommu_domain_cache);
158 static inline void free_domain_mem(void *vaddr)
160 kmem_cache_free(iommu_domain_cache, vaddr);
163 static inline void * alloc_devinfo_mem(void)
165 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
168 static inline void free_devinfo_mem(void *vaddr)
170 kmem_cache_free(iommu_devinfo_cache, vaddr);
173 struct iova *alloc_iova_mem(void)
175 return iommu_kmem_cache_alloc(iommu_iova_cache);
178 void free_iova_mem(struct iova *iova)
180 kmem_cache_free(iommu_iova_cache, iova);
183 static inline void __iommu_flush_cache(
184 struct intel_iommu *iommu, void *addr, int size)
186 if (!ecap_coherent(iommu->ecap))
187 clflush_cache_range(addr, size);
190 /* Gets context entry for a given bus and devfn */
191 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
194 struct root_entry *root;
195 struct context_entry *context;
196 unsigned long phy_addr;
199 spin_lock_irqsave(&iommu->lock, flags);
200 root = &iommu->root_entry[bus];
201 context = get_context_addr_from_root(root);
203 context = (struct context_entry *)alloc_pgtable_page();
205 spin_unlock_irqrestore(&iommu->lock, flags);
208 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
209 phy_addr = virt_to_phys((void *)context);
210 set_root_value(root, phy_addr);
211 set_root_present(root);
212 __iommu_flush_cache(iommu, root, sizeof(*root));
214 spin_unlock_irqrestore(&iommu->lock, flags);
215 return &context[devfn];
218 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
220 struct root_entry *root;
221 struct context_entry *context;
225 spin_lock_irqsave(&iommu->lock, flags);
226 root = &iommu->root_entry[bus];
227 context = get_context_addr_from_root(root);
232 ret = context_present(context[devfn]);
234 spin_unlock_irqrestore(&iommu->lock, flags);
238 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
240 struct root_entry *root;
241 struct context_entry *context;
244 spin_lock_irqsave(&iommu->lock, flags);
245 root = &iommu->root_entry[bus];
246 context = get_context_addr_from_root(root);
248 context_clear_entry(context[devfn]);
249 __iommu_flush_cache(iommu, &context[devfn], \
252 spin_unlock_irqrestore(&iommu->lock, flags);
255 static void free_context_table(struct intel_iommu *iommu)
257 struct root_entry *root;
260 struct context_entry *context;
262 spin_lock_irqsave(&iommu->lock, flags);
263 if (!iommu->root_entry) {
266 for (i = 0; i < ROOT_ENTRY_NR; i++) {
267 root = &iommu->root_entry[i];
268 context = get_context_addr_from_root(root);
270 free_pgtable_page(context);
272 free_pgtable_page(iommu->root_entry);
273 iommu->root_entry = NULL;
275 spin_unlock_irqrestore(&iommu->lock, flags);
278 /* page table handling */
279 #define LEVEL_STRIDE (9)
280 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
282 static inline int agaw_to_level(int agaw)
287 static inline int agaw_to_width(int agaw)
289 return 30 + agaw * LEVEL_STRIDE;
293 static inline int width_to_agaw(int width)
295 return (width - 30) / LEVEL_STRIDE;
298 static inline unsigned int level_to_offset_bits(int level)
300 return (12 + (level - 1) * LEVEL_STRIDE);
303 static inline int address_level_offset(u64 addr, int level)
305 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
308 static inline u64 level_mask(int level)
310 return ((u64)-1 << level_to_offset_bits(level));
313 static inline u64 level_size(int level)
315 return ((u64)1 << level_to_offset_bits(level));
318 static inline u64 align_to_level(u64 addr, int level)
320 return ((addr + level_size(level) - 1) & level_mask(level));
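/*
 * Illustrative sketch (guarded out, documentation only): how the helpers
 * above decompose a DMA address for the default 48-bit domain width.  The
 * agaw_to_level() body is elided above; the values below assume it returns
 * agaw + 2, i.e. agaw 2 selects a 4-level table.
 */
#if 0
static u64 example_addr = 0x12345678;
/* width_to_agaw(48) == 2, agaw_to_width(2) == 48 */
/* level_to_offset_bits(1) == 12, level_to_offset_bits(2) == 21 */
/* address_level_offset(example_addr, 1) == 0x145, level 2 gives 0x91 */
/* level_size(2) == 2MB, align_to_level(example_addr, 2) == 0x12400000 */
#endif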
323 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
325 int addr_width = agaw_to_width(domain->agaw);
326 struct dma_pte *parent, *pte = NULL;
327 int level = agaw_to_level(domain->agaw);
331 BUG_ON(!domain->pgd);
333 addr &= (((u64)1) << addr_width) - 1;
334 parent = domain->pgd;
336 spin_lock_irqsave(&domain->mapping_lock, flags);
340 offset = address_level_offset(addr, level);
341 pte = &parent[offset];
345 if (!dma_pte_present(*pte)) {
346 tmp_page = alloc_pgtable_page();
349 spin_unlock_irqrestore(&domain->mapping_lock,
353 __iommu_flush_cache(domain->iommu, tmp_page,
355 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
357 * high-level tables always set r/w; the last-level page
358 * table controls read/write
360 dma_set_pte_readable(*pte);
361 dma_set_pte_writable(*pte);
362 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
364 parent = phys_to_virt(dma_pte_addr(*pte));
368 spin_unlock_irqrestore(&domain->mapping_lock, flags);
372 /* return the pte for an address at a specific level */
373 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
376 struct dma_pte *parent, *pte = NULL;
377 int total = agaw_to_level(domain->agaw);
380 parent = domain->pgd;
381 while (level <= total) {
382 offset = address_level_offset(addr, total);
383 pte = &parent[offset];
387 if (!dma_pte_present(*pte))
389 parent = phys_to_virt(dma_pte_addr(*pte));
395 /* clear one page's page table */
396 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
398 struct dma_pte *pte = NULL;
400 /* get last level pte */
401 pte = dma_addr_level_pte(domain, addr, 1);
405 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
409 /* clear last level pte; a TLB flush should follow */
410 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
412 int addr_width = agaw_to_width(domain->agaw);
414 start &= (((u64)1) << addr_width) - 1;
415 end &= (((u64)1) << addr_width) - 1;
416 /* in case it's a partial page */
417 start = PAGE_ALIGN_4K(start);
420 /* we don't need a lock here; nobody else touches the iova range */
421 while (start < end) {
422 dma_pte_clear_one(domain, start);
423 start += PAGE_SIZE_4K;
427 /* free page table pages. last level pte should already be cleared */
428 static void dma_pte_free_pagetable(struct dmar_domain *domain,
431 int addr_width = agaw_to_width(domain->agaw);
433 int total = agaw_to_level(domain->agaw);
437 start &= (((u64)1) << addr_width) - 1;
438 end &= (((u64)1) << addr_width) - 1;
440 /* we don't need a lock here; nobody else touches the iova range */
442 while (level <= total) {
443 tmp = align_to_level(start, level);
444 if (tmp >= end || (tmp + level_size(level) > end))
448 pte = dma_addr_level_pte(domain, tmp, level);
451 phys_to_virt(dma_pte_addr(*pte)));
453 __iommu_flush_cache(domain->iommu,
456 tmp += level_size(level);
461 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
462 free_pgtable_page(domain->pgd);
468 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
470 struct root_entry *root;
473 root = (struct root_entry *)alloc_pgtable_page();
477 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
479 spin_lock_irqsave(&iommu->lock, flags);
480 iommu->root_entry = root;
481 spin_unlock_irqrestore(&iommu->lock, flags);
486 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
488 unsigned long start_time = jiffies;\
490 sts = op (iommu->reg + offset);\
493 if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
494 panic("DMAR hardware is malfunctioning\n");\
499 static void iommu_set_root_entry(struct intel_iommu *iommu)
505 addr = iommu->root_entry;
507 spin_lock_irqsave(&iommu->register_lock, flag);
508 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
510 cmd = iommu->gcmd | DMA_GCMD_SRTP;
511 writel(cmd, iommu->reg + DMAR_GCMD_REG);
513 /* Make sure hardware completes it */
514 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
515 readl, (sts & DMA_GSTS_RTPS), sts);
517 spin_unlock_irqrestore(&iommu->register_lock, flag);
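/*
 * Note on the wait pattern above: IOMMU_WAIT_OP() keeps re-reading the given
 * register until the supplied condition is true, and panics if
 * DMAR_OPERATION_TIMEOUT (one minute) expires first.  Here it polls
 * DMAR_GSTS_REG until DMA_GSTS_RTPS reports that the hardware has latched
 * the new root table pointer.
 */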
520 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
525 if (!cap_rwbf(iommu->cap))
527 val = iommu->gcmd | DMA_GCMD_WBF;
529 spin_lock_irqsave(&iommu->register_lock, flag);
530 writel(val, iommu->reg + DMAR_GCMD_REG);
532 /* Make sure hardware completes it */
533 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
534 readl, (!(val & DMA_GSTS_WBFS)), val);
536 spin_unlock_irqrestore(&iommu->register_lock, flag);
539 /* return value determines whether we need a write buffer flush */
540 static int __iommu_flush_context(struct intel_iommu *iommu,
541 u16 did, u16 source_id, u8 function_mask, u64 type,
542 int non_present_entry_flush)
548 * In the non-present entry flush case, if hardware doesn't cache
549 * non-present entries we do nothing, and if hardware does cache
550 * non-present entries, we flush entries of domain 0 (the domain id
551 * that is used to cache any non-present entries)
553 if (non_present_entry_flush) {
554 if (!cap_caching_mode(iommu->cap))
561 case DMA_CCMD_GLOBAL_INVL:
562 val = DMA_CCMD_GLOBAL_INVL;
564 case DMA_CCMD_DOMAIN_INVL:
565 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
567 case DMA_CCMD_DEVICE_INVL:
568 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
569 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
576 spin_lock_irqsave(&iommu->register_lock, flag);
577 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
579 /* Make sure hardware completes it */
580 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
581 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
583 spin_unlock_irqrestore(&iommu->register_lock, flag);
585 /* flushing context entries implicitly flushes the write buffer */
589 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
590 int non_present_entry_flush)
592 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
593 non_present_entry_flush);
596 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
597 int non_present_entry_flush)
599 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
600 non_present_entry_flush);
603 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
604 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
606 return __iommu_flush_context(iommu, did, source_id, function_mask,
607 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
610 /* return value determines whether we need a write buffer flush */
611 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
612 u64 addr, unsigned int size_order, u64 type,
613 int non_present_entry_flush)
615 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
616 u64 val = 0, val_iva = 0;
620 * In the non-present entry flush case, if hardware doesn't cache
621 * non-present entries we do nothing, and if hardware does cache
622 * non-present entries, we flush entries of domain 0 (the domain id
623 * that is used to cache any non-present entries)
625 if (non_present_entry_flush) {
626 if (!cap_caching_mode(iommu->cap))
633 case DMA_TLB_GLOBAL_FLUSH:
634 /* global flush doesn't need to set IVA_REG */
635 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
637 case DMA_TLB_DSI_FLUSH:
638 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
640 case DMA_TLB_PSI_FLUSH:
641 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
642 /* Note: always flush non-leaf currently */
643 val_iva = size_order | addr;
648 /* Note: set drain read/write */
651 * This is probably only needed to be extra safe; it looks like we
652 * can ignore it without any impact.
654 if (cap_read_drain(iommu->cap))
655 val |= DMA_TLB_READ_DRAIN;
657 if (cap_write_drain(iommu->cap))
658 val |= DMA_TLB_WRITE_DRAIN;
660 spin_lock_irqsave(&iommu->register_lock, flag);
661 /* Note: Only uses first TLB reg currently */
663 dmar_writeq(iommu->reg + tlb_offset, val_iva);
664 dmar_writeq(iommu->reg + tlb_offset + 8, val);
666 /* Make sure hardware completes it */
667 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
668 dmar_readq, (!(val & DMA_TLB_IVT)), val);
670 spin_unlock_irqrestore(&iommu->register_lock, flag);
672 /* check IOTLB invalidation granularity */
673 if (DMA_TLB_IAIG(val) == 0)
674 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
675 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
676 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
677 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
678 /* flushing the IOTLB implicitly flushes the write buffer */
682 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
683 int non_present_entry_flush)
685 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
686 non_present_entry_flush);
689 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
690 int non_present_entry_flush)
692 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
693 non_present_entry_flush);
696 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
697 u64 addr, unsigned int pages, int non_present_entry_flush)
701 BUG_ON(addr & (~PAGE_MASK_4K));
704 /* Fall back to domain-selective flush if there is no PSI support */
705 if (!cap_pgsel_inv(iommu->cap))
706 return iommu_flush_iotlb_dsi(iommu, did,
707 non_present_entry_flush);
710 * PSI requires the page size to be 2 ^ x, and the base address to be
711 * naturally aligned to the size
713 mask = ilog2(__roundup_pow_of_two(pages));
714 /* Fall back to domain-selective flush if the size is too big */
715 if (mask > cap_max_amask_val(iommu->cap))
716 return iommu_flush_iotlb_dsi(iommu, did,
717 non_present_entry_flush);
719 return __iommu_flush_iotlb(iommu, did, addr, mask,
720 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
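/*
 * Worked example for the mask computation above: flushing 9 pages rounds up
 * to 16, so mask == ilog2(16) == 4 and the hardware invalidates a 16-page,
 * naturally aligned region; had 4 exceeded cap_max_amask_val(), the code
 * would have fallen back to a domain-selective flush instead.
 */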
723 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
728 spin_lock_irqsave(&iommu->register_lock, flags);
729 pmen = readl(iommu->reg + DMAR_PMEN_REG);
730 pmen &= ~DMA_PMEN_EPM;
731 writel(pmen, iommu->reg + DMAR_PMEN_REG);
733 /* wait for the protected region status bit to clear */
734 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
735 readl, !(pmen & DMA_PMEN_PRS), pmen);
737 spin_unlock_irqrestore(&iommu->register_lock, flags);
740 static int iommu_enable_translation(struct intel_iommu *iommu)
745 spin_lock_irqsave(&iommu->register_lock, flags);
746 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
748 /* Make sure hardware completes it */
749 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
750 readl, (sts & DMA_GSTS_TES), sts);
752 iommu->gcmd |= DMA_GCMD_TE;
753 spin_unlock_irqrestore(&iommu->register_lock, flags);
757 static int iommu_disable_translation(struct intel_iommu *iommu)
762 spin_lock_irqsave(&iommu->register_lock, flag);
763 iommu->gcmd &= ~DMA_GCMD_TE;
764 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
766 /* Make sure hardware completes it */
767 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
768 readl, (!(sts & DMA_GSTS_TES)), sts);
770 spin_unlock_irqrestore(&iommu->register_lock, flag);
774 /* iommu interrupt handling. Most of it is MSI-like. */
776 static const char *fault_reason_strings[] =
779 "Present bit in root entry is clear",
780 "Present bit in context entry is clear",
781 "Invalid context entry",
782 "Access beyond MGAW",
783 "PTE Write access is not set",
784 "PTE Read access is not set",
785 "Next page table ptr is invalid",
786 "Root table address invalid",
787 "Context table ptr is invalid",
788 "non-zero reserved fields in RTP",
789 "non-zero reserved fields in CTP",
790 "non-zero reserved fields in PTE",
792 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
794 const char *dmar_get_fault_reason(u8 fault_reason)
796 if (fault_reason > MAX_FAULT_REASON_IDX)
799 return fault_reason_strings[fault_reason];
802 void dmar_msi_unmask(unsigned int irq)
804 struct intel_iommu *iommu = get_irq_data(irq);
808 spin_lock_irqsave(&iommu->register_lock, flag);
809 writel(0, iommu->reg + DMAR_FECTL_REG);
810 /* Read a reg to force the posted write to be flushed */
811 readl(iommu->reg + DMAR_FECTL_REG);
812 spin_unlock_irqrestore(&iommu->register_lock, flag);
815 void dmar_msi_mask(unsigned int irq)
818 struct intel_iommu *iommu = get_irq_data(irq);
821 spin_lock_irqsave(&iommu->register_lock, flag);
822 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
823 /* Read a reg to force the posted write to be flushed */
824 readl(iommu->reg + DMAR_FECTL_REG);
825 spin_unlock_irqrestore(&iommu->register_lock, flag);
828 void dmar_msi_write(int irq, struct msi_msg *msg)
830 struct intel_iommu *iommu = get_irq_data(irq);
833 spin_lock_irqsave(&iommu->register_lock, flag);
834 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
835 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
836 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
837 spin_unlock_irqrestore(&iommu->register_lock, flag);
840 void dmar_msi_read(int irq, struct msi_msg *msg)
842 struct intel_iommu *iommu = get_irq_data(irq);
845 spin_lock_irqsave(&iommu->register_lock, flag);
846 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
847 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
848 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
849 spin_unlock_irqrestore(&iommu->register_lock, flag);
852 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
853 u8 fault_reason, u16 source_id, u64 addr)
857 reason = dmar_get_fault_reason(fault_reason);
860 "DMAR:[%s] Request device [%02x:%02x.%d] "
862 "DMAR:[fault reason %02d] %s\n",
863 (type ? "DMA Read" : "DMA Write"),
864 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
865 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
869 #define PRIMARY_FAULT_REG_LEN (16)
870 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
872 struct intel_iommu *iommu = dev_id;
873 int reg, fault_index;
877 spin_lock_irqsave(&iommu->register_lock, flag);
878 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
880 /* TBD: ignore advanced fault log currently */
881 if (!(fault_status & DMA_FSTS_PPF))
884 fault_index = dma_fsts_fault_record_index(fault_status);
885 reg = cap_fault_reg_offset(iommu->cap);
893 /* highest 32 bits */
894 data = readl(iommu->reg + reg +
895 fault_index * PRIMARY_FAULT_REG_LEN + 12);
896 if (!(data & DMA_FRCD_F))
899 fault_reason = dma_frcd_fault_reason(data);
900 type = dma_frcd_type(data);
902 data = readl(iommu->reg + reg +
903 fault_index * PRIMARY_FAULT_REG_LEN + 8);
904 source_id = dma_frcd_source_id(data);
906 guest_addr = dmar_readq(iommu->reg + reg +
907 fault_index * PRIMARY_FAULT_REG_LEN);
908 guest_addr = dma_frcd_page_addr(guest_addr);
909 /* clear the fault */
910 writel(DMA_FRCD_F, iommu->reg + reg +
911 fault_index * PRIMARY_FAULT_REG_LEN + 12);
913 spin_unlock_irqrestore(&iommu->register_lock, flag);
915 iommu_page_fault_do_one(iommu, type, fault_reason,
916 source_id, guest_addr);
919 if (fault_index > cap_num_fault_regs(iommu->cap))
921 spin_lock_irqsave(&iommu->register_lock, flag);
924 /* clear primary fault overflow */
925 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
926 if (fault_status & DMA_FSTS_PFO)
927 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
929 spin_unlock_irqrestore(&iommu->register_lock, flag);
933 int dmar_set_interrupt(struct intel_iommu *iommu)
939 printk(KERN_ERR "IOMMU: no free vectors\n");
943 set_irq_data(irq, iommu);
946 ret = arch_setup_dmar_msi(irq);
948 set_irq_data(irq, NULL);
954 /* Process and clear any faults that are already pending */
955 iommu_page_fault(irq, iommu);
957 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
959 printk(KERN_ERR "IOMMU: can't request irq\n");
963 static int iommu_init_domains(struct intel_iommu *iommu)
965 unsigned long ndomains;
966 unsigned long nlongs;
968 ndomains = cap_ndoms(iommu->cap);
969 pr_debug("Number of Domains supported <%ld>\n", ndomains);
970 nlongs = BITS_TO_LONGS(ndomains);
972 /* TBD: there might be 64K domains,
973 * consider a different allocation scheme for future chips
975 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
976 if (!iommu->domain_ids) {
977 printk(KERN_ERR "Allocating domain id array failed\n");
980 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
982 if (!iommu->domains) {
983 printk(KERN_ERR "Allocating domain array failed\n");
984 kfree(iommu->domain_ids);
989 * if Caching mode is set, then invalid translations are tagged with
990 * domain id 0. Hence we need to pre-allocate it.
992 if (cap_caching_mode(iommu->cap))
993 set_bit(0, iommu->domain_ids);
996 static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
997 struct dmar_drhd_unit *drhd)
1003 iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
1005 printk(KERN_ERR "IOMMU: can't map the region\n");
1008 iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
1009 iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
1011 /* the registers might span more than one page */
1012 map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
1013 cap_max_fault_reg_offset(iommu->cap));
1014 map_size = PAGE_ALIGN_4K(map_size);
1015 if (map_size > PAGE_SIZE_4K) {
1016 iounmap(iommu->reg);
1017 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
1019 printk(KERN_ERR "IOMMU: can't map the region\n");
1024 ver = readl(iommu->reg + DMAR_VER_REG);
1025 pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1026 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1027 iommu->cap, iommu->ecap);
1028 ret = iommu_init_domains(iommu);
1031 spin_lock_init(&iommu->lock);
1032 spin_lock_init(&iommu->register_lock);
1034 drhd->iommu = iommu;
1037 iounmap(iommu->reg);
1043 static void domain_exit(struct dmar_domain *domain);
1044 static void free_iommu(struct intel_iommu *iommu)
1046 struct dmar_domain *domain;
1052 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1053 for (; i < cap_ndoms(iommu->cap); ) {
1054 domain = iommu->domains[i];
1055 clear_bit(i, iommu->domain_ids);
1056 domain_exit(domain);
1057 i = find_next_bit(iommu->domain_ids,
1058 cap_ndoms(iommu->cap), i+1);
1061 if (iommu->gcmd & DMA_GCMD_TE)
1062 iommu_disable_translation(iommu);
1065 set_irq_data(iommu->irq, NULL);
1066 /* This will mask the irq */
1067 free_irq(iommu->irq, iommu);
1068 destroy_irq(iommu->irq);
1071 kfree(iommu->domains);
1072 kfree(iommu->domain_ids);
1074 /* free context mapping */
1075 free_context_table(iommu);
1078 iounmap(iommu->reg);
1082 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1085 unsigned long ndomains;
1086 struct dmar_domain *domain;
1087 unsigned long flags;
1089 domain = alloc_domain_mem();
1093 ndomains = cap_ndoms(iommu->cap);
1095 spin_lock_irqsave(&iommu->lock, flags);
1096 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1097 if (num >= ndomains) {
1098 spin_unlock_irqrestore(&iommu->lock, flags);
1099 free_domain_mem(domain);
1100 printk(KERN_ERR "IOMMU: no free domain ids\n");
1104 set_bit(num, iommu->domain_ids);
1106 domain->iommu = iommu;
1107 iommu->domains[num] = domain;
1108 spin_unlock_irqrestore(&iommu->lock, flags);
1113 static void iommu_free_domain(struct dmar_domain *domain)
1115 unsigned long flags;
1117 spin_lock_irqsave(&domain->iommu->lock, flags);
1118 clear_bit(domain->id, domain->iommu->domain_ids);
1119 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1122 static struct iova_domain reserved_iova_list;
1123 static struct lock_class_key reserved_alloc_key;
1124 static struct lock_class_key reserved_rbtree_key;
1126 static void dmar_init_reserved_ranges(void)
1128 struct pci_dev *pdev = NULL;
1133 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1135 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1136 &reserved_alloc_key);
1137 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1138 &reserved_rbtree_key);
1140 /* IOAPIC ranges shouldn't be accessed by DMA */
1141 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1142 IOVA_PFN(IOAPIC_RANGE_END));
1144 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1146 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1147 for_each_pci_dev(pdev) {
1150 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1151 r = &pdev->resource[i];
1152 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1155 addr &= PAGE_MASK_4K;
1156 size = r->end - addr;
1157 size = PAGE_ALIGN_4K(size);
1158 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1159 IOVA_PFN(size + addr) - 1);
1161 printk(KERN_ERR "Reserve iova failed\n");
1167 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1169 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1172 static inline int guestwidth_to_adjustwidth(int gaw)
1175 int r = (gaw - 12) % 9;
1186 static int domain_init(struct dmar_domain *domain, int guest_width)
1188 struct intel_iommu *iommu;
1189 int adjust_width, agaw;
1190 unsigned long sagaw;
1192 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1193 spin_lock_init(&domain->mapping_lock);
1195 domain_reserve_special_ranges(domain);
1197 /* calculate AGAW */
1198 iommu = domain->iommu;
1199 if (guest_width > cap_mgaw(iommu->cap))
1200 guest_width = cap_mgaw(iommu->cap);
1201 domain->gaw = guest_width;
1202 adjust_width = guestwidth_to_adjustwidth(guest_width);
1203 agaw = width_to_agaw(adjust_width);
1204 sagaw = cap_sagaw(iommu->cap);
1205 if (!test_bit(agaw, &sagaw)) {
1206 /* hardware doesn't support it, choose a bigger one */
1207 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1208 agaw = find_next_bit(&sagaw, 5, agaw);
1212 domain->agaw = agaw;
1213 INIT_LIST_HEAD(&domain->devices);
1215 /* always allocate the top pgd */
1216 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1219 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1223 static void domain_exit(struct dmar_domain *domain)
1227 /* Domain 0 is reserved, so don't process it */
1231 domain_remove_dev_info(domain);
1233 put_iova_domain(&domain->iovad);
1234 end = DOMAIN_MAX_ADDR(domain->gaw);
1235 end = end & (~PAGE_MASK_4K);
1238 dma_pte_clear_range(domain, 0, end);
1240 /* free page tables */
1241 dma_pte_free_pagetable(domain, 0, end);
1243 iommu_free_domain(domain);
1244 free_domain_mem(domain);
1247 static int domain_context_mapping_one(struct dmar_domain *domain,
1250 struct context_entry *context;
1251 struct intel_iommu *iommu = domain->iommu;
1252 unsigned long flags;
1254 pr_debug("Set context mapping for %02x:%02x.%d\n",
1255 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1256 BUG_ON(!domain->pgd);
1257 context = device_to_context_entry(iommu, bus, devfn);
1260 spin_lock_irqsave(&iommu->lock, flags);
1261 if (context_present(*context)) {
1262 spin_unlock_irqrestore(&iommu->lock, flags);
1266 context_set_domain_id(*context, domain->id);
1267 context_set_address_width(*context, domain->agaw);
1268 context_set_address_root(*context, virt_to_phys(domain->pgd));
1269 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1270 context_set_fault_enable(*context);
1271 context_set_present(*context);
1272 __iommu_flush_cache(iommu, context, sizeof(*context));
1274 /* it's a non-present to present mapping */
1275 if (iommu_flush_context_device(iommu, domain->id,
1276 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1277 iommu_flush_write_buffer(iommu);
1279 iommu_flush_iotlb_dsi(iommu, 0, 0);
1280 spin_unlock_irqrestore(&iommu->lock, flags);
1285 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1288 struct pci_dev *tmp, *parent;
1290 ret = domain_context_mapping_one(domain, pdev->bus->number,
1295 /* dependent device mapping */
1296 tmp = pci_find_upstream_pcie_bridge(pdev);
1299 /* Secondary interface's bus number and devfn 0 */
1300 parent = pdev->bus->self;
1301 while (parent != tmp) {
1302 ret = domain_context_mapping_one(domain, parent->bus->number,
1306 parent = parent->bus->self;
1308 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1309 return domain_context_mapping_one(domain,
1310 tmp->subordinate->number, 0);
1311 else /* this is a legacy PCI bridge */
1312 return domain_context_mapping_one(domain,
1313 tmp->bus->number, tmp->devfn);
1316 static int domain_context_mapped(struct dmar_domain *domain,
1317 struct pci_dev *pdev)
1320 struct pci_dev *tmp, *parent;
1322 ret = device_context_mapped(domain->iommu,
1323 pdev->bus->number, pdev->devfn);
1326 /* dependent device mapping */
1327 tmp = pci_find_upstream_pcie_bridge(pdev);
1330 /* Secondary interface's bus number and devfn 0 */
1331 parent = pdev->bus->self;
1332 while (parent != tmp) {
1333 ret = device_context_mapped(domain->iommu, parent->bus->number,
1337 parent = parent->bus->self;
1340 return device_context_mapped(domain->iommu,
1341 tmp->subordinate->number, 0);
1343 return device_context_mapped(domain->iommu,
1344 tmp->bus->number, tmp->devfn);
1348 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1349 u64 hpa, size_t size, int prot)
1351 u64 start_pfn, end_pfn;
1352 struct dma_pte *pte;
1355 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1357 iova &= PAGE_MASK_4K;
1358 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1359 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1361 while (start_pfn < end_pfn) {
1362 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1365 /* We don't need a lock here; nobody else
1366 * touches this iova range
1368 BUG_ON(dma_pte_addr(*pte));
1369 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1370 dma_set_pte_prot(*pte, prot);
1371 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1378 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1380 clear_context_table(domain->iommu, bus, devfn);
1381 iommu_flush_context_global(domain->iommu, 0);
1382 iommu_flush_iotlb_global(domain->iommu, 0);
1385 static void domain_remove_dev_info(struct dmar_domain *domain)
1387 struct device_domain_info *info;
1388 unsigned long flags;
1390 spin_lock_irqsave(&device_domain_lock, flags);
1391 while (!list_empty(&domain->devices)) {
1392 info = list_entry(domain->devices.next,
1393 struct device_domain_info, link);
1394 list_del(&info->link);
1395 list_del(&info->global);
1397 info->dev->dev.archdata.iommu = NULL;
1398 spin_unlock_irqrestore(&device_domain_lock, flags);
1400 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1401 free_devinfo_mem(info);
1403 spin_lock_irqsave(&device_domain_lock, flags);
1405 spin_unlock_irqrestore(&device_domain_lock, flags);
1410 * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1412 struct dmar_domain *
1413 find_domain(struct pci_dev *pdev)
1415 struct device_domain_info *info;
1417 /* No lock here, assumes no domain exit in normal case */
1418 info = pdev->dev.archdata.iommu;
1420 return info->domain;
1424 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1425 struct pci_dev *dev)
1430 for (index = 0; index < cnt; index++)
1431 if (dev == devices[index])
1434 /* Check our parent */
1435 dev = dev->bus->self;
1441 static struct dmar_drhd_unit *
1442 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1444 struct dmar_drhd_unit *drhd = NULL;
1446 list_for_each_entry(drhd, &dmar_drhd_units, list) {
1447 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1448 drhd->devices_cnt, dev))
1455 /* domain is initialized */
1456 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1458 struct dmar_domain *domain, *found = NULL;
1459 struct intel_iommu *iommu;
1460 struct dmar_drhd_unit *drhd;
1461 struct device_domain_info *info, *tmp;
1462 struct pci_dev *dev_tmp;
1463 unsigned long flags;
1464 int bus = 0, devfn = 0;
1466 domain = find_domain(pdev);
1470 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1472 if (dev_tmp->is_pcie) {
1473 bus = dev_tmp->subordinate->number;
1476 bus = dev_tmp->bus->number;
1477 devfn = dev_tmp->devfn;
1479 spin_lock_irqsave(&device_domain_lock, flags);
1480 list_for_each_entry(info, &device_domain_list, global) {
1481 if (info->bus == bus && info->devfn == devfn) {
1482 found = info->domain;
1486 spin_unlock_irqrestore(&device_domain_lock, flags);
1487 /* pcie-pci bridge already has a domain, use it */
1494 /* Allocate new domain for the device */
1495 drhd = dmar_find_matched_drhd_unit(pdev);
1497 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1501 iommu = drhd->iommu;
1503 domain = iommu_alloc_domain(iommu);
1507 if (domain_init(domain, gaw)) {
1508 domain_exit(domain);
1512 /* register pcie-to-pci device */
1514 info = alloc_devinfo_mem();
1516 domain_exit(domain);
1520 info->devfn = devfn;
1522 info->domain = domain;
1523 /* This domain is shared by devices under p2p bridge */
1524 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1526 /* pcie-to-pci bridge already has a domain, use it */
1528 spin_lock_irqsave(&device_domain_lock, flags);
1529 list_for_each_entry(tmp, &device_domain_list, global) {
1530 if (tmp->bus == bus && tmp->devfn == devfn) {
1531 found = tmp->domain;
1536 free_devinfo_mem(info);
1537 domain_exit(domain);
1540 list_add(&info->link, &domain->devices);
1541 list_add(&info->global, &device_domain_list);
1543 spin_unlock_irqrestore(&device_domain_lock, flags);
1547 info = alloc_devinfo_mem();
1550 info->bus = pdev->bus->number;
1551 info->devfn = pdev->devfn;
1553 info->domain = domain;
1554 spin_lock_irqsave(&device_domain_lock, flags);
1555 /* somebody is fast */
1556 found = find_domain(pdev);
1557 if (found != NULL) {
1558 spin_unlock_irqrestore(&device_domain_lock, flags);
1559 if (found != domain) {
1560 domain_exit(domain);
1563 free_devinfo_mem(info);
1566 list_add(&info->link, &domain->devices);
1567 list_add(&info->global, &device_domain_list);
1568 pdev->dev.archdata.iommu = info;
1569 spin_unlock_irqrestore(&device_domain_lock, flags);
1572 /* recheck it here, maybe others set it */
1573 return find_domain(pdev);
1576 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1578 struct dmar_domain *domain;
1584 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1585 pci_name(pdev), start, end);
1586 /* page table init */
1587 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1591 /* The address might not be aligned */
1592 base = start & PAGE_MASK_4K;
1594 size = PAGE_ALIGN_4K(size);
1595 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1596 IOVA_PFN(base + size) - 1)) {
1597 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1602 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1603 size, base, pci_name(pdev));
1605 * RMRR range might have overlap with physical memory range,
1608 dma_pte_clear_range(domain, base, base + size);
1610 ret = domain_page_mapping(domain, base, base, size,
1611 DMA_PTE_READ|DMA_PTE_WRITE);
1615 /* context entry init */
1616 ret = domain_context_mapping(domain, pdev);
1620 domain_exit(domain);
1625 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1626 struct pci_dev *pdev)
1628 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1630 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1631 rmrr->end_address + 1);
1634 #ifdef CONFIG_DMAR_GFX_WA
1635 extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
1636 static void __init iommu_prepare_gfx_mapping(void)
1638 struct pci_dev *pdev = NULL;
1643 for_each_pci_dev(pdev) {
1644 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1645 !IS_GFX_DEVICE(pdev))
1647 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1649 slot = arch_get_ram_range(0, &base, &size);
1651 ret = iommu_prepare_identity_map(pdev,
1655 slot = arch_get_ram_range(slot, &base, &size);
1659 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1664 #ifdef CONFIG_DMAR_FLOPPY_WA
1665 static inline void iommu_prepare_isa(void)
1667 struct pci_dev *pdev;
1670 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1674 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1675 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1678 printk("IOMMU: Failed to create 0-16M identity map, "
1679 "floppy might not work\n");
1683 static inline void iommu_prepare_isa(void)
1687 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1689 int __init init_dmars(void)
1691 struct dmar_drhd_unit *drhd;
1692 struct dmar_rmrr_unit *rmrr;
1693 struct pci_dev *pdev;
1694 struct intel_iommu *iommu;
1695 int nlongs, i, ret, unit = 0;
1700 * initialize and program root entry to not present
1703 for_each_drhd_unit(drhd) {
1708 * lock not needed as this is only incremented in the single
1709 * threaded kernel __init code path; all other accesses are read-only
1714 nlongs = BITS_TO_LONGS(g_num_of_iommus);
1715 g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
1716 if (!g_iommus_to_flush) {
1717 printk(KERN_ERR "Intel-IOMMU: "
1718 "Allocating bitmap array failed\n");
1722 g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
1724 kfree(g_iommus_to_flush);
1730 for_each_drhd_unit(drhd) {
1733 iommu = alloc_iommu(&g_iommus[i], drhd);
1742 * we could share the same root & context tables
1743 * among all IOMMUs. Need to split it later.
1745 ret = iommu_alloc_root_entry(iommu);
1747 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1754 * for each dev attached to rmrr
1756 * locate drhd for dev, alloc domain for dev
1757 * allocate free domain
1758 * allocate page table entries for rmrr
1759 * if context not allocated for bus
1760 * allocate and init context
1761 * set present in root table for this bus
1762 * init context with domain, translation etc
1766 for_each_rmrr_units(rmrr) {
1767 for (i = 0; i < rmrr->devices_cnt; i++) {
1768 pdev = rmrr->devices[i];
1769 /* some BIOSes list non-existent devices in the DMAR table */
1772 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1775 "IOMMU: mapping reserved region failed\n");
1779 iommu_prepare_gfx_mapping();
1781 iommu_prepare_isa();
1786 * global invalidate context cache
1787 * global invalidate iotlb
1788 * enable translation
1790 for_each_drhd_unit(drhd) {
1793 iommu = drhd->iommu;
1794 sprintf (iommu->name, "dmar%d", unit++);
1796 iommu_flush_write_buffer(iommu);
1798 ret = dmar_set_interrupt(iommu);
1802 iommu_set_root_entry(iommu);
1804 iommu_flush_context_global(iommu, 0);
1805 iommu_flush_iotlb_global(iommu, 0);
1807 iommu_disable_protect_mem_regions(iommu);
1809 ret = iommu_enable_translation(iommu);
1816 for_each_drhd_unit(drhd) {
1819 iommu = drhd->iommu;
1826 static inline u64 aligned_size(u64 host_addr, size_t size)
1829 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1830 return PAGE_ALIGN_4K(addr);
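/*
 * Worked example: aligned_size(0x1003, 0x2000) == PAGE_ALIGN_4K(0x2003)
 * == 0x3000, i.e. a transfer that is not page aligned at either end is
 * rounded out to whole 4K pages before an IOVA range is allocated.
 */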
1834 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1838 /* Make sure it's in range */
1839 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1840 if (!size || (IOVA_START_ADDR + size > end))
1843 piova = alloc_iova(&domain->iovad,
1844 size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1848 static struct iova *
1849 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1852 struct pci_dev *pdev = to_pci_dev(dev);
1853 struct iova *iova = NULL;
1855 if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1856 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1859 * First try to allocate an io virtual address in
1860 * DMA_32BIT_MASK and if that fails then try allocating
1863 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1865 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1869 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1876 static struct dmar_domain *
1877 get_valid_domain_for_dev(struct pci_dev *pdev)
1879 struct dmar_domain *domain;
1882 domain = get_domain_for_dev(pdev,
1883 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1886 "Allocating domain for %s failed", pci_name(pdev));
1890 /* make sure context mapping is ok */
1891 if (unlikely(!domain_context_mapped(domain, pdev))) {
1892 ret = domain_context_mapping(domain, pdev);
1895 "Domain context map for %s failed",
1904 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1905 size_t size, int dir)
1907 struct pci_dev *pdev = to_pci_dev(hwdev);
1909 struct dmar_domain *domain;
1910 unsigned long start_addr;
1914 BUG_ON(dir == DMA_NONE);
1915 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1916 return virt_to_bus(addr);
1918 domain = get_valid_domain_for_dev(pdev);
1922 addr = (void *)virt_to_phys(addr);
1923 size = aligned_size((u64)addr, size);
1925 iova = __intel_alloc_iova(hwdev, domain, size);
1929 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1932 * Check if DMAR supports zero-length reads on write only
1935 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1936 !cap_zlr(domain->iommu->cap))
1937 prot |= DMA_PTE_READ;
1938 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1939 prot |= DMA_PTE_WRITE;
1941 * addr - (addr + size) might be a partial page, we should map the whole
1942 * page.  Note: if two parts of one page are separately mapped, we
1943 * might have two guest_addr mappings to the same host addr, but this
1944 * is not a big problem
1946 ret = domain_page_mapping(domain, start_addr,
1947 ((u64)addr) & PAGE_MASK_4K, size, prot);
1951 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1952 pci_name(pdev), size, (u64)addr,
1953 size, (u64)start_addr, dir);
1955 /* it's a non-present to present mapping */
1956 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1957 start_addr, size >> PAGE_SHIFT_4K, 1);
1959 iommu_flush_write_buffer(domain->iommu);
1961 return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
1965 __free_iova(&domain->iovad, iova);
1966 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1967 pci_name(pdev), size, (u64)addr, dir);
1971 static void flush_unmaps(void)
1973 struct iova *node, *n;
1974 unsigned long flags;
1977 spin_lock_irqsave(&async_umap_flush_lock, flags);
1980 /* just flush them all */
1981 for (i = 0; i < g_num_of_iommus; i++) {
1982 if (test_and_clear_bit(i, g_iommus_to_flush))
1983 iommu_flush_iotlb_global(&g_iommus[i], 0);
1986 list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
1988 list_del(&node->list);
1989 __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
1993 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1996 static void flush_unmaps_timeout(unsigned long data)
2001 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2003 unsigned long flags;
2005 spin_lock_irqsave(&async_umap_flush_lock, flags);
2007 list_add(&iova->list, &unmaps_to_do);
2008 set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
2011 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2015 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
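/*
 * Design note for the deferred-unmap path above: instead of flushing the
 * IOTLB on every unmap, IOVAs are queued on unmaps_to_do and the owning
 * IOMMU is flagged in g_iommus_to_flush; flush_unmaps() then issues one
 * global IOTLB flush per flagged IOMMU and frees the queued IOVAs.  The
 * unmap_timer is re-armed for 10ms in add_unmap(), and intel_unmap_single()
 * additionally compares list_size against high_watermark (250 by default,
 * tunable via debugfs).  Booting with "intel_iommu=strict" skips the
 * batching and flushes synchronously.
 */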
2018 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
2019 size_t size, int dir)
2021 struct pci_dev *pdev = to_pci_dev(dev);
2022 struct dmar_domain *domain;
2023 unsigned long start_addr;
2026 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2028 domain = find_domain(pdev);
2031 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2035 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2036 size = aligned_size((u64)dev_addr, size);
2038 pr_debug("Device %s unmapping: %lx@%llx\n",
2039 pci_name(pdev), size, (u64)start_addr);
2041 /* clear the whole page */
2042 dma_pte_clear_range(domain, start_addr, start_addr + size);
2043 /* free page tables */
2044 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2045 if (intel_iommu_strict) {
2046 if (iommu_flush_iotlb_psi(domain->iommu,
2047 domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
2048 iommu_flush_write_buffer(domain->iommu);
2050 __free_iova(&domain->iovad, iova);
2052 add_unmap(domain, iova);
2054 * queue up the release of the unmap to save the roughly 1/6 of the
2055 * CPU time used up by the IOTLB flush operation...
2057 if (list_size > high_watermark)
2062 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
2063 dma_addr_t *dma_handle, gfp_t flags)
2068 size = PAGE_ALIGN_4K(size);
2069 order = get_order(size);
2070 flags &= ~(GFP_DMA | GFP_DMA32);
2072 vaddr = (void *)__get_free_pages(flags, order);
2075 memset(vaddr, 0, size);
2077 *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
2080 free_pages((unsigned long)vaddr, order);
2084 static void intel_free_coherent(struct device *hwdev, size_t size,
2085 void *vaddr, dma_addr_t dma_handle)
2089 size = PAGE_ALIGN_4K(size);
2090 order = get_order(size);
2092 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2093 free_pages((unsigned long)vaddr, order);
2096 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2097 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2098 int nelems, int dir)
2101 struct pci_dev *pdev = to_pci_dev(hwdev);
2102 struct dmar_domain *domain;
2103 unsigned long start_addr;
2107 struct scatterlist *sg;
2109 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2112 domain = find_domain(pdev);
2114 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2117 for_each_sg(sglist, sg, nelems, i) {
2118 addr = SG_ENT_VIRT_ADDRESS(sg);
2119 size += aligned_size((u64)addr, sg->length);
2122 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2124 /* clear the whole page */
2125 dma_pte_clear_range(domain, start_addr, start_addr + size);
2126 /* free page tables */
2127 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2129 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2130 size >> PAGE_SHIFT_4K, 0))
2131 iommu_flush_write_buffer(domain->iommu);
2134 __free_iova(&domain->iovad, iova);
2137 static int intel_nontranslate_map_sg(struct device *hddev,
2138 struct scatterlist *sglist, int nelems, int dir)
2141 struct scatterlist *sg;
2143 for_each_sg(sglist, sg, nelems, i) {
2144 BUG_ON(!sg_page(sg));
2145 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2146 sg->dma_length = sg->length;
2151 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2152 int nelems, int dir)
2156 struct pci_dev *pdev = to_pci_dev(hwdev);
2157 struct dmar_domain *domain;
2161 struct iova *iova = NULL;
2163 struct scatterlist *sg;
2164 unsigned long start_addr;
2166 BUG_ON(dir == DMA_NONE);
2167 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2168 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2170 domain = get_valid_domain_for_dev(pdev);
2174 for_each_sg(sglist, sg, nelems, i) {
2175 addr = SG_ENT_VIRT_ADDRESS(sg);
2176 addr = (void *)virt_to_phys(addr);
2177 size += aligned_size((u64)addr, sg->length);
2180 iova = __intel_alloc_iova(hwdev, domain, size);
2182 sglist->dma_length = 0;
2187 * Check if DMAR supports zero-length reads on write only
2190 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2191 !cap_zlr(domain->iommu->cap))
2192 prot |= DMA_PTE_READ;
2193 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2194 prot |= DMA_PTE_WRITE;
2196 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2198 for_each_sg(sglist, sg, nelems, i) {
2199 addr = SG_ENT_VIRT_ADDRESS(sg);
2200 addr = (void *)virt_to_phys(addr);
2201 size = aligned_size((u64)addr, sg->length);
2202 ret = domain_page_mapping(domain, start_addr + offset,
2203 ((u64)addr) & PAGE_MASK_4K,
2206 /* clear the page */
2207 dma_pte_clear_range(domain, start_addr,
2208 start_addr + offset);
2209 /* free page tables */
2210 dma_pte_free_pagetable(domain, start_addr,
2211 start_addr + offset);
2213 __free_iova(&domain->iovad, iova);
2216 sg->dma_address = start_addr + offset +
2217 ((u64)addr & (~PAGE_MASK_4K));
2218 sg->dma_length = sg->length;
2222 /* it's a non-present to present mapping */
2223 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2224 start_addr, offset >> PAGE_SHIFT_4K, 1))
2225 iommu_flush_write_buffer(domain->iommu);
2229 static struct dma_mapping_ops intel_dma_ops = {
2230 .alloc_coherent = intel_alloc_coherent,
2231 .free_coherent = intel_free_coherent,
2232 .map_single = intel_map_single,
2233 .unmap_single = intel_unmap_single,
2234 .map_sg = intel_map_sg,
2235 .unmap_sg = intel_unmap_sg,
2238 static inline int iommu_domain_cache_init(void)
2242 iommu_domain_cache = kmem_cache_create("iommu_domain",
2243 sizeof(struct dmar_domain),
2248 if (!iommu_domain_cache) {
2249 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2256 static inline int iommu_devinfo_cache_init(void)
2260 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2261 sizeof(struct device_domain_info),
2266 if (!iommu_devinfo_cache) {
2267 printk(KERN_ERR "Couldn't create devinfo cache\n");
2274 static inline int iommu_iova_cache_init(void)
2278 iommu_iova_cache = kmem_cache_create("iommu_iova",
2279 sizeof(struct iova),
2284 if (!iommu_iova_cache) {
2285 printk(KERN_ERR "Couldn't create iova cache\n");
2292 static int __init iommu_init_mempool(void)
2295 ret = iommu_iova_cache_init();
2299 ret = iommu_domain_cache_init();
2303 ret = iommu_devinfo_cache_init();
2307 kmem_cache_destroy(iommu_domain_cache);
2309 kmem_cache_destroy(iommu_iova_cache);
2314 static void __init iommu_exit_mempool(void)
2316 kmem_cache_destroy(iommu_devinfo_cache);
2317 kmem_cache_destroy(iommu_domain_cache);
2318 kmem_cache_destroy(iommu_iova_cache);
2322 void __init detect_intel_iommu(void)
2324 if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2326 if (early_dmar_detect()) {
2331 static void __init init_no_remapping_devices(void)
2333 struct dmar_drhd_unit *drhd;
2335 for_each_drhd_unit(drhd) {
2336 if (!drhd->include_all) {
2338 for (i = 0; i < drhd->devices_cnt; i++)
2339 if (drhd->devices[i] != NULL)
2341 /* ignore DMAR unit if no pci devices exist */
2342 if (i == drhd->devices_cnt)
2350 for_each_drhd_unit(drhd) {
2352 if (drhd->ignored || drhd->include_all)
2355 for (i = 0; i < drhd->devices_cnt; i++)
2356 if (drhd->devices[i] &&
2357 !IS_GFX_DEVICE(drhd->devices[i]))
2360 if (i < drhd->devices_cnt)
2363 /* bypass IOMMU if it is just for gfx devices */
2365 for (i = 0; i < drhd->devices_cnt; i++) {
2366 if (!drhd->devices[i])
2368 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2373 int __init intel_iommu_init(void)
2377 if (no_iommu || swiotlb || dmar_disabled)
2380 if (dmar_table_init())
2383 high_watermark = 250;
2384 intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
2385 debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
2386 intel_iommu_debug, &high_watermark);
2387 iommu_init_mempool();
2388 dmar_init_reserved_ranges();
2390 init_no_remapping_devices();
2394 printk(KERN_ERR "IOMMU: dmar init failed\n");
2395 put_iova_domain(&reserved_iova_list);
2396 iommu_exit_mempool();
2400 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2402 init_timer(&unmap_timer);
2404 dma_ops = &intel_dma_ops;
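/*
 * Illustrative usage sketch (hypothetical driver code, documentation only):
 * once dma_ops points at intel_dma_ops, an ordinary driver call such as
 *
 *	dma_addr_t handle = dma_map_single(&pdev->dev, buf, len,
 *					   DMA_TO_DEVICE);
 *	...
 *	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 *
 * is routed into intel_map_single()/intel_unmap_single() above, which
 * allocate an IOVA, set up the page-table mapping and flush the IOTLB.
 */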