TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM), so the kernel might be unable to
allocate a contiguous chunk of physical memory to store the TCE table.
To address this, the POWER8 CPU (actually, IODA2) supports multi-level
TCE tables of up to 5 levels, which split the table into a tree of
smaller subtables.
This adds multi-level TCE table support to the
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages()
helpers.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
struct iommu_table {
unsigned long it_busno; /* Bus number this table belongs to */
unsigned long it_size; /* Size of iommu table in entries */
struct iommu_table {
unsigned long it_busno; /* Bus number this table belongs to */
unsigned long it_size; /* Size of iommu table in entries */
+ unsigned long it_indirect_levels;
+ unsigned long it_level_size;
unsigned long it_offset; /* Offset into global table */
unsigned long it_base; /* mapped address of tce table */
unsigned long it_index; /* which iommu table this is */
unsigned long it_offset; /* Offset into global table */
unsigned long it_base; /* mapped address of tce table */
unsigned long it_index; /* which iommu table this is */
/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
+#define POWERNV_IOMMU_DEFAULT_LEVELS 1
+#define POWERNV_IOMMU_MAX_LEVELS 5
+
static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
table_group);
struct pnv_phb *phb = pe->phb;
int64_t rc;
table_group);
struct pnv_phb *phb = pe->phb;
int64_t rc;
+ const unsigned long size = tbl->it_indirect_levels ?
+ tbl->it_level_size : tbl->it_size;
const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
const __u64 win_size = tbl->it_size << tbl->it_page_shift;
const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
const __u64 win_size = tbl->it_size << tbl->it_page_shift;
rc = opal_pci_map_pe_dma_window(phb->opal_id,
pe->pe_number,
pe->pe_number << 1,
rc = opal_pci_map_pe_dma_window(phb->opal_id,
pe->pe_number,
pe->pe_number << 1,
+ tbl->it_indirect_levels + 1,
IOMMU_PAGE_SIZE(tbl));
if (rc) {
pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
IOMMU_PAGE_SIZE(tbl));
if (rc) {
pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
}
phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
}
-static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
+ unsigned levels, unsigned long limit,
+ unsigned long *current_offset)
{
struct page *tce_mem = NULL;
{
struct page *tce_mem = NULL;
unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+ unsigned long allocated = 1UL << (order + PAGE_SHIFT);
+ unsigned entries = 1UL << (shift - 3);
+ long i;
tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
if (!tce_mem) {
tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
if (!tce_mem) {
return NULL;
}
addr = page_address(tce_mem);
return NULL;
}
addr = page_address(tce_mem);
- memset(addr, 0, 1UL << (order + PAGE_SHIFT));
+ memset(addr, 0, allocated);
+
+ --levels;
+ if (!levels) {
+ *current_offset += allocated;
+ return addr;
+ }
+
+ for (i = 0; i < entries; ++i) {
+ tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
+ levels, limit, current_offset);
+ if (!tmp)
+ break;
+
+ addr[i] = cpu_to_be64(__pa(tmp) |
+ TCE_PCI_READ | TCE_PCI_WRITE);
+
+ if (*current_offset >= limit)
+ break;
+ }
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+ unsigned long size, unsigned level);
+
static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
- __u32 page_shift, __u64 window_size, struct iommu_table *tbl)
+ __u32 page_shift, __u64 window_size, __u32 levels,
+ struct iommu_table *tbl)
+ unsigned long offset = 0, level_shift;
const unsigned window_shift = ilog2(window_size);
unsigned entries_shift = window_shift - page_shift;
unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
const unsigned long tce_table_size = 1UL << table_shift;
const unsigned window_shift = ilog2(window_size);
unsigned entries_shift = window_shift - page_shift;
unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
const unsigned long tce_table_size = 1UL << table_shift;
+ if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
+ return -EINVAL;
+
if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
return -EINVAL;
if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
return -EINVAL;
+ /* Adjust direct table size from window_size and levels */
+ entries_shift = (entries_shift + levels - 1) / levels;
+ level_shift = entries_shift + 3;
+ level_shift = max_t(unsigned, level_shift, PAGE_SHIFT);
+
- addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
+ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
+ levels, tce_table_size, &offset);
+
+ /* addr==NULL means that the first level allocation failed */
if (!addr)
return -ENOMEM;
if (!addr)
return -ENOMEM;
+ /*
+ * First level was allocated but some lower level failed as
+ * we did not allocate as much as we wanted,
+ * release partially allocated table.
+ */
+ if (offset < tce_table_size) {
+ pnv_pci_ioda2_table_do_free_pages(addr,
+ 1ULL << (level_shift - 3), levels - 1);
+ return -ENOMEM;
+ }
+
/* Setup linux iommu table */
pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
page_shift);
/* Setup linux iommu table */
pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
page_shift);
+ tbl->it_level_size = 1ULL << (level_shift - 3);
+ tbl->it_indirect_levels = levels - 1;
pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
window_size, tce_table_size, bus_offset);
pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
window_size, tce_table_size, bus_offset);
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+ unsigned long size, unsigned level)
+{ /* Free one TCE table of @size entries at @addr; when @level is non-zero, first free every subtable it points to. */
+ const unsigned long addr_ul = (unsigned long) addr &
+ ~(TCE_PCI_READ | TCE_PCI_WRITE); /* drop permission bits that recursive callers leave in the pointer */
+
+ if (level) { /* non-zero level: entries are pointers to lower-level tables */
+ long i;
+ u64 *tmp = (u64 *) addr_ul;
+
+ for (i = 0; i < size; ++i) {
+ unsigned long hpa = be64_to_cpu(tmp[i]); /* entries are stored big-endian */
+
+ if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE))) /* neither bit set: slot was never populated */
+ continue;
+
+ pnv_pci_ioda2_table_do_free_pages(__va(hpa), size, /* hpa still has READ/WRITE bits; callee masks them via addr_ul */
+ level - 1);
+ }
+ }
+
+ free_pages(addr_ul, get_order(size << 3)); /* size entries * 8 bytes per entry */
+}
+
static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
{
static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
{
+ const unsigned long size = tbl->it_indirect_levels ?
+ tbl->it_level_size : tbl->it_size;
+
if (!tbl->it_size)
return;
if (!tbl->it_size)
return;
- free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+ pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
+ tbl->it_indirect_levels);
}
static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
}
static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
/* Setup linux iommu table */
rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
/* Setup linux iommu table */
rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
- 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl);
+ 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base,
+ POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
if (rc) {
pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
goto fail;
if (rc) {
pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
goto fail;
static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
{
__be64 *tmp = ((__be64 *)tbl->it_base);
static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
{
__be64 *tmp = ((__be64 *)tbl->it_base);
+ int level = tbl->it_indirect_levels;
+ const long shift = ilog2(tbl->it_level_size);
+ unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
+
+ while (level) {
+ int n = (idx & mask) >> (level * shift);
+ unsigned long tce = be64_to_cpu(tmp[n]);
+
+ tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
+ idx &= ~mask;
+ mask >>= shift;
+ --level;
+ }