*
* This is percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of num_possible_cpus() units and the first chunk
- * is used for static percpu variables in the kernel image (special
- * boot time alloc/init handling necessary as these areas need to be
- * brought up before allocation services are running). Unit grows as
- * necessary and all units grow or shrink in unison. When a chunk is
- * filled up, another chunk is allocated. ie. in vmalloc area
+ * chunk is consisted of boot-time determined number of units and the
+ * first chunk is used for static percpu variables in the kernel image
+ * (special boot time alloc/init handling necessary as these areas
+ * need to be brought up before allocation services are running).
+ * Unit grows as necessary and all units grow or shrink in unison.
+ * When a chunk is filled up, another chunk is allocated. ie. in
+ * vmalloc area
*
* c0 c1 c2
* ------------------- ------------------- ------------
*
* Allocation is done in offset-size areas of single unit space. Ie,
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
- * percpu base registers pcpu_unit_size apart.
+ * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
+ * cpus. On NUMA, the mapping can be non-linear and even sparse.
+ * Percpu access can be done by configuring percpu base registers
+ * according to cpu to unit mapping and pcpu_unit_size.
*
- * There are usually many small percpu allocations many of them as
- * small as 4 bytes. The allocator organizes chunks into lists
+ * There are usually many small percpu allocations many of them being
+ * as small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
* guaranteed to be eqaul to or larger than the maximum contiguous
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/list.h>
+#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
bool immutable; /* no [de]population allowed */
- struct page **page; /* points to page array */
- struct page *page_ar[]; /* #cpus * UNIT_PAGES */
+ unsigned long populated[]; /* populated bitmap */
};
static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
+static int pcpu_nr_units __read_mostly;
static int pcpu_chunk_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
+/* cpus with the lowest and highest unit numbers */
+static unsigned int pcpu_first_unit_cpu __read_mostly;
+static unsigned int pcpu_last_unit_cpu __read_mostly;
+
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
+/* cpu -> unit map */
+const int *pcpu_unit_map __read_mostly;
+
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
* Synchronization rules.
*
* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
- * protects allocation/reclaim paths, chunks and chunk->page arrays.
- * The latter is a spinlock and protects the index data structures -
- * chunk slots, chunks and area maps in chunks.
+ * protects allocation/reclaim paths, chunks, populated bitmap and
+ * vmalloc mapping. The latter is a spinlock and protects the index
+ * data structures - chunk slots, chunks and area maps in chunks.
*
* During allocation, pcpu_alloc_mutex is kept locked all the time and
* pcpu_lock is grabbed and released as necessary. All actual memory
static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
- return cpu * pcpu_unit_pages + page_idx;
-}
-
-static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
- unsigned int cpu, int page_idx)
-{
- return &chunk->page[pcpu_page_idx(cpu, page_idx)];
+ return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
}
-static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
- int page_idx)
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
{
- return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+ /* must not be used on pre-mapped chunk */
+ WARN_ON(chunk->immutable);
+
+ return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}
/* set the pointer to a chunk in a page struct */
return (struct pcpu_chunk *)page->index;
}
+static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+ *rs = find_next_zero_bit(chunk->populated, end, *rs);
+ *re = find_next_bit(chunk->populated, end, *rs + 1);
+}
+
+static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+ *rs = find_next_bit(chunk->populated, end, *rs);
+ *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+}
+
+/*
+ * (Un)populated page region iterators. Iterate over (un)populated
+ * page regions betwen @start and @end in @chunk. @rs and @re should
+ * be integer variables and will be set to start and end page index of
+ * the current region.
+ */
+#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+
/**
* pcpu_mem_alloc - allocate memory
* @size: bytes to allocate
return pcpu_first_chunk;
}
+ /*
+ * The address is relative to unit0 which might be unused and
+ * thus unmapped. Offset the address to the unit space of the
+ * current processor before looking it up in the vmalloc
+ * space. Note that any possible cpu id can be used here, so
+ * there's no need to worry about preemption or cpu hotplug.
+ */
+ addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
return pcpu_get_page_chunk(vmalloc_to_page(addr));
}
}
/**
- * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
* @chunk: chunk of interest
- * @page_start: page index of the first page to unmap
- * @page_end: page index of the last page to unmap + 1
- * @flush_tlb: whether to flush tlb or not
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
*
- * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * If @flush is true, vcache is flushed before unmapping and tlb
- * after.
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx(). The returned
+ * array is cleared to zero and *@bitmapp is copied from
+ * @chunk->populated. Note that there is only one array and bitmap
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure.
*/
-static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
- bool flush_tlb)
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+ unsigned long **bitmapp,
+ bool may_alloc)
{
- unsigned int last = num_possible_cpus() - 1;
- unsigned int cpu;
+ static struct page **pages;
+ static unsigned long *bitmap;
+ size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+ size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+ sizeof(unsigned long);
+
+ if (!pages || !bitmap) {
+ if (may_alloc && !pages)
+ pages = pcpu_mem_alloc(pages_size);
+ if (may_alloc && !bitmap)
+ bitmap = pcpu_mem_alloc(bitmap_size);
+ if (!pages || !bitmap)
+ return NULL;
+ }
- /* unmap must not be done on immutable chunk */
- WARN_ON(chunk->immutable);
+ memset(pages, 0, pages_size);
+ bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
- /*
- * Each flushing trial can be very expensive, issue flush on
- * the whole region at once rather than doing it for each cpu.
- * This could be an overkill but is more scalable.
- */
- flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
+ *bitmapp = bitmap;
+ return pages;
+}
- for_each_possible_cpu(cpu)
- unmap_kernel_range_noflush(
- pcpu_chunk_addr(chunk, cpu, page_start),
- (page_end - page_start) << PAGE_SHIFT);
-
- /* ditto as flush_cache_vunmap() */
- if (flush_tlb)
- flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start and @page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+ if (page)
+ __free_page(page);
+ }
+ }
}
/**
- * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
- * @chunk: chunk to depopulate
- * @off: offset to the area to depopulate
- * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
- *
- * For each cpu, depopulate and unmap pages [@page_start,@page_end)
- * from @chunk. If @flush is true, vcache is flushed before unmapping
- * and tlb after.
- *
- * CONTEXT:
- * pcpu_alloc_mutex.
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk. Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
*/
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
- bool flush)
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
- int unmap_start = -1;
- int uninitialized_var(unmap_end);
+ const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
unsigned int cpu;
int i;
- for (i = page_start; i < page_end; i++) {
- for_each_possible_cpu(cpu) {
- struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+ *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+ if (!*pagep) {
+ pcpu_free_pages(chunk, pages, populated,
+ page_start, page_end);
+ return -ENOMEM;
+ }
+ }
+ }
+ return 0;
+}
- if (!*pagep)
- continue;
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped. Flush cache. As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu. This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vunmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
- __free_page(*pagep);
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+ unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
- /*
- * If it's partial depopulation, it might get
- * populated or depopulated again. Mark the
- * page gone.
- */
- *pagep = NULL;
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished. The caller should call
+ * proper pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
- unmap_start = unmap_start < 0 ? i : unmap_start;
- unmap_end = i + 1;
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page;
+
+ page = pcpu_chunk_page(chunk, cpu, i);
+ WARN_ON(!page);
+ pages[pcpu_page_idx(cpu, i)] = page;
}
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ page_end - page_start);
}
- if (unmap_start >= 0)
- pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+ for (i = page_start; i < page_end; i++)
+ __clear_bit(i, populated);
+}
+
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
+ * TLB for the regions. This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_tlb_kernel_range(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
}
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
}
/**
- * pcpu_map - map pages into a pcpu_chunk
+ * pcpu_map_pages - map pages into a pcpu_chunk
* @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
- * For each cpu, map pages [@page_start,@page_end) into @chunk.
- * vcache is flushed afterwards.
+ * For each cpu, map pages [@page_start,@page_end) into @chunk. The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * @chunk->populated bitmap and whatever is necessary for reverse
+ * lookup (addr -> chunk).
*/
-static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
{
- unsigned int last = num_possible_cpus() - 1;
- unsigned int cpu;
- int err;
-
- /* map must not be done on immutable chunk */
- WARN_ON(chunk->immutable);
+ unsigned int cpu, tcpu;
+ int i, err;
for_each_possible_cpu(cpu) {
err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
- pcpu_chunk_pagep(chunk, cpu, page_start),
+ &pages[pcpu_page_idx(cpu, page_start)],
page_end - page_start);
if (err < 0)
- return err;
+ goto err;
+ }
+
+ /* mapping successful, link chunk and mark populated */
+ for (i = page_start; i < page_end; i++) {
+ for_each_possible_cpu(cpu)
+ pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+ chunk);
+ __set_bit(i, populated);
}
- /* flush at once, please read comments in pcpu_unmap() */
- flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
return 0;
+
+err:
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+ page_end - page_start);
+ }
+ return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk. If @flush is true, vcache is flushed before unmapping
+ * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ struct page **pages;
+ unsigned long *populated;
+ int rs, re;
+
+ /* quick path, check whether it's empty already */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ if (rs == page_start && re == page_end)
+ return;
+ break;
+ }
+
+ /* immutable chunks can't be depopulated */
+ WARN_ON(chunk->immutable);
+
+ /*
+ * If control reaches here, there must have been at least one
+ * successful population attempt so the temp pages array must
+ * be available now.
+ */
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+ BUG_ON(!pages);
+
+ /* unmap and free */
+ pcpu_pre_unmap_flush(chunk, page_start, page_end);
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+
+ /* no need to flush tlb, vmalloc will handle it lazily */
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
}
/**
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
- const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
- int map_start = -1;
- int uninitialized_var(map_end);
+ int free_end = page_start, unmap_end = page_start;
+ struct page **pages;
+ unsigned long *populated;
unsigned int cpu;
- int i;
+ int rs, re, rc;
- for (i = page_start; i < page_end; i++) {
- if (pcpu_chunk_page_occupied(chunk, i)) {
- if (map_start >= 0) {
- if (pcpu_map(chunk, map_start, map_end))
- goto err;
- map_start = -1;
- }
- continue;
- }
+ /* quick path, check whether all pages are already there */
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
+ if (rs == page_start && re == page_end)
+ goto clear;
+ break;
+ }
- map_start = map_start < 0 ? i : map_start;
- map_end = i + 1;
+ /* need to allocate and map pages, this chunk can't be immutable */
+ WARN_ON(chunk->immutable);
- for_each_possible_cpu(cpu) {
- struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+ if (!pages)
+ return -ENOMEM;
- *pagep = alloc_pages_node(cpu_to_node(cpu),
- alloc_mask, 0);
- if (!*pagep)
- goto err;
- pcpu_set_page_chunk(*pagep, chunk);
- }
+ /* alloc and map */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_free;
+ free_end = re;
}
- if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
- goto err;
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_unmap;
+ unmap_end = re;
+ }
+ pcpu_post_map_flush(chunk, page_start, page_end);
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
for_each_possible_cpu(cpu)
- memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
- size);
-
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
return 0;
-err:
- /* likely under heavy memory pressure, give memory back */
- pcpu_depopulate_chunk(chunk, off, size, true);
- return -ENOMEM;
+
+err_unmap:
+ pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+ pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free:
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+ return rc;
}
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
- chunk->page = chunk->page_ar;
- chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+ chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
mutex_unlock(&pcpu_alloc_mutex);
+ /* return address relative to unit0 */
return __addr_to_pcpu_ptr(chunk->vm->addr + off);
fail_unlock:
}
spin_unlock_irq(&pcpu_lock);
- mutex_unlock(&pcpu_alloc_mutex);
list_for_each_entry_safe(chunk, next, &todo, list) {
- pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+ pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk);
}
+
+ mutex_unlock(&pcpu_alloc_mutex);
}
/**
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
- * @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes, 0 for none
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
* @base_addr: mapped address
+ * @unit_map: cpu -> unit map, NULL for sequential mapping
*
* Initialize the first percpu chunk which contains the kernel static
* perpcu area. This function is to be called from arch percpu area
* setup path.
*
- * @get_page_fn() should return pointer to percpu page given cpu
- * number and page number. It should at least return enough pages to
- * cover the static area. The returned pages for static area should
- * have been initialized with valid data. It can also return pages
- * after the static area. NULL return indicates end of pages for the
- * cpu. Note that @get_page_fn() must return the same number of pages
- * for all cpus.
- *
* @reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
-size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
- size_t static_size, size_t reserved_size,
+size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
ssize_t dyn_size, size_t unit_size,
- void *base_addr)
+ void *base_addr, const int *unit_map)
{
static struct vm_struct first_vm;
static int smap[2], dmap[2];
size_t size_sum = static_size + reserved_size +
(dyn_size >= 0 ? dyn_size : 0);
struct pcpu_chunk *schunk, *dchunk = NULL;
- unsigned int cpu;
- int i, nr_pages;
+ unsigned int cpu, tcpu;
+ int i;
- /* santiy checks */
+ /* sanity checks */
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
BUG_ON(unit_size & ~PAGE_MASK);
BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
+ /* determine number of units and verify and initialize pcpu_unit_map */
+ if (unit_map) {
+ int first_unit = INT_MAX, last_unit = INT_MIN;
+
+ for_each_possible_cpu(cpu) {
+ int unit = unit_map[cpu];
+
+ BUG_ON(unit < 0);
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ /* the mapping should be one-to-one */
+ BUG_ON(unit_map[tcpu] == unit);
+ }
+
+ if (unit < first_unit) {
+ pcpu_first_unit_cpu = cpu;
+ first_unit = unit;
+ }
+ if (unit > last_unit) {
+ pcpu_last_unit_cpu = cpu;
+ last_unit = unit;
+ }
+ }
+ pcpu_nr_units = last_unit + 1;
+ pcpu_unit_map = unit_map;
+ } else {
+ int *identity_map;
+
+ /* #units == #cpus, identity mapped */
+ identity_map = alloc_bootmem(nr_cpu_ids *
+ sizeof(identity_map[0]));
+
+ for_each_possible_cpu(cpu)
+ identity_map[cpu] = cpu;
+
+ pcpu_first_unit_cpu = 0;
+ pcpu_last_unit_cpu = pcpu_nr_units - 1;
+ pcpu_nr_units = nr_cpu_ids;
+ pcpu_unit_map = identity_map;
+ }
+
+ /* determine basic parameters */
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
- pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
- pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
- + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+ pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
+ pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+ BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
if (dyn_size < 0)
dyn_size = pcpu_unit_size - static_size - reserved_size;
schunk->vm = &first_vm;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
- schunk->page = schunk->page_ar;
schunk->immutable = true;
+ bitmap_fill(schunk->populated, pcpu_unit_pages);
if (reserved_size) {
schunk->free_size = reserved_size;
/* init dynamic chunk if necessary */
if (dyn_size) {
- dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+ dchunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&dchunk->list);
dchunk->vm = &first_vm;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
- dchunk->page = schunk->page_ar; /* share page map with schunk */
dchunk->immutable = true;
+ bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
- /* assign pages */
- nr_pages = -1;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < pcpu_unit_pages; i++) {
- struct page *page = get_page_fn(cpu, i);
-
- if (!page)
- break;
- *pcpu_chunk_pagep(schunk, cpu, i) = page;
- }
-
- BUG_ON(i < PFN_UP(static_size));
-
- if (nr_pages < 0)
- nr_pages = i;
- else
- BUG_ON(nr_pages != i);
- }
-
/* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* we're done */
- pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
+ pcpu_base_addr = schunk->vm->addr;
return pcpu_unit_size;
}
-static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
- ssize_t *dyn_sizep)
+static inline size_t pcpu_calc_fc_sizes(size_t static_size,
+ size_t reserved_size,
+ ssize_t *dyn_sizep)
{
size_t size_sum;
return size_sum;
}
-/*
- * Embedding first chunk setup helper.
- */
-static void *pcpue_ptr __initdata;
-static size_t pcpue_size __initdata;
-static size_t pcpue_unit_size __initdata;
-
-static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
-{
- size_t off = (size_t)pageno << PAGE_SHIFT;
-
- if (off >= pcpue_size)
- return NULL;
-
- return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
-}
-
+#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
+ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @static_size: the size of static percpu area in bytes
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
ssize_t dyn_size)
{
- size_t chunk_size;
+ size_t size_sum, unit_size, chunk_size;
+ void *base;
unsigned int cpu;
/* determine parameters and allocate */
- pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
- pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
- chunk_size = pcpue_unit_size * num_possible_cpus();
+ unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+ chunk_size = unit_size * nr_cpu_ids;
- pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS));
- if (!pcpue_ptr) {
+ base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS));
+ if (!base) {
pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size);
return -ENOMEM;
}
/* return the leftover and copy */
- for_each_possible_cpu(cpu) {
- void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
-
- free_bootmem(__pa(ptr + pcpue_size),
- pcpue_unit_size - pcpue_size);
- memcpy(ptr, __per_cpu_load, static_size);
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ void *ptr = base + cpu * unit_size;
+
+ if (cpu_possible(cpu)) {
+ free_bootmem(__pa(ptr + size_sum),
+ unit_size - size_sum);
+ memcpy(ptr, __per_cpu_load, static_size);
+ } else
+ free_bootmem(__pa(ptr), unit_size);
}
/* we're ready, commit */
- pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
- pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+ pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
+ PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size,
+ unit_size);
- return pcpu_setup_first_chunk(pcpue_get_page, static_size,
- reserved_size, dyn_size,
- pcpue_unit_size, pcpue_ptr);
-}
-
-/*
- * 4k page first chunk setup helper.
- */
-static struct page **pcpu4k_pages __initdata;
-static int pcpu4k_unit_pages __initdata;
-
-static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
-{
- if (pageno < pcpu4k_unit_pages)
- return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];
- return NULL;
+ return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+ unit_size, base, NULL);
}
+#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
+ !CONFIG_HAVE_SETUP_PER_CPU_AREA */
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/**
- * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
+ * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
* @free_fn: funtion to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
*
- * This is a helper to ease setting up embedded first percpu chunk and
- * can be called where pcpu_setup_first_chunk() is expected.
+ * This is a helper to ease setting up page-remapped first percpu
+ * chunk and can be called where pcpu_setup_first_chunk() is expected.
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page into vmalloc area.
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
-ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
- pcpu_fc_alloc_fn_t alloc_fn,
- pcpu_fc_free_fn_t free_fn,
- pcpu_fc_populate_pte_fn_t populate_pte_fn)
+ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn,
+ pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
static struct vm_struct vm;
+ char psize_str[16];
+ int unit_pages;
size_t pages_size;
+ struct page **pages;
unsigned int cpu;
int i, j;
ssize_t ret;
- pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
- PCPU_MIN_UNIT_SIZE));
+ snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
+
+ unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
+ PCPU_MIN_UNIT_SIZE));
/* unaligned allocations can't be freed, round up to page size */
- pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() *
- sizeof(pcpu4k_pages[0]));
- pcpu4k_pages = alloc_bootmem(pages_size);
+ pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
+ pages = alloc_bootmem(pages_size);
/* allocate pages */
j = 0;
for_each_possible_cpu(cpu)
- for (i = 0; i < pcpu4k_unit_pages; i++) {
+ for (i = 0; i < unit_pages; i++) {
void *ptr;
ptr = alloc_fn(cpu, PAGE_SIZE);
if (!ptr) {
- pr_warning("PERCPU: failed to allocate "
- "4k page for cpu%u\n", cpu);
+ pr_warning("PERCPU: failed to allocate %s page "
+ "for cpu%u\n", psize_str, cpu);
goto enomem;
}
- pcpu4k_pages[j++] = virt_to_page(ptr);
+ pages[j++] = virt_to_page(ptr);
}
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
- vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT;
+ vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
vm_area_register_early(&vm, PAGE_SIZE);
for_each_possible_cpu(cpu) {
unsigned long unit_addr = (unsigned long)vm.addr +
- (cpu * pcpu4k_unit_pages << PAGE_SHIFT);
+ (cpu * unit_pages << PAGE_SHIFT);
- for (i = 0; i < pcpu4k_unit_pages; i++)
+ for (i = 0; i < unit_pages; i++)
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
- ret = __pcpu_map_pages(unit_addr,
- &pcpu4k_pages[cpu * pcpu4k_unit_pages],
- pcpu4k_unit_pages);
+ ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
+ unit_pages);
if (ret < 0)
panic("failed to map percpu area, err=%zd\n", ret);
}
/* we're ready, commit */
- pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
- pcpu4k_unit_pages, static_size);
+ pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n",
+ unit_pages, psize_str, vm.addr, static_size, reserved_size);
- ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
- reserved_size, -1,
- pcpu4k_unit_pages << PAGE_SHIFT, vm.addr);
+ ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
+ unit_pages << PAGE_SHIFT, vm.addr, NULL);
goto out_free_ar;
enomem:
while (--j >= 0)
- free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE);
+ free_fn(page_address(pages[j]), PAGE_SIZE);
ret = -ENOMEM;
out_free_ar:
- free_bootmem(__pa(pcpu4k_pages), pages_size);
+ free_bootmem(__pa(pages), pages_size);
return ret;
}
+#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
-/*
- * Large page remapping first chunk setup helper
+#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
+/**
+ * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
+ * @unit_sizep: out parameter for unit size
+ * @unit_map: unit_map to be filled
+ * @cpu_distance_fn: callback to determine distance between cpus
+ *
+ * This function builds cpu -> unit map and determine other parameters
+ * considering needed percpu size, large page size and distances
+ * between CPUs in NUMA.
+ *
+ * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
+ * may share units in the same large page. The returned configuration
+ * is guaranteed to have CPUs on different nodes on different large
+ * pages and >=75% usage of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
+ * returns the number of units to be allocated. -errno on failure.
*/
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
+ ssize_t *dyn_sizep, size_t *unit_sizep,
+ size_t lpage_size, int *unit_map,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+ static int group_map[NR_CPUS] __initdata;
+ static int group_cnt[NR_CPUS] __initdata;
+ int group_cnt_max = 0;
+ size_t size_sum, min_unit_size, alloc_size;
+ int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int last_allocs;
+ unsigned int cpu, tcpu;
+ int group, unit;
+
+ /*
+ * Determine min_unit_size, alloc_size and max_upa such that
+ * alloc_size is multiple of lpage_size and is the smallest
+ * which can accomodate 4k aligned segments which are equal to
+ * or larger than min_unit_size.
+ */
+ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+ alloc_size = roundup(min_unit_size, lpage_size);
+ upa = alloc_size / min_unit_size;
+ while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ upa--;
+ max_upa = upa;
+
+ /* group cpus according to their proximity */
+ for_each_possible_cpu(cpu) {
+ group = 0;
+ next_group:
+ for_each_possible_cpu(tcpu) {
+ if (cpu == tcpu)
+ break;
+ if (group_map[tcpu] == group &&
+ (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+ cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+ group++;
+ goto next_group;
+ }
+ }
+ group_map[cpu] = group;
+ group_cnt[group]++;
+ group_cnt_max = max(group_cnt_max, group_cnt[group]);
+ }
+
+ /*
+ * Expand unit size until address space usage goes over 75%
+ * and then as much as possible without using more address
+ * space.
+ */
+ last_allocs = INT_MAX;
+ for (upa = max_upa; upa; upa--) {
+ int allocs = 0, wasted = 0;
+
+ if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ continue;
+
+ for (group = 0; group_cnt[group]; group++) {
+ int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+ allocs += this_allocs;
+ wasted += this_allocs * upa - group_cnt[group];
+ }
+
+ /*
+ * Don't accept if wastage is over 25%. The
+ * greater-than comparison ensures upa==1 always
+ * passes the following check.
+ */
+ if (wasted > num_possible_cpus() / 3)
+ continue;
+
+ /* and then don't consume more memory */
+ if (allocs > last_allocs)
+ break;
+ last_allocs = allocs;
+ best_upa = upa;
+ }
+ *unit_sizep = alloc_size / best_upa;
+
+ /* assign units to cpus accordingly */
+ unit = 0;
+ for (group = 0; group_cnt[group]; group++) {
+ for_each_possible_cpu(cpu)
+ if (group_map[cpu] == group)
+ unit_map[cpu] = unit++;
+ unit = roundup(unit, best_upa);
+ }
+
+ return unit; /* unit contains aligned number of units */
+}
+
struct pcpul_ent {
- unsigned int cpu;
void *ptr;
+ void *map_addr;
};
static size_t pcpul_size;
-static size_t pcpul_unit_size;
+static size_t pcpul_lpage_size;
+static int pcpul_nr_lpages;
static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+ unsigned int *cpup)
{
- size_t off = (size_t)pageno << PAGE_SHIFT;
+ unsigned int cpu;
- if (off >= pcpul_size)
- return NULL;
+ for_each_possible_cpu(cpu)
+ if (unit_map[cpu] == unit) {
+ if (cpup)
+ *cpup = cpu;
+ return true;
+ }
+
+ return false;
+}
- return virt_to_page(pcpul_map[cpu].ptr + off);
+static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
+ size_t reserved_size, size_t dyn_size,
+ size_t unit_size, size_t lpage_size,
+ const int *unit_map, int nr_units)
+{
+ int width = 1, v = nr_units;
+ char empty_str[] = "--------";
+ int upl, lpl; /* units per lpage, lpage per line */
+ unsigned int cpu;
+ int lpage, unit;
+
+ while (v /= 10)
+ width++;
+ empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+
+ upl = max_t(int, lpage_size / unit_size, 1);
+ lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+
+ printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
+ static_size, reserved_size, dyn_size, unit_size, lpage_size);
+
+ for (lpage = 0, unit = 0; unit < nr_units; unit++) {
+ if (!(unit % upl)) {
+ if (!(lpage++ % lpl)) {
+ printk("\n");
+ printk("%spcpu-lpage: ", lvl);
+ } else
+ printk("| ");
+ }
+ if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+ printk("%0*d ", width, cpu);
+ else
+ printk("%s ", empty_str);
+ }
+ printk("\n");
}
/**
* pcpu_lpage_first_chunk - remap the first percpu chunk using large page
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: free size for dynamic allocation in bytes
+ * @unit_size: unit size in bytes
* @lpage_size: the size of a large page
+ * @unit_map: cpu -> unit mapping
+ * @nr_units: the number of units
* @alloc_fn: function to allocate percpu lpage, always called with lpage_size
* @free_fn: function to free percpu memory, @size <= lpage_size
* @map_fn: function to map percpu lpage, always called with lpage_size
*
- * This allocator uses large page as unit. A large page is allocated
- * for each cpu and each is remapped into vmalloc area using large
- * page mapping. As large page can be quite large, only part of it is
- * used for the first chunk. Unused part is returned to the bootmem
- * allocator.
- *
- * So, the large pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk. The double
- * mapping does add one more large TLB entry pressure but still is
- * much better than only using 4k mappings while still being NUMA
- * friendly.
+ * This allocator uses large page to build and map the first chunk.
+ * Unlike other helpers, the caller should always specify @dyn_size
+ * and @unit_size. These parameters along with @unit_map and
+ * @nr_units can be determined using pcpu_lpage_build_unit_map().
+ * This two stage initialization is to allow arch code to evaluate the
+ * parameters before committing to it.
+ *
+ * Large pages are allocated as directed by @unit_map and other
+ * parameters and mapped to vmalloc space. Unused holes are returned
+ * to the page allocator. Note that these holes end up being actively
+ * mapped twice - once to the physical mapping and to the vmalloc area
+ * for the first percpu chunk. Depending on architecture, this might
+ * cause problem when changing page attributes of the returned area.
+ * These double mapped areas can be detected using
+ * pcpu_lpage_remapped().
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
- ssize_t dyn_size, size_t lpage_size,
+ size_t dyn_size, size_t unit_size,
+ size_t lpage_size, const int *unit_map,
+ int nr_units,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_map_fn_t map_fn)
{
- size_t size_sum;
+ static struct vm_struct vm;
+ size_t chunk_size = unit_size * nr_units;
size_t map_size;
unsigned int cpu;
- int i, j;
ssize_t ret;
+ int i, j, unit;
- /*
- * Currently supports only single page. Supporting multiple
- * pages won't be too difficult if it ever becomes necessary.
- */
- size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+ pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
+ unit_size, lpage_size, unit_map, nr_units);
- pcpul_unit_size = lpage_size;
- pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
- if (pcpul_size > pcpul_unit_size) {
- pr_warning("PERCPU: static data is larger than large page, "
- "can't use large page\n");
- return -EINVAL;
- }
+ BUG_ON(chunk_size % lpage_size);
+
+ pcpul_size = static_size + reserved_size + dyn_size;
+ pcpul_lpage_size = lpage_size;
+ pcpul_nr_lpages = chunk_size / lpage_size;
/* allocate pointer array and alloc large pages */
- map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
+ map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
pcpul_map = alloc_bootmem(map_size);
- for_each_possible_cpu(cpu) {
+ /* allocate all pages */
+ for (i = 0; i < pcpul_nr_lpages; i++) {
+ size_t offset = i * lpage_size;
+ int first_unit = offset / unit_size;
+ int last_unit = (offset + lpage_size - 1) / unit_size;
void *ptr;
+ /* find out which cpu is mapped to this unit */
+ for (unit = first_unit; unit <= last_unit; unit++)
+ if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+ goto found;
+ continue;
+ found:
ptr = alloc_fn(cpu, lpage_size);
if (!ptr) {
pr_warning("PERCPU: failed to allocate large page "
goto enomem;
}
- /*
- * Only use pcpul_size bytes and give back the rest.
- *
- * Ingo: The lpage_size up-rounding bootmem is needed
- * to make sure the partial lpage is still fully RAM -
- * it's not well-specified to have a incompatible area
- * (unmapped RAM, device memory, etc.) in that hole.
- */
- free_fn(ptr + pcpul_size, lpage_size - pcpul_size);
-
- pcpul_map[cpu].cpu = cpu;
- pcpul_map[cpu].ptr = ptr;
+ pcpul_map[i].ptr = ptr;
+ }
- memcpy(ptr, __per_cpu_load, static_size);
+ /* return unused holes */
+ for (unit = 0; unit < nr_units; unit++) {
+ size_t start = unit * unit_size;
+ size_t end = start + unit_size;
+ size_t off, next;
+
+ /* don't free used part of occupied unit */
+ if (pcpul_unit_to_cpu(unit, unit_map, NULL))
+ start += pcpul_size;
+
+ /* unit can span more than one page, punch the holes */
+ for (off = start; off < end; off = next) {
+ void *ptr = pcpul_map[off / lpage_size].ptr;
+ next = min(roundup(off + 1, lpage_size), end);
+ if (ptr)
+ free_fn(ptr + off % lpage_size, next - off);
+ }
}
- /* allocate address and map */
- pcpul_vm.flags = VM_ALLOC;
- pcpul_vm.size = num_possible_cpus() * pcpul_unit_size;
- vm_area_register_early(&pcpul_vm, pcpul_unit_size);
+ /* allocate address, map and copy */
+ vm.flags = VM_ALLOC;
+ vm.size = chunk_size;
+ vm_area_register_early(&vm, unit_size);
+
+ for (i = 0; i < pcpul_nr_lpages; i++) {
+ if (!pcpul_map[i].ptr)
+ continue;
+ pcpul_map[i].map_addr = vm.addr + i * lpage_size;
+ map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
+ }
for_each_possible_cpu(cpu)
- map_fn(pcpul_map[cpu].ptr, pcpul_unit_size,
- pcpul_vm.addr + cpu * pcpul_unit_size);
+ memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
+ static_size);
/* we're ready, commit */
- pr_info("PERCPU: Remapped at %p with large pages, static data "
- "%zu bytes\n", pcpul_vm.addr, static_size);
-
- ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
- reserved_size, dyn_size, pcpul_unit_size,
- pcpul_vm.addr);
-
- /* sort pcpul_map array for pcpu_lpage_remapped() */
- for (i = 0; i < num_possible_cpus() - 1; i++)
- for (j = i + 1; j < num_possible_cpus(); j++)
- if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
- struct pcpul_ent tmp = pcpul_map[i];
- pcpul_map[i] = pcpul_map[j];
- pcpul_map[j] = tmp;
- }
+ pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n",
+ vm.addr, static_size, reserved_size, dyn_size, unit_size);
+
+ ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+ unit_size, vm.addr, unit_map);
+
+ /*
+ * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped
+ * lpages are pushed to the end and trimmed.
+ */
+ for (i = 0; i < pcpul_nr_lpages - 1; i++)
+ for (j = i + 1; j < pcpul_nr_lpages; j++) {
+ struct pcpul_ent tmp;
+
+ if (!pcpul_map[j].ptr)
+ continue;
+ if (pcpul_map[i].ptr &&
+ pcpul_map[i].ptr < pcpul_map[j].ptr)
+ continue;
+
+ tmp = pcpul_map[i];
+ pcpul_map[i] = pcpul_map[j];
+ pcpul_map[j] = tmp;
+ }
+
+ while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
+ pcpul_nr_lpages--;
return ret;
enomem:
- for_each_possible_cpu(cpu)
- if (pcpul_map[cpu].ptr)
- free_fn(pcpul_map[cpu].ptr, pcpul_size);
+ for (i = 0; i < pcpul_nr_lpages; i++)
+ if (pcpul_map[i].ptr)
+ free_fn(pcpul_map[i].ptr, lpage_size);
free_bootmem(__pa(pcpul_map), map_size);
return -ENOMEM;
}
*/
void *pcpu_lpage_remapped(void *kaddr)
{
- unsigned long unit_mask = pcpul_unit_size - 1;
- void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask);
- unsigned long offset = (unsigned long)kaddr & unit_mask;
- int left = 0, right = num_possible_cpus() - 1;
+ unsigned long lpage_mask = pcpul_lpage_size - 1;
+ void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
+ unsigned long offset = (unsigned long)kaddr & lpage_mask;
+ int left = 0, right = pcpul_nr_lpages - 1;
int pos;
/* pcpul in use at all? */
left = pos + 1;
else if (pcpul_map[pos].ptr > lpage_addr)
right = pos - 1;
- else {
- /* it shouldn't be in the area for the first chunk */
- WARN_ON(offset < pcpul_size);
-
- return pcpul_vm.addr +
- pcpul_map[pos].cpu * pcpul_unit_size + offset;
- }
+ else
+ return pcpul_map[pos].map_addr + offset;
}
return NULL;
}
-#endif
+#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */
/*
* Generic percpu area setup.