memory-hotplug: common APIs to support page tables hot-remove
author    Wen Congyang <wency@cn.fujitsu.com>
          Wed, 20 Feb 2013 02:14:15 +0000 (13:14 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
          Wed, 20 Feb 2013 05:52:30 +0000 (16:52 +1100)
When memory is removed, the corresponding page tables should also be
removed.  This patch introduces common APIs to support removing vmemmap
page tables and x86_64 direct-mapping page tables.

Not every page of the virtual mapping in the removed memory can be freed,
because a page used as a PGD/PUD may map not only the removed memory but
also other memory.  So the patch uses the following scheme to decide
whether a page-table page can be freed (a minimal sketch of this
bookkeeping follows the list).

 1. When removing memory, the page structs of the removed memory are filled
    with 0xFD.
 2. When all page structs on a PT/PMD page are filled with 0xFD, the PT/PMD
    can be cleared.  In this case, the page used as the PT/PMD can be freed.
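
For illustration only, here is a minimal userspace sketch of that 0xFD
bookkeeping.  PAGE_SIZE, the helper names and main() are invented for the
example; the patch itself performs the real check with memset() and
memchr_inv() in remove_pte_table().

/* Userspace sketch (not kernel code): mark parts of a page as unused with
 * 0xFD and treat the page as freeable only once every byte carries the
 * marker. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE  4096
#define PAGE_INUSE 0xFD

/* Mark [off, off + len) of the page as no longer used. */
static void mark_range_unused(unsigned char *page, size_t off, size_t len)
{
        memset(page + off, PAGE_INUSE, len);
}

/* The page may be freed only when every byte carries the 0xFD marker
 * (the patch uses memchr_inv() for this check). */
static bool page_fully_unused(const unsigned char *page)
{
        size_t i;

        for (i = 0; i < PAGE_SIZE; i++)
                if (page[i] != PAGE_INUSE)
                        return false;
        return true;
}

int main(void)
{
        static unsigned char page[PAGE_SIZE];

        mark_range_unused(page, 0, PAGE_SIZE / 2);
        printf("half marked : freeable=%d\n", page_fully_unused(page));

        mark_range_unused(page, PAGE_SIZE / 2, PAGE_SIZE / 2);
        printf("fully marked: freeable=%d\n", page_fully_unused(page));
        return 0;
}

In the patch, once a page is entirely marked, the page-table page is handed
back through free_pagetable() and the corresponding entry is cleared under
init_mm.page_table_lock.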

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Wu Jianguo <wujianguo@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
arch/x86/include/asm/pgtable_types.h
arch/x86/mm/init_64.c
arch/x86/mm/pageattr.c
include/linux/bootmem.h

index e6423002c10b5211af8fd45cb1cefa120d45c534..567b5d0632b2e4563781fdfaad508cd1c6c6665a 100644 (file)
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -351,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
  * as a pte too.
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
 extern phys_addr_t slow_virt_to_phys(void *__address);
 
 #endif /* !__ASSEMBLY__ */
index f17aa76dc1ae2d435b3e889522250f6c771bc1b3..dd56c0f7d7bda84dc862c96c7f7d35239c3e3a55 100644 (file)
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -707,6 +707,305 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+       struct zone *zone;
+       bool bootmem = false;
+       unsigned long magic;
+       unsigned int nr_pages = 1 << order;
+
+       /* bootmem page has reserved flag */
+       if (PageReserved(page)) {
+               __ClearPageReserved(page);
+               bootmem = true;
+
+               magic = (unsigned long)page->lru.next;
+               if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+                       while (nr_pages--)
+                               put_page_bootmem(page++);
+               } else
+                       __free_pages_bootmem(page, order);
+       } else
+               free_pages((unsigned long)page_address(page), order);
+
+       /*
+        * SECTION_INFO pages and MIX_SECTION_INFO pages
+        * are all allocated by bootmem.
+        */
+       if (bootmem) {
+               zone = page_zone(page);
+               zone_span_writelock(zone);
+               zone->present_pages += nr_pages;
+               zone_span_writeunlock(zone);
+               totalram_pages += nr_pages;
+       }
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+       pte_t *pte;
+       int i;
+
+       for (i = 0; i < PTRS_PER_PTE; i++) {
+               pte = pte_start + i;
+               if (pte_val(*pte))
+                       return;
+       }
+
+       /* free a pte table */
+       free_pagetable(pmd_page(*pmd), 0);
+       spin_lock(&init_mm.page_table_lock);
+       pmd_clear(pmd);
+       spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+       pmd_t *pmd;
+       int i;
+
+       for (i = 0; i < PTRS_PER_PMD; i++) {
+               pmd = pmd_start + i;
+               if (pmd_val(*pmd))
+                       return;
+       }
+
+       /* free a pmd table */
+       free_pagetable(pud_page(*pud), 0);
+       spin_lock(&init_mm.page_table_lock);
+       pud_clear(pud);
+       spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+       pud_t *pud;
+       int i;
+
+       for (i = 0; i < PTRS_PER_PUD; i++) {
+               pud = pud_start + i;
+               if (pud_val(*pud))
+                       return false;
+       }
+
+       /* free a pud table */
+       free_pagetable(pgd_page(*pgd), 0);
+       spin_lock(&init_mm.page_table_lock);
+       pgd_clear(pgd);
+       spin_unlock(&init_mm.page_table_lock);
+
+       return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+                bool direct)
+{
+       unsigned long next, pages = 0;
+       pte_t *pte;
+       void *page_addr;
+       phys_addr_t phys_addr;
+
+       pte = pte_start + pte_index(addr);
+       for (; addr < end; addr = next, pte++) {
+               next = (addr + PAGE_SIZE) & PAGE_MASK;
+               if (next > end)
+                       next = end;
+
+               if (!pte_present(*pte))
+                       continue;
+
+               /*
+                * We identity-mapped [0, 1G) of memory at boot time, in
+                * arch/x86/kernel/head_64.S. These pagetables cannot be
+                * removed.
+                */
+               phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+               if (phys_addr < (phys_addr_t)0x40000000)
+                       return;
+
+               if (IS_ALIGNED(addr, PAGE_SIZE) &&
+                   IS_ALIGNED(next, PAGE_SIZE)) {
+                       if (!direct) {
+                               free_pagetable(pte_page(*pte), 0);
+                               pages++;
+                       }
+
+                       spin_lock(&init_mm.page_table_lock);
+                       pte_clear(&init_mm, addr, pte);
+                       spin_unlock(&init_mm.page_table_lock);
+               } else {
+                       /*
+                        * If we are not removing the whole page, it means
+                        * other ptes in this page are being used and we cannot
+                        * remove them. So fill the unused ptes with 0xFD, and
+                        * remove the page when it is wholly filled with 0xFD.
+                        */
+                       memset((void *)addr, PAGE_INUSE, next - addr);
+                       page_addr = page_address(pte_page(*pte));
+
+                       if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+                               free_pagetable(pte_page(*pte), 0);
+                               pages++;
+
+                               spin_lock(&init_mm.page_table_lock);
+                               pte_clear(&init_mm, addr, pte);
+                               spin_unlock(&init_mm.page_table_lock);
+                       }
+               }
+       }
+
+       /* free_pte_table() is called in remove_pmd_table(). */
+       flush_tlb_all();
+       if (direct)
+               update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+                bool direct)
+{
+       unsigned long pte_phys, next, pages = 0;
+       pte_t *pte_base;
+       pmd_t *pmd;
+
+       pmd = pmd_start + pmd_index(addr);
+       for (; addr < end; addr = next, pmd++) {
+               next = pmd_addr_end(addr, end);
+
+               if (!pmd_present(*pmd))
+                       continue;
+
+               if (pmd_large(*pmd)) {
+                       if (IS_ALIGNED(addr, PMD_SIZE) &&
+                           IS_ALIGNED(next, PMD_SIZE)) {
+                               if (!direct) {
+                                       free_pagetable(pmd_page(*pmd),
+                                                      get_order(PMD_SIZE));
+                                       pages++;
+                               }
+
+                               spin_lock(&init_mm.page_table_lock);
+                               pmd_clear(pmd);
+                               spin_unlock(&init_mm.page_table_lock);
+                               continue;
+                       }
+
+                       /*
+                        * We are using a 2M page here, but part of it needs
+                        * to be removed, so split the 2M page into 4K pages.
+                        */
+                       pte_base = (pte_t *)alloc_low_page(&pte_phys);
+                       BUG_ON(!pte_base);
+                       __split_large_page((pte_t *)pmd, addr,
+                                          (pte_t *)pte_base);
+
+                       spin_lock(&init_mm.page_table_lock);
+                       pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+                       spin_unlock(&init_mm.page_table_lock);
+
+                       flush_tlb_all();
+               }
+
+               pte_base = (pte_t *)map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+               remove_pte_table(pte_base, addr, next, direct);
+               free_pte_table(pte_base, pmd);
+               unmap_low_page(pte_base);
+       }
+
+       /* free_pmd_table() is called in remove_pud_table(). */
+       if (direct)
+               update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+                bool direct)
+{
+       unsigned long pmd_phys, next, pages = 0;
+       pmd_t *pmd_base;
+       pud_t *pud;
+
+       pud = pud_start + pud_index(addr);
+       for (; addr < end; addr = next, pud++) {
+               next = pud_addr_end(addr, end);
+
+               if (!pud_present(*pud))
+                       continue;
+
+               if (pud_large(*pud)) {
+                       if (IS_ALIGNED(addr, PUD_SIZE) &&
+                           IS_ALIGNED(next, PUD_SIZE)) {
+                               if (!direct) {
+                                       free_pagetable(pud_page(*pud),
+                                                      get_order(PUD_SIZE));
+                                       pages++;
+                               }
+
+                               spin_lock(&init_mm.page_table_lock);
+                               pud_clear(pud);
+                               spin_unlock(&init_mm.page_table_lock);
+                               continue;
+                       }
+
+                       /*
+                        * We are using a 1G page here, but part of it needs
+                        * to be removed, so split the 1G page into 2M pages.
+                        */
+                       pmd_base = (pmd_t *)alloc_low_page(&pmd_phys);
+                       BUG_ON(!pmd_base);
+                       __split_large_page((pte_t *)pud, addr,
+                                          (pte_t *)pmd_base);
+
+                       spin_lock(&init_mm.page_table_lock);
+                       pud_populate(&init_mm, pud, __va(pmd_phys));
+                       spin_unlock(&init_mm.page_table_lock);
+
+                       flush_tlb_all();
+               }
+
+               pmd_base = (pmd_t *)map_low_page((pmd_t *)pud_page_vaddr(*pud));
+               remove_pmd_table(pmd_base, addr, next, direct);
+               free_pmd_table(pmd_base, pud);
+               unmap_low_page(pmd_base);
+       }
+
+       if (direct)
+               update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual addresses. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+       unsigned long next;
+       pgd_t *pgd;
+       pud_t *pud;
+       bool pgd_changed = false;
+
+       for (; start < end; start = next) {
+               pgd = pgd_offset_k(start);
+               if (!pgd_present(*pgd))
+                       continue;
+
+               next = pgd_addr_end(start, end);
+
+               pud = (pud_t *)map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+               remove_pud_table(pud, start, next, direct);
+               if (free_pud_table(pud, pgd))
+                       pgd_changed = true;
+               unmap_low_page(pud);
+       }
+
+       if (pgd_changed)
+               sync_global_pgds(start, end - 1);
+
+       flush_tlb_all();
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 int __ref arch_remove_memory(u64 start, u64 size)
 {
index 4a3de7ce0bf2382e050efc9a516c27bd719206aa..091934e1d0d97ca26570eb9962aa1890e08e7bd8 100644 (file)
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -542,21 +542,13 @@ out_unlock:
        return do_split;
 }
 
-static int split_large_page(pte_t *kpte, unsigned long address)
+int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
 {
        unsigned long pfn, pfninc = 1;
        unsigned int i, level;
-       pte_t *pbase, *tmp;
+       pte_t *tmp;
        pgprot_t ref_prot;
-       struct page *base;
-
-       if (!debug_pagealloc)
-               spin_unlock(&cpa_lock);
-       base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
-       if (!debug_pagealloc)
-               spin_lock(&cpa_lock);
-       if (!base)
-               return -ENOMEM;
+       struct page *base = virt_to_page(pbase);
 
        spin_lock(&pgd_lock);
        /*
@@ -564,10 +556,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
         * up for us already:
         */
        tmp = lookup_address(address, &level);
-       if (tmp != kpte)
-               goto out_unlock;
+       if (tmp != kpte) {
+               spin_unlock(&pgd_lock);
+               return 1;
+       }
 
-       pbase = (pte_t *)page_address(base);
        paravirt_alloc_pte(&init_mm, page_to_pfn(base));
        ref_prot = pte_pgprot(pte_clrhuge(*kpte));
        /*
@@ -633,17 +626,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
         * going on.
         */
        __flush_tlb_all();
+       spin_unlock(&pgd_lock);
 
-       base = NULL;
+       return 0;
+}
 
-out_unlock:
-       /*
-        * If we dropped out via the lookup_address check under
-        * pgd_lock then stick the page back into the pool:
-        */
-       if (base)
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+       pte_t *pbase;
+       struct page *base;
+
+       if (!debug_pagealloc)
+               spin_unlock(&cpa_lock);
+       base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+       if (!debug_pagealloc)
+               spin_lock(&cpa_lock);
+       if (!base)
+               return -ENOMEM;
+
+       pbase = (pte_t *)page_address(base);
+       if (__split_large_page(kpte, address, pbase))
                __free_page(base);
-       spin_unlock(&pgd_lock);
 
        return 0;
 }
index 3cd16ba82f15119701ebd477320304d714aba971..cdc3bab01832530db0c3c2e8865a1a342a44f7ed 100644 (file)
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
                              unsigned long size);
 extern void free_bootmem(unsigned long physaddr, unsigned long size);
 extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
+extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,