/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif
#ifdef CONFIG_GART_IOMMU
extern int swiotlb;
#endif
static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%ld pages of RAM\n", total);
        printk(KERN_INFO "%ld reserved pages\n", reserved);
        printk(KERN_INFO "%ld pages shared\n", shared);
        printk(KERN_INFO "%ld pages swap cached\n", cached);
}
/* References to section boundaries */

extern char _text, _etext, _edata, __bss_start, _end[];
extern char __init_begin, __init_end;
extern char _stext[];

int after_bootmem;
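/*
 * Allocate a page for a new page table level: from the bootmem allocator
 * while it is still up, from the page allocator afterwards.
 */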
static void *spp_getpage(void)
{
        void *ptr;

        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}
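/*
 * Map a single kernel page at vaddr to phys, filling in any missing
 * intermediate page table levels on the way down.
 */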
static void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}
/* NOTE: this is meant to be run only at boot */
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}
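/* Physical pages used for the early page tables, as a growing pfn range. */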
unsigned long __initdata table_start, table_end;

extern pmd_t temp_boot_pmds[];

static struct temp_map {
        pmd_t *pmd;
        void  *address;
        int    allocated;
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
        {}
};
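/*
 * Grab the next free page at table_end and make it writable through one
 * of the temporary 2MB boot mappings, since the direct mapping does not
 * exist yet. The slot is released again with unmap_low_page().
 */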
static __init void *alloc_low_page(int *index, unsigned long *phys)
{
        struct temp_map *ti;
        int i;
        unsigned long pfn = table_end++, paddr;
        void *adr;

        if (pfn >= end_pfn)
                panic("alloc_low_page: ran out of memory");
        for (i = 0; temp_mappings[i].allocated; i++) {
                if (!temp_mappings[i].pmd)
                        panic("alloc_low_page: ran out of temp mappings");
        }
        ti = &temp_mappings[i];
        paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
        ti->allocated = 1;
        __flush_tlb();
        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
        *index = i;
        *phys = pfn * PAGE_SIZE;
        return adr;
}
static __init void unmap_low_page(int i)
{
        struct temp_map *ti = &temp_mappings[i];
        set_pmd(ti->pmd, __pmd(0));
        ti->allocated = 0;
}
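/*
 * Set up the direct mapping for the physical range covered by one pud,
 * using 2MB pages. Ranges without RAM in the e820 map are left unmapped.
 */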
static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i, j;

        i = pud_index(address);
        pud = pud + i;
        for (; i < PTRS_PER_PUD; pud++, i++) {
                int map;
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = address + i*PUD_SIZE;
                if (paddr >= end) {
                        for (; i < PTRS_PER_PUD; i++, pud++)
                                set_pud(pud, __pud(0));
                        break;
                }
                if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }
                pmd = alloc_low_page(&map, &pmd_phys);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
                        unsigned long pe;

                        if (paddr >= end) {
                                for (; j < PTRS_PER_PMD; j++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                                break;
                        }
                        pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
                        pe &= __supported_pte_mask;
                        set_pmd(pmd, __pmd(pe));
                }
                unmap_low_page(map);
        }
        __flush_tlb();
}
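/*
 * Estimate the worst case size of the direct mapping page tables and
 * reserve a physical range for them below the kernel text.
 */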
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;
}
/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __init init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped. Unfortunately this is done currently before the nodes are
         * discovered.
         */
        find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                int map;
                unsigned long pud_phys;
                pud_t *pud = alloc_low_page(&map, &pud_phys);
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(map);
        }

        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
        __flush_tlb_all();
        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", end,
                     table_start<<PAGE_SHIFT,
                     table_end<<PAGE_SHIFT);
}
extern struct x8664_pda cpu_pda[NR_CPUS];

/* Assumes all CPUs still execute in init_mm */
void zap_low_mappings(void)
{
        pgd_t *pgd = pgd_offset_k(0UL);
        pgd_clear(pgd);
        flush_tlb_all();
}
/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        unsigned long w;

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = start_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too. */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                                "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
}
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];

        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif
/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;

                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;
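/*
 * Final memory setup: hand boot memory over to the page allocator,
 * register the /proc/kcore areas and print the memory summary.
 */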
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

#ifdef CONFIG_SWIOTLB
        if (!iommu_aperture &&
            (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
                swiotlb = 1;
        if (swiotlb)
                swiotlb_init();
#endif

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);

        /*
         * Subtle. SMP is doing its boot stuff late (because it has to
         * fork idle threads) - but it also needs low mappings for the
         * protected-mode entry to work. We zap these entries only after
         * the WP-bit has been tested.
         */
#ifndef CONFIG_SMP
        zap_low_mappings();
#endif
}
extern char __initdata_begin[], __initdata_end[];
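/* Poison and free the no longer needed .init sections. */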
void free_initmem(void)
{
        unsigned long addr;

        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
        memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
        printk("Freeing unused kernel memory: %luk freed\n",
               (&__init_end - &__init_begin) >> 10);
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        if (start < (unsigned long)&_end)
                return;
        printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                set_page_count(virt_to_page(start), 1);
                free_page(start);
                totalram_pages++;
        }
}
#endif
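/*
 * NUMA-aware wrapper around reserve_bootmem(). Also tracks how much of
 * ZONE_DMA is reserved so size_zones() can account for it.
 */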
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}
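/*
 * Check whether a kernel virtual address is backed by a present page by
 * walking the page tables. Handles 2MB large pages.
 */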
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}
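/* Debugging knobs exported under /proc/sys/debug. */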
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
#ifdef CONFIG_CHECKING
        { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
#endif
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif
/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}
int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);

        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}
/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}